Example #1
    def clean(self):
        """
        Caution! Empties database directories and commit logs for all nodes in db.
        :return:
        """
        report(
            'Cleaning data and commitlog directories for cluster {%s}' %
            (self.name), 'warning')
        cmd = 'sudo service cassandra stop'
        for ip in self.ips:
            rpc(ip, cmd, self.username, self.password, self.key)

        time.sleep(10)

        cmd_list = [
            'rm -f ~/.__jmxcmd*',
            'sudo rm -rf %s/*' % self.data_dir,
            'sudo rm -rf %s/*' % self.commitlog_dir,
            'sudo service cassandra start',
        ]
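        # Clean and restart the first node alone so it can come up before the others rejoin.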
        for ip in self.ips[:1]:
            for cmd in cmd_list:
                rpc(ip, cmd, self.username, self.password, self.key)

        time.sleep(30)

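        # Then clean and restart the remaining nodes.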
        for ip in self.ips[1:]:
            for cmd in cmd_list:
                rpc(ip, cmd, self.username, self.password, self.key)

        time.sleep(30)
        report('Status cluster {%s} \n %s' % (self.name, self.status()))
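
Every example on this page funnels remote work through the same rpc helper, whose implementation is not shown here. Below is a minimal sketch of what the call sites imply, using paramiko; the (out, err) return shape and the no_tty/suppress_output keywords are inferred from usage, not confirmed:

import paramiko


def rpc(ip, cmd, username, password=None, key=None,
        no_tty=False, suppress_output=False):
    # Sketch only: run `cmd` on `ip` over SSH and return (stdout, stderr).
    client = paramiko.SSHClient()
    client.set_missing_host_key_policy(paramiko.AutoAddPolicy())
    client.connect(ip, username=username, password=password, key_filename=key)
    try:
        # Skip the pty when the caller wants to background the command and
        # disconnect (see the population examples below).
        stdin, stdout, stderr = client.exec_command(cmd, get_pty=not no_tty)
        out, err = stdout.read().decode(), stderr.read().decode()
    finally:
        client.close()
    if not suppress_output and out:
        print(out)
    return out, err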
Example #2
        def mass_worker():
            record_count_per_node = int(record_count / len(population_ips))
            node_start_record = start_record

            auth_string = ''
            if self.db_user:
                auth_string = '--db_user %s --db_pass %s' % (self.db_user, self.db_pass)

            for ip in population_ips:
                report('Setting mass population on cluster {%s} node {%s}.' % (self.name, ip), 'warning')

                # Clean log first.
                cmd = 'sudo rm /tmp/mass_population.log'
                rpc(ip, cmd, self.username, self.password, self.key)

                cmd = '(python ~/.geppetto/data_population.py ' \
                      '%s %s %s ' \
                      'insert ' \
                      '-r %s ' \
                      '-s %s ' \
                      '-n %s ' \
                      '-t %s ' \
                      '--replication %s ' \
                      ') > /tmp/mass_population.log &' % \
                      (ip, schema_file, auth_string,
                       record_size,
                       node_start_record,
                       record_count_per_node,
                       mgmt_object,
                       replication)

                node_start_record += record_count_per_node

                rpc(ip, cmd, self.username, self.password, self.key, no_tty=True)  # No tty so we can run as bg & disconnect.

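            # Synchronous mode: poll the nodes until every population process
            # has exited ('async' is a Python 2 name; it is reserved in Python 3).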
            if not async:
                cmd = 'ps -ef | grep geppetto | grep -v grep | wc -l'
                cmd2 = 'tail -1 /tmp/mass_population.log'
                while True:
                    try:
                        report('Populating ...')

                        processes_running = 0
                        for ip in population_ips:
                            out, err = rpc(ip, cmd, self.username, self.password, self.key, suppress_output=True)
                            out2, err2 = rpc(ip, cmd2, self.username, self.password, self.key, suppress_output=True)
                            report('<%s> %s' % (ip, out2))
                            try:
                                processes_running += int(out)
                            except Exception as e:
                                report(e, 'critical')
                                raise
                        if processes_running == 0:
                            break
                    except Exception as e:
                        report(e, 'critical')
                        break

                    time.sleep(15)
Example #3
    def query(self, query, no_pause=False, suppress_reporting=False, retries=5):
        """
        Performs a cql query on the database.
        """
        assert(retries >= 0)

        # Format the query and make sure we have trailing ';'
        query = query.strip(' ')

        if not query:
            return '', False  # Preserve the (result, success) return shape.

        if query[-1] != ';':
            query += ' ;'

        cluster = CassandraTestingCluster(self.ips, self.db_user, self.db_pass)
        if not cluster.connect():
            report('Error cannot connect to Cassandra cluster', 'critical')
            if not no_pause:
                response = pause_execution_for_input('Error cannot connect to Cassandra cluster.')
                if response == 'r':
                    result, success = self.query(query)
                else:
                    return '', False
            else:
                return '', False
        else:
            # Persistent retry, then prompt the user for action if still failing.
            i = 0
            wait_times = [0, 5, 15, 60, 60,]
            result, success = '', False
            while i <= retries:
                if not suppress_reporting:
                    report(query)
                result, success = cluster.runQuery(query)

                if success or i >= retries:
                    break

                if not suppress_reporting:
                    report(result, 'warning')
                    report(success, 'warning')

                retry_time = wait_times[min(i, len(wait_times) - 1)]
                if not suppress_reporting:
                    report('Retrying in %s seconds' % retry_time)
                time.sleep(retry_time)
                i += 1

            # If retries did not produce successful query, then prompt user for input if we allow pausing.
            if not success and not no_pause:
                response = pause_execution_for_input('Error')
                if response == 'r':  # 'retry'.
                    result, success = self.query(query, retries=0)  # Only try once on manual retries.

        cluster.disconnect()

        return result, success
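
Callers always unpack the (result, success) pair. A typical call, with db being an instance of the class above and a hypothetical keyspace and table:

result, success = db.query('SELECT COUNT(*) FROM test_ks.test_cf;')
if not success:
    report('Query failed after retries: %s' % result, 'warning')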
Example #4
def do_update(cluster, target_db, schema_file, record_size, start_record, batch_size, insert_percentage, delay, batch_count, replication_factor=3, suppress_output=False):
    record_size = int(record_size)
    start_record = int(start_record)
    batch_size = int(batch_size)
    insert_percentage = int(insert_percentage)
    delay = float(delay) / 1000
    batch_count = int(batch_count)
    nr_batch = 0

    random.seed(1)

    ks_name, cf_name = getKSCFNames(target_db)
    if ks_name is None or cf_name is None:
        return

    createKeyspace(cluster, ks_name, replication_factor=replication_factor)
    createTable(cluster, ks_name, cf_name, schema_file)

    ts = TestSchema(cluster, ks_name, cf_name)
    ts.getSchema()

    while True:
        if ts.counter_table:
            batch = BatchStatement(batch_type=BatchType.COUNTER)
        else:
            batch = BatchStatement()
        stat_str = ''
        for i in range(batch_size):
            if start_record <= 0 or random.randrange(100) < insert_percentage:
                # insert case
                record_num = start_record
                query = ts.getInsertQuerywithRandomData(record_num, record_size)
                stat_str += 'I(%d) ' % record_num

            else:
                record_num = random.randrange(0, start_record)
                if random.randrange(100) < 70:  # 70% update
                    if not ts.counter_table:
                        query = ts.getUpdateQuery(record_num)
                    else:
                        query = ts.getInsertQuerywithRandomData(record_num, 0)

                    stat_str += 'U(%d) ' % record_num
                else:                           # 30% deletion
                    query = ts.getDeleteQuery(record_num)
                    stat_str += 'D(%d) ' % record_num
            batch.add(query)
            start_record += 1

        # Report the batch's operation summary once per batch.
        if not suppress_output:
            report(stat_str)
        cluster.session.execute(batch)
        nr_batch += 1
        if nr_batch == batch_count:
            if batch_count >= 0:
                break
        time.sleep(delay)
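
BatchStatement and BatchType here presumably come from the DataStax Python driver; the imports this function relies on would look like:

import random
import time

from cassandra.query import BatchStatement, BatchType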
Example #5
    def single_random_db_failure(self,
                                 wait_time_min=0,
                                 run_time_min=10,
                                 time_length_of_failure=5,
                                 max_failure_repeats=1,
                                 randomness_time_injection=90):
        """
        Shuts down a random cassandra node for a given time with some randomness thrown in for timing.
        """
        assert (wait_time_min >= 0)
        assert (run_time_min >= 0.1)
        assert (time_length_of_failure >= 0.2)
        assert (max_failure_repeats > 0)

        self.cassandra.status()

        start_time = time.time()
        time.sleep(60 * wait_time_min)

        pick = pick_x_different_num(1, 0, len(self.cassandra.ips) - 1)[0]
        for _ in xrange(max_failure_repeats):
            time.sleep(random.randint(
                0, randomness_time_injection))  # Randomize time that db fails.
            currently_down = collections.deque()

            try:
                # Bring down the db node for some time.
                currently_down.append(pick)
                add_test_note('%s' % (self.cassandra.ips[pick]))
                self.cassandra.db_stop(self.cassandra.ips[pick])
                self.cassandra.status()  # Let's see the db state.
                time.sleep(60 * random.randint(time_length_of_failure * 3 / 4,
                                               time_length_of_failure))

                # Bring db node back up.
                self.cassandra.db_start(self.cassandra.ips[pick])
                currently_down.popleft()
                time.sleep(20)
                self.cassandra.status()

            except (KeyboardInterrupt, SystemExit) as e:
                # Do some clean up (restore db nodes) and some reporting, then re-raise exception.
                report('Exit detected ... restoring db state', 'critical')
                for i in currently_down:
                    self.cassandra.db_start(self.cassandra.ips[i])
                time.sleep(20)
                self.cassandra.status()  # Logs will capture output.
                global global_vars
                global_vars['test_status'] = 'Aborted'
                add_test_note(e)
                raise e

            # Exit failure loop if we've reached max time.
            if (time.time() + wait_time_min * 60 - start_time >=
                    run_time_min * 60):
                break
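
pick_x_different_num is not shown on this page. Judging from its call sites (it returns a list of distinct indices into self.cassandra.ips), a plausible sketch:

import random


def pick_x_different_num(x, low, high):
    # Sketch: return x distinct integers from the inclusive range [low, high].
    return random.sample(range(low, high + 1), x)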
Example #6
        def delta_worker():
            # Loop every LOOP_MIN minutes and reinitialize the delta population.
            while self.do_delta_population:
                # Stop previous populations, in the case they are still going.
                rpc(workload_ip, cmd1, self.username, self.password, self.key)
                time.sleep(2)

                # Start new batch of populations.
                rpc(workload_ip, cmd2, self.username, self.password, self.key, no_tty=True)  # No tty so we can run as bg & disconnect.
                report('{%s} delta population set on node %s.' % (mgmt_object, workload_ip))
                time.sleep(60 * LOOP_MIN)  # Sleep LOOP_MIN min, allow delta to complete and settle, then cycle again. (A more dependable way)
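
delta_worker is a closure over workload_ip, cmd1, cmd2, and LOOP_MIN from the enclosing method, which presumably hands it to a background thread so the loop does not block the test. A sketch of that launch, not the project's actual code (assumes threading is imported at module level):

        self.do_delta_population = True
        worker = threading.Thread(target=delta_worker)
        worker.daemon = True  # Don't let the loop block interpreter exit.
        worker.start()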
Example #7
    def _deliver_payload(self):
        """
        Delivers population scripts and other goodies to the cassandra source cluster. Most are stored in ~/.geppetto/.
        """
        common_script_path = '%s/common/common.py' % (
            common.common.global_vars['geppetto_install_dir'])
        population_script_path = '%s/db_utils/cassandra_utils/data_population.py' % (
            common.common.global_vars['geppetto_install_dir'])
        schema_folder_path = '%s/db_utils/cassandra_utils/schema' % (
            common.common.global_vars['geppetto_install_dir'])
        for ip in self.ips:
            report('Updating Geppetto payload on {%s}.' % ip)
            to_path = '%s@%s:~/.geppetto/' % (self.username, ip)
            # rpc(ip, 'rm -rf ~/.geppetto', self.username, self.password, self.key, suppress_output=True)
            rpc(ip,
                'mkdir -p ~/.geppetto/common',
                self.username,
                self.password,
                self.key,
                suppress_output=True)
            rpc(ip,
                'touch ~/.geppetto/common/__init__.py',
                self.username,
                self.password,
                self.key,
                suppress_output=True)
            scp(common_script_path,
                '%s/common/' % to_path,
                self.password,
                self.key,
                suppress_output=True)
            scp(population_script_path,
                to_path,
                self.password,
                self.key,
                suppress_output=True)
            scp(schema_folder_path,
                to_path,
                self.password,
                self.key,
                is_dir=True,
                suppress_output=True)

        self.payload = True
        return True
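
scp mirrors rpc but copies files to the user@host:path destination built above. A key-based sketch of the assumed signature (the password parameter, e.g. via sshpass, is left unhandled here):

import os
import subprocess


def scp(from_path, to_path, password=None, key=None,
        is_dir=False, suppress_output=False):
    # Sketch: copy from_path to a remote 'user@host:path' destination.
    cmd = ['scp']
    if key:
        cmd += ['-i', key]
    if is_dir:
        cmd.append('-r')
    cmd += [from_path, to_path]
    with open(os.devnull, 'w') as devnull:
        out = devnull if suppress_output else None
        subprocess.check_call(cmd, stdout=out, stderr=out)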
示例#11
0
        def delta_worker():
            # Loop every 5 minutes and reinitialize delta.
            while self.do_delta_population:
                # Stop previous populations, in the case they are still going.
                rpc(workload_ip, cmd1, self.username, self.password, self.key)
                time.sleep(2)

                # Start new batch of populations.
                rpc(workload_ip,
                    cmd2,
                    self.username,
                    self.password,
                    self.key,
                    no_tty=True)  # No tty so we can run as bg & disconnect.
                report('{%s} delta population set on node %s.' %
                       (mgmt_object, workload_ip))
                time.sleep(
                    60 * LOOP_MIN
                )  # Sleep LOOP_MIN min, allow delta to complete and settle, then cycle again. (A more dependable way)
Example #8
def main():
    args = parse_args()

    if not os.path.exists(args.schema_file):
        sys.exit(-1)

    if args.db_user:
        cluster = CassandraTestingCluster(args.ip_list, db_user=args.db_user, db_pass=args.db_pass)
    else:
        cluster = CassandraTestingCluster(args.ip_list)

    if not cluster.connect():
        report('Cannot connect to cassandra cluster.', 'error')
        sys.exit(-1)

    try:
        if args.command == 'insert':
            do_insert(cluster, args.target_db, args.schema_file, args.record_size, args.start_record, args.record_count, args.uuid4, replication_factor=args.replication)
        elif args.command == 'update':
            do_update(cluster, args.target_db, args.schema_file, args.record_size, args.start_record, args.batch_size, args.insert_percentage, args.delay, args.batch_count, replication_factor=args.replication)
        else:
            report('Unrecognized command.\n')

    except Exception as e:
        report('%s\n' % e)
        sys.exit(1)

    cluster.disconnect()
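
parse_args is not shown. Piecing together the attributes main() reads and the command line built in the mass_worker example, a plausible argparse layout (all names and defaults inferred, not confirmed):

import argparse


def parse_args():
    # Sketch of the CLI implied by main() and the remote command string:
    #   python data_population.py <ip_list> <schema_file> [--db_user U --db_pass P]
    #       insert -r SIZE -s START -n COUNT -t TARGET --replication N
    parser = argparse.ArgumentParser(description='Cassandra data population.')
    parser.add_argument('ip_list')
    parser.add_argument('schema_file')
    parser.add_argument('command')  # 'insert' or 'update'.
    parser.add_argument('--db_user')
    parser.add_argument('--db_pass')
    parser.add_argument('-r', '--record_size', default=1024)
    parser.add_argument('-s', '--start_record', default=0)
    parser.add_argument('-n', '--record_count', default=1000)
    parser.add_argument('-t', '--target_db')
    parser.add_argument('--replication', default=3)
    parser.add_argument('--uuid4')
    parser.add_argument('--batch_size', default=10)
    parser.add_argument('--insert_percentage', default=50)
    parser.add_argument('--delay', default=0)
    parser.add_argument('--batch_count', default=-1)
    return parser.parse_args()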
Example #9
File: run.py Project: datosio/geppetto
def main():
    args = parse_args()

    # Import the test file.
    try:
        test_file_name = args.test_file
        test_file = test_file_name[:-3].replace('/', '.')
        mod = __import__(test_file, fromlist=['TestRun'])
        TestRun = getattr(mod, 'TestRun')
    except:
        report('Unable to load TestRun() from file: %s' % args.test_file, 'critical', no_date=True)
        traceback.print_exc()
        sys.exit(1)

    # Import the config file.
    try:
        config_file_name = args.config
        config_file = config_file_name[:-3].replace('/', '.')
        mod = __import__(config_file, fromlist=['CONFIG_DICT'])
        config_dict = getattr(mod, 'CONFIG_DICT')
    except:
        report("Unable to import the config file: %s" % args.config, 'critical', no_date=True)
        traceback.print_exc()
        sys.exit(1)

    do_welcome()

    class GeppettoExecutableTest(TestRun):
        def __init__(self):
            Geppetto.__init__(self)
            TestRun.set_init_params(self, config_dict, args, test_file_name, config_file_name)

        @capture_exception_and_abort
        def run(self):
            TestRun.run(self)

    g = GeppettoExecutableTest()
    g.run()
Example #10
    def insert(self,
               mgmt_object,
               schema_file,
               record_size,
               start_record,
               record_count,
               uuid4=None,
               suppress_reporting=False,
               cluster=None):
        """
        Does batch inserts into db from geppetto node.
        """
        # Track whether we created the connection here, so that we only
        # disconnect a cluster we own.
        own_cluster = cluster is None
        if own_cluster:
            cluster = CassandraTestingCluster(self.ips, self.db_user,
                                              self.db_pass)
            if not cluster.connect():
                report('ERROR: cannot connect to Cassandra cluster',
                       'critical')
                sys.exit(-1)

        if uuid4:
            if not suppress_reporting:
                report('%s do_insert(%s, %s, %s, %s, %s, %s, %s)' %
                       (self.name, 'cluster', mgmt_object, schema_file,
                        record_size, start_record, record_count, uuid4))
            do_insert(cluster,
                      mgmt_object,
                      schema_file,
                      record_size,
                      start_record,
                      record_count,
                      uuid4,
                      suppress_output=suppress_reporting)
        else:
            if not suppress_reporting:
                report('%s do_insert(%s, %s, %s, %s, %s, %s)' %
                       (self.name, 'cluster', mgmt_object, schema_file,
                        record_size, start_record, record_count))
            do_insert(cluster,
                      mgmt_object,
                      schema_file,
                      record_size,
                      start_record,
                      record_count,
                      suppress_output=suppress_reporting)

        if own_cluster:
            cluster.disconnect()
示例#15
0
    def insert(self, mgmt_object, schema_file, record_size, start_record, record_count, uuid4=None, suppress_reporting=False, cluster=None):
        """
        Does batch inserts into db from geppetto node.
        """
        if not cluster:
            cluster = CassandraTestingCluster(self.ips, self.db_user, self.db_pass)
            if not cluster.connect():
                report('ERROR: cannot connect to Cassandra cluster', 'critical')
                sys.exit(-1)

        if uuid4:
            if not suppress_reporting : report('%s do_insert(%s, %s, %s, %s, %s, %s, %s)' % (self.name, 'cluster', mgmt_object, schema_file, record_size, start_record, record_count, uuid4))
            do_insert(cluster, mgmt_object, schema_file, record_size, start_record, record_count, uuid4, suppress_output=suppress_reporting)
        else:
            if not suppress_reporting : report('%s do_insert(%s, %s, %s, %s, %s, %s)' % (self.name, 'cluster', mgmt_object, schema_file, record_size, start_record, record_count))
            do_insert(cluster, mgmt_object, schema_file, record_size, start_record, record_count, suppress_output=suppress_reporting)

        if not cluster:
            cluster.disconnect()
Example #11
    def run(self):
        report("Hello World!")
Example #12
    def random_node_failures(self,
                             wait_time_min=0,
                             run_time_min=10,
                             max_num_failed=1,
                             max_failure_repeats=1):
        """
        Simulates a node failure via rebooting a node. # TODO: (Aaron) currently assumes that node reboots to working condition. (need mounts in fstab and firewalls down and cass start as service.)
        """
        assert (max_num_failed > 0)
        report(self.cassandra.status())

        start_time = time.time()
        time.sleep(60 * wait_time_min)
        for _ in xrange(max_failure_repeats):
            picks = pick_x_different_num(max_num_failed, 0,
                                         len(self.cassandra.ips) - 1)
            currently_down = collections.deque()

            try:
                # First bring down those db's with some randomness thrown in.
                for i in xrange(len(picks)):
                    currently_down.append(i)
                    self.cassandra.node_reboot(
                        self.cassandra.ips[picks[i]]
                    )  # TODO: (Aaron) Let's do this with a ifdown etc like above.
                    time.sleep(random.randint(
                        0,
                        30))  # TODO: (Aaron) Can make this more sophisticated.

                # Let's stay advised with what's down.
                report(self.cassandra.status())

                # This is for future node failure implementation when we have a way to reboot like wake on lan.
                # Let them be down for a random period ... but long enough to reboot.
                time.sleep(random.randint(
                    60,
                    60 * 2))  # TODO: (Aaron) Can make this more sophisticated.

                # Now bring back up, with some randomness thrown in.
                for i in xrange(len(picks)):
                    self.cassandra.node_restore(self.cassandra.ips[picks[i]])
                    currently_down.popleft()
                    time.sleep(random.randint(
                        0,
                        30))  # TODO: (Aaron) Can make this more sophisticated.

                time.sleep(60)  # Need to let Nodes rejoin cluster properly.
                # Let's stay advised with what's up again.
                report(self.cassandra.status())

            except (KeyboardInterrupt, SystemExit) as e:
                # Do some clean up (restore db nodes) and some reporting, then re-raise exception.
                report('Exit detected ... restoring db state', 'critical')
                for i in currently_down:
                    self.cassandra.db_start(self.cassandra.ips[picks[i]])
                self.cassandra.status()  # Logs will capture output.
                global global_vars
                global_vars['test_status'] = 'Aborted'
                add_test_note(e)
                raise e

            # Exit failure loop if we've reached max time.
            if (time.time() + wait_time_min * 60 - start_time >=
                    60 * run_time_min):
                break
Example #13
    def random_db_failures(self,
                           wait_time_min=0,
                           run_time_min=10,
                           max_num_failed=1,
                           max_failure_repeats=1):
        """
        Shuts down cassandra nodes for given time, in a random pattern within specifications.
        """
        assert (max_num_failed > 0)
        assert (run_time_min >= 0.1)
        assert (max_failure_repeats > 0)

        self.cassandra.status()  # Logs will capture output.

        start_time = time.time()
        time.sleep(60 * wait_time_min)
        for _ in xrange(max_failure_repeats):
            picks = pick_x_different_num(max_num_failed, 0,
                                         len(self.cassandra.ips) - 1)

            currently_down = collections.deque()

            try:
                # First bring down those db's with some randomness thrown in.
                for i in xrange(len(picks)):
                    currently_down.append(i)
                    self.cassandra.db_stop(self.cassandra.ips[picks[i]])
                    time.sleep(random.randint(
                        0,
                        60))  # TODO: (Aaron) Can make this more sophisticated.

                # Let them be down for a random period.
                time.sleep(random.randint(
                    0,
                    60 * 2))  # TODO: (Aaron) Can make this more sophisticated.

                # Let's stay advised with what's down.
                self.cassandra.status()  # Logs will capture output.

                # Now bring back up, with some randomness thrown in.
                for i in xrange(len(picks)):
                    self.cassandra.db_start(self.cassandra.ips[picks[i]])
                    currently_down.popleft()
                    time.sleep(random.randint(
                        0,
                        30))  # TODO: (Aaron) Can make this more sophisticated.

                time.sleep(20)  # Need to let Nodes rejoin cluster properly.
                # Let's stay advised with what's up again.
                self.cassandra.status()  # Logs will capture output.

                # Placeholder for a random sleep before the next cycle (currently none).
                time.sleep(0)

            except (KeyboardInterrupt, SystemExit) as e:
                # Do some clean up (restore db nodes) and some reporting, then re-raise exception.
                report('Exit detected ... restoring db state', 'critical')
                for i in currently_down:
                    self.cassandra.db_start(self.cassandra.ips[picks[i]])
                time.sleep(20)
                self.cassandra.status()  # Logs will capture output.
                global global_vars
                global_vars['test_status'] = 'Aborted'
                add_test_note(e)

                raise e

            # Exit failure loop if we've reached max time.
            if (time.time() + wait_time_min * 60 - start_time >=
                    run_time_min * 60):
                break
Example #14
    def single_random_node_failure(self,
                                   wait_time_min=0,
                                   run_time_min=10,
                                   time_length_of_failure=5,
                                   max_failure_repeats=1,
                                   randomness_time_injection=90):
        """
        Shuts down a random cassandra node for a given time with some randomness thrown in for timing.
        """
        assert (wait_time_min >= 0)
        assert (run_time_min >= 0.1)
        assert (time_length_of_failure >= 0.2)
        assert (max_failure_repeats > 0)
        assert (randomness_time_injection >= 0)

        self.cassandra.status()

        start_time = time.time()
        time.sleep(60 * wait_time_min)

        pick = pick_x_different_num(1, 0, len(self.cassandra.ips) - 1)[0]
        for _ in xrange(max_failure_repeats):
            time.sleep(random.randint(
                0, randomness_time_injection))  # Randomize time that db fails.
            currently_down = collections.deque()

            try:
                # Bring down the db node for some time.
                currently_down.append(pick)

                try:
                    note = ''
                    add_test_note(note)
                    rpc(
                        self.cassandra.ips[pick],
                        '(nohup sudo ifdown eth0; sleep %s ; sudo ifup eth0 ; ) > /tmp/datos_failure.log &'
                        % (time_length_of_failure * 60),
                        self.cassandra.username, self.cassandra.password,
                        self.cassandra.key)
                except:
                    report('Could not connect to node {%s}.' %
                           self.cassandra.ips[pick], 'warning')

                self.cassandra.status()  # Let's see the db state.
                time.sleep(60 * time_length_of_failure + 60)

                # Bring db node back up.
                self.cassandra.node_restore(
                    self.cassandra.ips[pick]
                )  # Currently we don't have good way to restore so this does nothing.
                currently_down.popleft()
                time.sleep(20)
                self.cassandra.status()

            except (KeyboardInterrupt, SystemExit) as e:
                # Do some clean up (restore db nodes) and some reporting, then re-raise exception.
                report('Exit detected ... restoring db state', 'critical')
                for i in currently_down:
                    self.cassandra.node_restore(self.cassandra.ips[i])
                time.sleep(20)
                self.cassandra.status()  # Logs will capture output.
                global global_vars
                global_vars['test_status'] = 'Aborted'
                add_test_note(e)
                raise e

            # Exit failure loop if we've reached max time.
            if (time.time() + wait_time_min * 60 - start_time >=
                    run_time_min * 60):
                break
Example #15
def do_insert(cluster, target_db, schema_file, record_size, start_record, record_count, uuid4=None, replication_factor=3, suppress_output=False):
    record_size = int(record_size)
    record_num = int(start_record)
    record_count = int(record_count)
    end_record = record_num + record_count
    inserted_record = 0

    #random.seed(0)

    ks_name, cf_name = getKSCFNames(target_db)
    if ks_name is None or cf_name is None:
        return

    createKeyspace(cluster, ks_name, replication_factor=replication_factor)
    createTable(cluster, ks_name, cf_name, schema_file)

    ts = TestSchema(cluster, ks_name, cf_name)
    ts.getSchema()

    if ts.counter_table:
        batch = BatchStatement(batch_type=BatchType.COUNTER)
    else:
        batch = BatchStatement()

    i = 0
    while record_num < end_record:
        if uuid4:
            query = ts.getInsertQuerywithRandomData(record_num, record_size, uuid4)
        else:
            query = ts.getInsertQuerywithRandomData(record_num, record_size)

        if i == 0 and not suppress_output:
            report(query)

        batch.add(query)

        record_num += 1
        inserted_record += 1

        if (inserted_record % 100) == 0 or record_num == end_record:
            msg = '\rInserting %s %8d / %8d (%3d %%)' % (target_db, inserted_record, record_count,
                                                         inserted_record * 100 / record_count)
            #sys.stdout.write(msg + '\n') ; sys.stdout.flush()
            if not suppress_output:
                report(msg)
            try:
                cluster.session.execute(batch)
            except Exception as e:
                print("\n**** Detected Exception ****")
                print(e)
                print('\n')
                p = subprocess.Popen('nodetool status', stdout=subprocess.PIPE, stderr=subprocess.PIPE, shell=True)
                out, err = p.communicate()

                if "Failed to connect" in err:  # This node's cassandra is down
                    print("**** Restarting Cassandra ****")
                    os.system('sudo service cassandra start')
                    print('**** This Node crashed, sleeping for 3 minutes to reduce load. ****\n')
                    time.sleep(60*3)
                    print("")
                elif 'DN' in out:
                    print('**** Another Node crashed, sleeping for 3 minutes to reduce load. ****\n')
                    time.sleep(60*3)
                else:
                    print("**** Sleeping for 3 minutes to reduce load. ****\n")
                    time.sleep(60*3)

            if ts.counter_table:
                batch = BatchStatement(batch_type=BatchType.COUNTER)
            else:
                batch = BatchStatement()

        i += 1
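
Both do_insert and do_update start by splitting target_db with getKSCFNames, which is not shown on this page. Assuming the usual keyspace.table convention (not confirmed here), a sketch:

def getKSCFNames(target_db):
    # Sketch: split 'keyspace.columnfamily' into its two parts.
    parts = target_db.split('.')
    if len(parts) != 2:
        return None, None
    return parts[0], parts[1]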