def check_core_dumps(self):
    dumps_per_host = self.remote.detect_core_dumps()
    core_dumps = {
        host: dumps for host, dumps in dumps_per_host.items() if dumps
    }
    if core_dumps:
        logger.interrupt(pretty_dict(core_dumps))
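# `pretty_dict` is used throughout these snippets to render dicts in log and
# interrupt messages. A minimal sketch of such a helper, assuming it is just a
# json.dumps wrapper (the exact implementation is an assumption, not taken
# from the snippets themselves):
def pretty_dict(d):
    return json.dumps(d, indent=4, sort_keys=True)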
def run(self):
    # pillowfight performs batch_size operations per cycle, num_cycles times.
    args = [
        "cbc-pillowfight",
        "--spec", "couchbase://{host}:{port}/{bucket}".format(host=self.host,
                                                              port=self.port,
                                                              bucket=self.bucket),
        "--password", self.password,
        "--batch-size", str(self.batch_size),
        "--num-items", str(self.num_items),
        "--num-threads", str(self.num_threads),
        "--num-cycles", str(self.num_cycles),
        "--min-size", str(self.min_item_size),
        "--max-size", str(self.max_item_size),
        "--set-pct", str(self.set_pcnt),
        # Don't use an explicit populate phase; we just measure "live" load.
        "--no-population",
    ]

    try:
        logger.info("Starting Pillowfight as: '" + " ".join(args) + "'")
        subprocess.check_output(args, stderr=subprocess.STDOUT)
        logger.info("Finished Pillowfight")
    except subprocess.CalledProcessError as e:
        logger.interrupt("Pillowfight failed to run - output: " + e.output)
        raise
def consume(self):
    _, password = self.cluster_spec.rest_credentials
    for master in self.cluster_spec.yield_masters():
        host = master.split(':')[0]
        for bucket in self.test_config.buckets:
            logger.info(
                'Reading data via UPR from {}/{}'.format(host, bucket)
            )
            upr_client = UprClient(host=host, port=11210)
            upr_client.sasl_auth_plain(username=bucket, password=password)
            mcd_client = MemcachedClient(host=host, port=11210)
            mcd_client.sasl_auth_plain(user=bucket, password=password)

            op = upr_client.open_producer("stream")
            response = op.next_response()
            if response['status'] != SUCCESS:
                logger.interrupt('Failed to open producer')

            for vb in range(1024):
                vb_stats = mcd_client.stats('vbucket-seqno {}'.format(vb))
                uuid = long(vb_stats['vb_{}:uuid'.format(vb)])
                high_seqno = long(vb_stats['vb_{}:high_seqno'.format(vb)])

                op = upr_client.stream_req(vb=vb,
                                           flags=0,
                                           start_seqno=0,
                                           end_seqno=high_seqno,
                                           vb_uuid=uuid,
                                           high_seqno=high_seqno)
                while op.has_response():
                    response = op.next_response()
                    if response['opcode'] != CMD_STREAM_REQ:
                        break
                upr_client.close_stream(vbucket=vb)
            upr_client.shutdown()
def __init__(self, *args):
    super(IndexTest, self).__init__(*args)

    index_type = self.test_config.index_settings.index_type
    if index_type is None:
        logger.interrupt('Missing index_type param')
    self.ddocs = ViewGenDev().generate_ddocs(index_type)
def __init__(self, options):
    if 'size' in options:
        logger.interrupt(
            "The document `size` may only be set in the [load] "
            "and not in the [access] section")
    super(AccessSettings, self).__init__(options)
def find_package(self):
    for filename in self.get_expected_filenames():
        for location in LOCATIONS:
            url = '{}{}'.format(location.format(**self.build.__dict__),
                                filename)
            if self.is_exist(url):
                return filename, url
    logger.interrupt('Target build not found')
def __init__(self, fname):
    logger.info('Reading experiment file: {}'.format(fname))
    if not os.path.isfile(fname):
        logger.interrupt('File doesn\'t exist: {}'.format(fname))
    else:
        self.name = os.path.splitext(os.path.basename(fname))[0]
        with open(fname) as fh:
            self.template = json.load(fh)
def download(self):
    """Download and save a copy of the specified package."""
    if self.remote.package == 'rpm':
        logger.info('Saving a local copy of {}'.format(self.url))
        with open('couchbase.rpm', 'wb') as fh:
            resp = requests.get(self.url)
            fh.write(resp.content)
    else:
        logger.interrupt('Unsupported package format')
def run(self):
    processes = [Process(target=self.task, args=(x,))
                 for x in range(self.workers)]
    for p in processes:
        p.start()

    for p in processes:
        p.join()
        if p.exitcode:
            logger.interrupt('Worker finished with non-zero exit code')
def __init__(self, target_uri, prefix):
    params = urlparse(target_uri)
    if not params.hostname or not params.port or not params.path:
        logger.interrupt('Invalid connection URI')

    self.node = '{}:{}'.format(params.hostname, params.port)
    self.bucket = params.path[1:]
    self.password = params.password or ''
    self.prefix = prefix
def interrupt(request, *args, **kargs):
    for i in range(MAX_RETRY):
        try:
            return request(*args, **kargs)
        except (requests.ConnectionError, InternalServerError) as e:
            logger.info(e)
            time.sleep(RETRY_DELAY)
            continue
    logger.interrupt("Failed after {} tries.".format(MAX_RETRY))
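# A hedged usage sketch for the retry wrapper above: `interrupt` takes the
# callable first and forwards the remaining arguments, retrying transient
# connection errors up to MAX_RETRY times before giving up. The URL below is
# hypothetical, not taken from the snippets.
def example_fetch_pools():
    return interrupt(requests.get, 'http://127.0.0.1:8091/pools/default')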
def _parse_ddocs(view_settings, options):
    ddocs = {}
    if view_settings.indexes is None:
        logger.interrupt('Missing indexes param')
    for index in view_settings.indexes:
        ddoc_name, ddoc = index.split('::', 1)
        ddocs[ddoc_name] = json.loads(ddoc)
        if options:
            ddocs[ddoc_name]['options'] = options
    return ddocs
def _get_db(self, db_name):
    try:
        existing_dbs = self.seriesly.list_dbs()
    except ConnectionError as e:
        logger.interrupt("seriesly not available: {}".format(e))
    else:
        if db_name not in existing_dbs:
            logger.info("Creating a new database: {}".format(db_name))
            self.seriesly.create_db(db_name)
        return self.seriesly[db_name]
def clone_repo(repo):
    CLONE_TIMEOUT = 60
    cmd = "git clone {}".format(repo)
    logger.info("Running {}".format(cmd))
    try:
        run(cmd, pty=False, timeout=CLONE_TIMEOUT)
    except fabric.exceptions.CommandTimeout:
        logger.interrupt(
            "Failed to clone forestdb within {} seconds".format(CLONE_TIMEOUT))
def __exit__(self, exc_type, exc_val, exc_tb):
    if self.test_config.test_case.use_workers:
        self.worker_manager.terminate()

    if exc_type != exc.KeyboardInterrupt and '--nodebug' not in sys.argv:
        self.debug()

    self.check_core_dumps()

    for master in self.cluster_spec.yield_masters():
        if not self.rest.is_balanced(master):
            logger.interrupt('Rebalance failed')

        self.check_failover(master)
def check_failover(self, master):
    if hasattr(self, 'rebalance_settings'):
        if self.rebalance_settings.failover or \
                self.rebalance_settings.graceful_failover:
            return

    num_failovers = self.rest.get_failover_counter(master)
    if num_failovers:
        logger.interrupt(
            'Failover happened {} time(s)'.format(num_failovers)
        )
def __exit__(self, exc_type, exc_val, exc_tb):
    failure = self.debug()

    self.tear_down()

    if exc_type == KeyboardInterrupt:
        logger.warn('The test was interrupted')
        return True

    if failure:
        logger.interrupt(failure)
def find_package(self):
    for filename, url in self.get_expected_locations():
        try:
            status_code = requests.head(url).status_code
        except ConnectionError:
            pass
        else:
            if status_code == 200:
                logger.info('Found "{}"'.format(url))
                return filename, url
    logger.interrupt('Target build not found')
def find_package(self):
    filename = 'couchbase-sync-gateway_{}_x86_64.rpm'.format(self.version)
    url = '{}{}'.format(self.CBFS, filename)
    try:
        status_code = requests.head(url).status_code
    except requests.exceptions.ConnectionError:
        pass
    else:
        if status_code == 200:
            logger.info('Found "{}"'.format(url))
            return filename, url
    logger.interrupt('Target build not found - {}'.format(url))
def parse(self, fname: str, override=None):
    logger.info('Reading configuration file: {}'.format(fname))
    if not os.path.isfile(fname):
        logger.interrupt("File doesn't exist: {}".format(fname))

    self.config.optionxform = str
    self.config.read(fname)

    basename = os.path.basename(fname)
    self.name = os.path.splitext(basename)[0]

    if override is not None:
        self.override(override)
def find_manifest():
    for url in iter_urls():
        try:
            logger.debug("Trying {}".format(url))
            status_code = requests.head(url).status_code
        except ConnectionError:
            continue
        else:
            if status_code == 200:
                logger.info('Found "{}"'.format(url))
                return url
    logger.interrupt("Cannot find the manifest for the given version")
def read_cfg(self, config):
    if not os.path.isfile(config):
        logger.interrupt("File doesn't exist: {}".format(config))

    logger.info("Reading configuration file: {}".format(config))
    with open(config) as fh:
        try:
            for option, value in json.load(fh).items():
                setattr(self, option, value)
        except ValueError as e:
            logger.interrupt("Error reading config: {}".format(e))
        else:
            logger.info("Configuration file successfully parsed")
def retry(self, path, server=None, port=8091):
    time.sleep(self.interval)

    for node in self.nodes:
        if self._check_node(node):
            self.master_node = node
            self.nodes = list(self.get_nodes())
            break
    else:
        logger.interrupt("Failed to find at least one node")

    if server not in self.nodes:
        raise RuntimeError("Bad node {}".format(server or ""))
    else:
        return self.get_http(path, server, port)
def main():
    failures = defaultdict(dict)

    for file_name in glob.iglob('./*.zip'):
        panic_files, crash_files = validate_logs(file_name)
        if panic_files:
            failures['panics'][file_name] = panic_files
        if crash_files:
            failures['crashes'][file_name] = crash_files

    if failures:
        logger.interrupt(
            "Following failures found: {}".format(pretty_dict(failures)))
def parse(self, fname, override=()):
    logger.info('Reading configuration file: {}'.format(fname))
    if not os.path.isfile(fname):
        logger.interrupt('File doesn\'t exist: {}'.format(fname))

    self.config.optionxform = str
    self.config.read(fname)

    for section, option, value in override:
        if not self.config.has_section(section):
            self.config.add_section(section)
        self.config.set(section, option, value)

    basename = os.path.basename(fname)
    self.name = os.path.splitext(basename)[0]
def find_package(self):
    for filename in self.get_expected_filenames():
        for base in (self.LATEST_BUILDS, self.SHERLOCK_BUILDS, self.CBFS):
            url = '{}{}'.format(base, filename)
            try:
                status_code = requests.head(url).status_code
            except ConnectionError:
                continue
            else:
                if status_code == 200:
                    logger.info('Found "{}"'.format(url))
                    return filename, url
    logger.interrupt('Target build not found')
def wait_for_servers(self):
    for retry in range(self.MAX_RETRY):
        logger.info('Waiting for all servers to be available')
        time.sleep(self.POLLING_INTERVAL_MACHINE_UP)

        for server in self.cluster_spec.servers:
            if not self.remote.is_up(server):
                break
        else:
            logger.info('All nodes are up')
            return

    logger.interrupt('Some nodes are still down')
def find_package(self):
    for filename in self.get_expected_filenames():
        for base in (self.LATEST_BUILDS, self.CBFS):
            url = '{}{}'.format(base, filename)
            try:
                status_code = requests.head(url).status_code
            except requests.exceptions.ConnectionError:
                continue
            else:
                if status_code == 200:
                    logger.info('Found "{}"'.format(url))
                    return filename, url
    logger.interrupt('Target build not found')
def drop_db(self, cluster=None, server=None, bucket=None, index=None,
            collector=None):
    db_name = self.build_dbname(cluster, server, bucket, index, collector)
    try:
        existing_dbs = self.seriesly.list_dbs()
    except ConnectionError as e:
        logger.interrupt("seriesly not available: {}".format(e))
    else:
        if db_name not in existing_dbs:
            logger.info("DB not present: {}".format(db_name))
            return
        logger.info("Dropping DB: {}".format(db_name))
        self.seriesly.drop_db(db_name)
def monitor_analytics_node_active(self, host):
    logger.info('Monitoring analytics node health')

    for retry in range(self.MAX_RETRY):
        active = self.analytics_node_active(host)
        if active:
            break
        else:
            time.sleep(self.POLLING_INTERVAL)
    else:
        logger.interrupt('Analytics node still not healthy: {}'.format(host))
def install(self):
    num_gateways = len(self.cluster_spec.gateways)
    num_gateloads = len(self.cluster_spec.gateloads)
    if num_gateways != num_gateloads:
        logger.interrupt(
            'The cluster config file has different numbers of gateways ({}) '
            'and gateloads ({})'.format(num_gateways, num_gateloads))

    self.kill_processes_gw()
    self.uninstall_package_gw()
    self.install_package_gw()
    self.start_sync_gateways()
    self.kill_processes_gl()
    self.uninstall_package_gl()
    self.install_package_gl()
def retry(method: Callable, *args, **kwargs):
    r = namedtuple('request', ['url'])('')

    for _ in range(MAX_RETRY):
        try:
            r = method(*args, **kwargs)
        except ConnectionError:
            time.sleep(RETRY_DELAY * 2)
            continue
        if r.status_code in range(200, 203):
            return r
        else:
            logger.warn(r.text)
            logger.warn('Retrying {}'.format(r.url))
            time.sleep(RETRY_DELAY)
    logger.interrupt('Request {} failed after {} attempts'.format(
        r.url, MAX_RETRY))
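# A hedged usage sketch for the typed retry helper above: any callable that
# returns a requests-style response object works, and status codes 200-202
# count as success. The endpoint and payload below are hypothetical.
def example_create_bucket():
    return retry(requests.post,
                 'http://127.0.0.1:8091/pools/default/buckets',
                 data={'name': 'bucket-1', 'ramQuotaMB': 1024})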
def __exit__(self, exc_type, exc_val, exc_tb):
    if self.test_config.test_case.use_workers:
        self.worker_manager.terminate()

    if exc_type != exc.KeyboardInterrupt:
        self.debug()

    for master in self.cluster_spec.yield_masters():
        if not self.rest.is_balanced(master):
            logger.interrupt('Rebalance failed')

        num_failovers = self.rest.get_failover_counter(master)
        if hasattr(self, 'rebalance_settings'):
            if self.rebalance_settings.failover or \
                    self.rebalance_settings.graceful_failover:
                continue
        if num_failovers:
            logger.interrupt(
                'Failover happened {} time(s)'.format(num_failovers))
def parse(self, fname, override):
    if override:
        override = [x for x in csv.reader(
            ' '.join(override).split(','), delimiter='.')]

    logger.info('Reading configuration file: {}'.format(fname))
    if not os.path.isfile(fname):
        logger.interrupt('File doesn\'t exist: {}'.format(fname))

    self.config.optionxform = str
    self.config.read(fname)

    for section, option, value in override:
        if not self.config.has_section(section):
            self.config.add_section(section)
        self.config.set(section, option, value)

    basename = os.path.basename(fname)
    self.name = os.path.splitext(basename)[0]
def monitor_node_health(self, host_port):
    logger.info('Monitoring node health')

    for retry in range(self.MAX_RETRY):
        unhealthy_nodes = {
            n for n, status in self.node_statuses(host_port).items()
            if status != 'healthy'
        } | {
            n for n, status in self.node_statuses_v2(host_port).items()
            if status != 'healthy'
        }
        if unhealthy_nodes:
            time.sleep(self.POLLING_INTERVAL)
        else:
            break
    else:
        logger.interrupt('Some nodes are not healthy: {}'.format(
            unhealthy_nodes))
def configure_tcmalloc_aggressive_decommit(self, value):
    logger.info('Setting TCMalloc aggressive_decommit to {}'.format(
        bool(value)))
    mcctl = '/opt/couchbase/bin/mcctl -h localhost:11210 '
    result = run_with_retry(
        mcctl + 'set tcmalloc.aggressive_memory_decommit {}'.format(value),
        attempts=10)
    if result.failed:
        logger.interrupt('Failed to set TCMalloc aggressive_decommit ' +
                         '(after 10 attempts):\n{}'.format(result))

    # Check it took effect.
    new_value = run_with_retry(
        mcctl + 'get tcmalloc.aggressive_memory_decommit', attempts=10)
    if value != int(new_value):
        logger.interrupt(
            'Failed to set TCMalloc aggressive_decommit - '
            'requested: {}, got: {} -\n{}'.format(bool(value),
                                                  bool(int(new_value)),
                                                  new_value))
def _on_multi_fail(self, err, ops):
    """Multi operation failed; crack apart and handle the failures with set."""
    err.trap(exceptions.NotStoredError, exceptions.TimeoutError)
    if err.check(exceptions.NotStoredError):
        # One or more keys do not yet exist; handle with set
        for k, v in err.value.all_results.items():
            logger.info('VAL: {}'.format(err.value))
            if not v.success:
                if v.rc == LCB_NOT_STORED:
                    # Snip off the semicolon for the initial value.
                    logger.info('SET: {} {}'.format(k, ops[k][1:]))
                    d = self.client.set(k, ops[k][1:], format=FMT_UTF8)
                    d.addCallback(self._on_set)
                    d.addErrback(self._on_set_fail)
    # Failure instances are matched via check(), not equality.
    elif err.check(exceptions.TimeoutError):
        logger.interrupt('Timeout: {}'.format(err))
    else:
        logger.interrupt('Unhandled error: {}'.format(err))
def monitor_rebalance(self, host_port):
    logger.info('Monitoring rebalance status')

    is_running = True
    last_progress = 0
    last_progress_time = time.time()
    while is_running:
        time.sleep(self.POLLING_INTERVAL)

        is_running, progress = self.get_rebalance_status(host_port)
        if progress == last_progress:
            if time.time() - last_progress_time > self.REBALANCE_TIMEOUT:
                logger.interrupt('Rebalance hung')
        else:
            last_progress = progress
            last_progress_time = time.time()

        if progress is not None:
            logger.info('Rebalance progress: {} %'.format(progress))

    logger.info('Rebalance completed')
def _wait_for_null_metric(self, host_port, bucket, metric):
    retry = 0
    while retry < self.MAX_RETRY:
        time.sleep(self.POLLING_INTERVAL)

        bucket_stats = self.get_bucket_stats(host_port, bucket)
        try:
            value = bucket_stats['op']['samples'][metric][-1]
        except KeyError:
            logger.warn('Got broken bucket stats')
            retry += 1
            continue
        else:
            retry = 0

        if value:
            logger.info('Current value of {}: {}'.format(metric, value))
        else:
            logger.info('{} reached 0'.format(metric))
            return
    logger.interrupt('Failed to get bucket stats after {} attempts'.format(
        self.MAX_RETRY))
def consume(self):
    password = self.test_config.bucket.password
    for master in self.cluster_spec.yield_masters():
        host = master.split(':')[0]
        memcached_port = self.rest.get_memcached_port(master)
        for bucket in self.test_config.buckets:
            logger.info(
                'Reading data via UPR from {}/{}'.format(host, bucket)
            )
            upr_client = DcpClient(host=host, port=memcached_port)
            upr_client.sasl_auth_plain(username=bucket, password=password)
            mcd_client = MemcachedClient(host=host, port=memcached_port)
            mcd_client.sasl_auth_plain(user=bucket, password=password)

            op = upr_client.open_producer('stream')
            response = op.next_response()
            if response['status'] != SUCCESS:
                logger.interrupt('Failed to open producer')

            for vb in range(1024):
                vb_stats = mcd_client.stats('vbucket-seqno {}'.format(vb))
                uuid = long(vb_stats['vb_{}:uuid'.format(vb)])
                high_seqno = long(vb_stats['vb_{}:high_seqno'.format(vb)])

                op = upr_client.stream_req(vb=vb,
                                           flags=0,
                                           start_seqno=0,
                                           end_seqno=high_seqno,
                                           vb_uuid=uuid,
                                           high_seqno=high_seqno)
                while op.has_response():
                    response = op.next_response()
                    if response['opcode'] != CMD_STREAM_REQ:
                        break
                upr_client.close_stream(vbucket=vb)
            upr_client.shutdown()
def _interrupt(self, err):
    logger.interrupt(err.value)
def wait_for_all_workers(self):
    for workers in self.workers.values():
        for worker in workers:
            worker.join()
            if worker.exitcode:
                logger.interrupt('Worker finished with non-zero exit code')
def collect_kpi(self):
    logger.info('Collecting Sync Gateway KPI')

    try:
        criteria = OrderedDict((
            (95, self.test_config.gateload_settings.p95_avg_criteria),
            (99, self.test_config.gateload_settings.p99_avg_criteria),
        ))

        summary = defaultdict(dict)
        latencies = defaultdict(list)
        all_requests_per_sec = []
        self.errors = []

        for idx, gateload in enumerate(self.remote.gateloads, start=1):
            for p in criteria:
                kpi = self.KPI.format(p)
                latency = self.metric_helper.calc_push_latency(p=p, idx=idx)
                if latency == 0:
                    status = '{}: Failed to get latency data'.format(gateload)
                    self.errors.append(status)
                summary[gateload][kpi] = latency
                latencies[p].append(latency)

            requests_per_sec = self.metric_helper.calc_requests_per_sec(
                idx=idx)
            all_requests_per_sec.append(requests_per_sec)
            summary[gateload]['Average requests per sec'] = requests_per_sec

            doc_counters = self.metric_helper.calc_gateload_doc_counters(
                idx=idx)
            summary[gateload]['gateload doc counters'] = doc_counters

        logger.info('Per node summary: {}'.format(pretty_dict(summary)))

        self.reporter.post_to_sf(round(np.mean(latencies[99]), 1))

        self.pass_fail = []
        for p, criterion in criteria.items():
            kpi = self.KPI.format(p)
            average = np.mean(latencies[p])
            if average == 0 or average > criterion:
                status = "{}: {} - doesn't meet the criteria of {}".format(
                    kpi, average, criterion)
            else:
                status = '{}: {} - meets the criteria of {}'.format(
                    kpi, average, criterion)
            self.pass_fail.append(status)
        logger.info('Aggregated latency: {}'.format(
            pretty_dict(self.pass_fail)))

        network_matrix = self.metric_db_servers_helper.calc_network_throughput
        network_matrix['Avg requests per sec'] = int(
            np.average(all_requests_per_sec))
        logger.info('Network throughput: {}'.format(
            json.dumps(network_matrix, indent=4)))

        logger.info('Checking pass or fail')
        if self.errors:
            logger.interrupt('Test failed because of errors: {}'.format(
                self.errors))
        if "doesn't meet" in ''.join(self.pass_fail):
            logger.interrupt('Test failed: latencies do not meet KPI')
    except Exception:
        traceback.print_exc()
        traceback.print_stack()
        logger.interrupt('Exception running test: {}'.format(
            str(sys.exc_info()[0])))
def interrupt(request, *args, **kargs):
    try:
        return request(*args, **kargs)
    except (requests.ConnectionError, InternalServerError) as e:
        logger.interrupt(e)
def find_package(self, edition: str) -> [str, str]:
    for url in self.url_iterator(edition):
        if self.is_exist(url):
            return url
    logger.interrupt('Target build not found')
def _on_set_fail(self, err):
    logger.interrupt('ON_SET_FAIL: {}'.format(err))
def find_package(self, edition: str, package: str = None,
                 os_release: str = None) -> [str, str]:
    for url in self.url_iterator(edition, package, os_release):
        if self.is_exist(url):
            return url
    logger.interrupt('Target build not found')
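# Several find_package variants above delegate the availability probe to
# self.is_exist(url), while others inline a requests.head() check. A minimal
# sketch of what such a helper might look like, modeled on the inlined
# variants; the exact implementation is an assumption, not taken from the
# snippets:
def is_exist(self, url):
    try:
        status_code = requests.head(url).status_code
    except requests.exceptions.ConnectionError:
        return False
    return status_code == 200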