Example #1
def run_tests(config):
  """ The main function that launches the stress tests """
  # Build docker images and push to GKE registry
  if config.global_settings.build_docker_images:
    for name, docker_image in config.docker_images_dict.iteritems():
      if not (docker_image.build_image() and
              docker_image.push_to_gke_registry()):
        return False

  # Create a unique id for this run (Note: using a timestamp instead of a UUID
  # makes it easier to deduce the date/time of the run just by looking at the
  # run id. This is useful when debugging records in BigQuery)
  run_id = datetime.datetime.now().strftime('%Y_%m_%d_%H_%M_%S')
  dataset_id = '%s_%s' % (config.global_settings.dataset_id_prefix, run_id)

  bq_helper = BigQueryHelper(run_id, '', '',
                             config.global_settings.gcp_project_id, dataset_id,
                             config.global_settings.summary_table_id,
                             config.global_settings.qps_table_id)
  bq_helper.initialize()

  gke = Gke(config.global_settings.gcp_project_id, run_id, dataset_id,
            config.global_settings.summary_table_id,
            config.global_settings.qps_table_id,
            config.global_settings.kubernetes_proxy_port)

  is_success = True

  try:
    print 'Launching servers..'
    for name, server_pod_spec in config.server_pod_specs_dict.iteritems():
      if not gke.launch_servers(server_pod_spec):
        is_success = False  # is_success is checked in the 'finally' block
        return False

    print('Launched servers. Waiting for %d seconds for the server pods to be '
          'fully online' % config.global_settings.pod_warmup_secs)
    time.sleep(config.global_settings.pod_warmup_secs)

    for name, client_pod_spec in config.client_pod_specs_dict.iteritems():
      if not gke.launch_clients(client_pod_spec):
        is_success = False  # is_success is checked in the 'finally' block
        return False

    print('Launched all clients. Waiting for %d seconds for the client pods to '
          'be fully online' % config.global_settings.pod_warmup_secs)
    time.sleep(config.global_settings.pod_warmup_secs)

    start_time = datetime.datetime.now()
    end_time = start_time + datetime.timedelta(
        seconds=config.global_settings.test_duration_secs)
    print 'Running the test until %s' % end_time.isoformat()

    while True:
      if datetime.datetime.now() > end_time:
        print 'Test was run for %d seconds' % config.global_settings.test_duration_secs
        break

      # Check if either the stress server or the clients have failed
      # (bq_helper scans all the rows in the summary table and checks if any
      # of them have a failure status)
      if bq_helper.check_if_any_tests_failed():
        is_success = False
        print 'Some tests failed.'
        break  # Don't 'return' here. We still want to call bq_helper to print qps/summary tables

      # Tests running fine. Wait until next poll time to check the status
      print 'Sleeping for %d seconds..' % config.global_settings.test_poll_interval_secs
      time.sleep(config.global_settings.test_poll_interval_secs)

    # Print BigQuery tables
    bq_helper.print_qps_records()
    bq_helper.print_summary_records()

  finally:
    # If there was a test failure, we should not delete the pods since they
    # would contain useful debug information (logs, core dumps, etc.)
    if is_success:
      for name, server_pod_spec in config.server_pod_specs_dict.iteritems():
        gke.delete_servers(server_pod_spec)
      for name, client_pod_spec in config.client_pod_specs_dict.iteritems():
        gke.delete_clients(client_pod_spec)

  return is_success
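
None of the examples in this listing include their import blocks. The sketch below is a minimal set that would satisfy the code; BigQueryHelper's module path is taken from Example #3, while the homes of EventType and Gke are not shown anywhere in this listing, so those lines are assumptions.

# Imports assumed by the examples in this listing (a sketch, not copied from
# the original sources).
import datetime
import os
import resource
import select
import subprocess
import time

from stress_test_utils import BigQueryHelper  # path as in Example #3
# EventType is assumed to live alongside BigQueryHelper; the module that
# provides Gke is not shown in this listing.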
Example #2
def run_client():
  """This is a wrapper around the stress test client and performs the following:
      1) Create the following two tables in Big Query:
         (i) Summary table: To record events like the test started, completed
                            successfully or failed
        (ii) Qps table: To periodically record the QPS sent by this client
      2) Start the stress test client and add a row in the Big Query summary
         table
      3) Once every few seconds (as specified by poll_interval_secs) poll
         the status of the stress test client process and perform the
         following:
          3.1) If the process is still running, get the current qps by invoking
               the metrics client program and add a row in the Big Query
               Qps table. Sleep for a duration specified by poll_interval_secs
          3.2) If the process exited successfully, add a row in the Big Query
               Summary table and exit
          3.3) If the process failed, add a row in Big Query summary table and
               wait forever.
               NOTE: This script typically runs inside a GKE pod which means
               that the pod gets destroyed when the script exits. However, in
               case the stress test client fails, we would not want the pod to
               be destroyed (since we might want to connect to the pod for
               examining logs). This is the reason why the script waits forever
               in case of failures
  """
  # Set the 'core file' size to 'unlimited' so that 'core' files are generated
  # if the client crashes (Note: This is not relevant for Java and Go clients)
  resource.setrlimit(resource.RLIMIT_CORE,
                     (resource.RLIM_INFINITY, resource.RLIM_INFINITY))

  env = dict(os.environ)
  image_type = env['STRESS_TEST_IMAGE_TYPE']
  stress_client_cmd = env['STRESS_TEST_CMD'].split()
  args_str = env['STRESS_TEST_ARGS_STR']
  metrics_client_cmd = env['METRICS_CLIENT_CMD'].split()
  metrics_client_args_str = env['METRICS_CLIENT_ARGS_STR']
  run_id = env['RUN_ID']
  pod_name = env['POD_NAME']
  logfile_name = env.get('LOGFILE_NAME')
  poll_interval_secs = float(env['POLL_INTERVAL_SECS'])
  project_id = env['GCP_PROJECT_ID']
  dataset_id = env['DATASET_ID']
  summary_table_id = env['SUMMARY_TABLE_ID']
  qps_table_id = env['QPS_TABLE_ID']
  # The following parameter tells us whether the stress client runs forever
  # until forcefully stopped or will naturally stop after some time. If it is
  # expected to run forever, we know the process should not terminate (even
  # with a success exit code) and we flag any termination as a failure
  will_run_forever = env.get('WILL_RUN_FOREVER', '1')

  bq_helper = BigQueryHelper(run_id, image_type, pod_name, project_id,
                             dataset_id, summary_table_id, qps_table_id)
  bq_helper.initialize()

  # Create BigQuery Dataset and Tables: Summary Table and Metrics Table
  if not bq_helper.setup_tables():
    print 'Error in creating BigQuery tables'
    return

  start_time = datetime.datetime.now()

  logfile = None
  details = 'Logging to stdout'
  if logfile_name is not None:
    print 'Opening logfile: %s ...' % logfile_name
    details = 'Logfile: %s' % logfile_name
    logfile = open(logfile_name, 'w')

  metrics_cmd = metrics_client_cmd + metrics_client_args_str.split()
  stress_cmd = stress_client_cmd + args_str.split()

  details = '%s, Metrics command: %s, Stress client command: %s' % (
      details, str(metrics_cmd), str(stress_cmd))
  # Update status that the test is starting (in the status table)
  bq_helper.insert_summary_row(EventType.STARTING, details)

  print 'Launching process %s ...' % stress_cmd
  stress_p = subprocess.Popen(args=stress_cmd,
                              stdout=logfile,
                              stderr=subprocess.STDOUT)

  qps_history = [1, 1, 1]  # Maintain the last 3 qps readings
  qps_history_idx = 0  # Index into the qps_history list

  is_running_status_written = False
  is_error = False
  while True:
    # Check if stress_client is still running. If so, collect metrics and upload
    # to BigQuery status table
    # If stress_p.poll() is not None, it means that the stress client terminated
    if stress_p.poll() is not None:
      end_time = datetime.datetime.now().isoformat()
      event_type = EventType.SUCCESS
      details = 'End time: %s' % end_time
      if will_run_forever == '1' or stress_p.returncode != 0:
        event_type = EventType.FAILURE
        details = 'Return code = %d. End time: %s' % (stress_p.returncode,
                                                      end_time)
        is_error = True
      bq_helper.insert_summary_row(event_type, details)
      print details
      break

    if not is_running_status_written:
      bq_helper.insert_summary_row(EventType.RUNNING, '')
      is_running_status_written = True

    # Stress client still running. Get metrics
    qps = _get_qps(metrics_cmd)
    qps_recorded_at = datetime.datetime.now().isoformat()
    print 'qps: %d at %s' % (qps, qps_recorded_at)

    # If QPS has been zero for the last 3 iterations, flag it as error and exit
    qps_history[qps_history_idx] = qps
    qps_history_idx = (qps_history_idx + 1) % len(qps_history)
    if sum(qps_history) == 0:
      details = 'QPS has been zero for the last %d seconds - as of: %s' % (
          poll_interval_secs * 3, qps_recorded_at)
      is_error = True
      bq_helper.insert_summary_row(EventType.FAILURE, details)
      print details
      break

    # Upload qps metrics to BigQuery
    bq_helper.insert_qps_row(qps, qps_recorded_at)

    time.sleep(poll_interval_secs)

  if is_error:
    print 'Waiting indefinitely..'
    select.select([], [], [])

  print 'Completed'
  return
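
Example #2 (and the other wrappers below) records EventType constants in the summary table, but EventType is never defined in this listing. A minimal stand-in could look like the following; only the attribute names are taken from the call sites, and the string values are assumptions.

class EventType(object):
  """Sketch of the event-type constants written to the summary table.

  The names STARTING, RUNNING, SUCCESS and FAILURE come from the call sites
  in this listing; the string values below are assumptions.
  """
  STARTING = 'STARTING'
  RUNNING = 'RUNNING'
  SUCCESS = 'SUCCESS'
  FAILURE = 'FAILURE'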
Example #3
import argparse
import os
import sys

stress_test_utils_dir = os.path.abspath(
    os.path.join(os.path.dirname(__file__), '../../gcp/stress_test'))
sys.path.append(stress_test_utils_dir)
from stress_test_utils import BigQueryHelper

argp = argparse.ArgumentParser(
    description='Print summary tables',
    formatter_class=argparse.ArgumentDefaultsHelpFormatter)
argp.add_argument('--gcp_project_id',
                  required=True,
                  help='The Google Cloud Platform Project Id')
argp.add_argument('--dataset_id', type=str, required=True)
argp.add_argument('--run_id', type=str, required=True)
argp.add_argument('--summary_table_id', type=str, default='summary')
argp.add_argument('--qps_table_id', type=str, default='qps')
argp.add_argument('--summary_only',
                  action='store_true',
                  default=False,
                  help='If set, print only the summary table and skip the '
                  'qps table')

if __name__ == '__main__':
    args = argp.parse_args()
    bq_helper = BigQueryHelper(args.run_id, '', '', args.gcp_project_id,
                               args.dataset_id, args.summary_table_id,
                               args.qps_table_id)
    bq_helper.initialize()
    if not args.summary_only:
        bq_helper.print_qps_records()
    bq_helper.print_summary_records()
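
Assuming the script above is saved as print_summary.py, a possible invocation looks like this (all ids below are illustrative):

# python print_summary.py --gcp_project_id=my-gcp-project \
#     --dataset_id=stress_2016_01_01_12_00_00 \
#     --run_id=2016_01_01_12_00_00 --summary_only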
Example #4
def run_tests(config):
    """ The main function that launches the stress tests """
    # Build docker images and push to GKE registry
    if config.global_settings.build_docker_images:
        for name, docker_image in config.docker_images_dict.iteritems():
            if not (docker_image.build_image()
                    and docker_image.push_to_gke_registry()):
                return False

    # Create a unique id for this run (Note: using a timestamp instead of a
    # UUID makes it easier to deduce the date/time of the run just by looking
    # at the run id. This is useful when debugging records in BigQuery)
    run_id = datetime.datetime.now().strftime('%Y_%m_%d_%H_%M_%S')
    dataset_id = '%s_%s' % (config.global_settings.dataset_id_prefix, run_id)
    print 'Run id:', run_id
    print 'Dataset id:', dataset_id

    bq_helper = BigQueryHelper(run_id, '', '',
                               config.global_settings.gcp_project_id,
                               dataset_id,
                               config.global_settings.summary_table_id,
                               config.global_settings.qps_table_id)
    bq_helper.initialize()

    gke = Gke(config.global_settings.gcp_project_id, run_id, dataset_id,
              config.global_settings.summary_table_id,
              config.global_settings.qps_table_id,
              config.global_settings.kubernetes_proxy_port)

    is_success = True

    try:
        print 'Launching servers..'
        for name, server_pod_spec in config.server_pod_specs_dict.iteritems():
            if not gke.launch_servers(server_pod_spec):
                is_success = False  # is_success is checked in the 'finally' block
                return False

        print('Launched servers. Waiting for %d seconds for the server pods '
              'to be fully online' %
              config.global_settings.pod_warmup_secs)
        time.sleep(config.global_settings.pod_warmup_secs)

        for name, client_pod_spec in config.client_pod_specs_dict.iteritems():
            if not gke.launch_clients(client_pod_spec):
                is_success = False  # is_success is checked in the 'finally' block
                return False

        print('Launched all clients. Waiting for %d seconds for the client '
              'pods to be fully online' %
              config.global_settings.pod_warmup_secs)
        time.sleep(config.global_settings.pod_warmup_secs)

        start_time = datetime.datetime.now()
        end_time = start_time + datetime.timedelta(
            seconds=config.global_settings.test_duration_secs)
        print 'Running the test until %s' % end_time.isoformat()

        while True:
            if datetime.datetime.now() > end_time:
                print 'Test was run for %d seconds' % config.global_settings.test_duration_secs
                break

            # Check if either the stress server or the clients have failed
            # (bq_helper scans all the rows in the summary table and checks if
            # any of them have a failure status)
            if bq_helper.check_if_any_tests_failed():
                is_success = False
                print 'Some tests failed.'
                break  # Don't 'return' here. We still want to call bq_helper to print qps/summary tables

            # Tests running fine. Wait until next poll time to check the status
            print 'Sleeping for %d seconds..' % config.global_settings.test_poll_interval_secs
            time.sleep(config.global_settings.test_poll_interval_secs)

        # Print BigQuery tables
        bq_helper.print_qps_records()
        bq_helper.print_summary_records()

    finally:
        # If there was a test failure, we should not delete the pods since
        # they would contain useful debug information (logs, core dumps, etc.)
        if is_success:
            for name, server_pod_spec in config.server_pod_specs_dict.iteritems():
                gke.delete_servers(server_pod_spec)
            for name, client_pod_spec in config.client_pod_specs_dict.iteritems():
                gke.delete_clients(client_pod_spec)

    return is_success
Example #5
def run_test_main(test_settings, gke_settings, stress_server_settings,
                  stress_client_settings):
  is_success = True

  if test_settings.build_docker_image:
    is_success = _build_docker_image(gke_settings.docker_image_name,
                                     gke_settings.tag_name)
    if not is_success:
      return False

    is_success = _push_docker_image_to_gke_registry(gke_settings.tag_name)
    if not is_success:
      return False

  # Create a unique id for this run (Note: using a timestamp instead of a UUID
  # makes it easier to deduce the date/time of the run just by looking at the
  # run id. This is useful when debugging records in BigQuery)
  run_id = datetime.datetime.now().strftime('%Y_%m_%d_%H_%M_%S')
  dataset_id = '%s_%s' % (_DATASET_ID_PREFIX, run_id)

  # Big Query settings (common for both Stress Server and Client)
  bq_settings = BigQuerySettings(run_id, dataset_id, _SUMMARY_TABLE_ID,
                                 _QPS_TABLE_ID)

  bq_helper = BigQueryHelper(run_id, '', '', args.project_id, dataset_id,
                             _SUMMARY_TABLE_ID, _QPS_TABLE_ID)
  bq_helper.initialize()

  try:
    is_success = _launch_server_and_client(gke_settings, stress_server_settings,
                                           stress_client_settings, bq_settings,
                                           test_settings.kubernetes_proxy_port)
    if not is_success:
      return False

    start_time = datetime.datetime.now()
    end_time = start_time + datetime.timedelta(
        seconds=test_settings.test_duration_secs)
    print 'Running the test until %s' % end_time.isoformat()

    while True:
      if datetime.datetime.now() > end_time:
        print 'Test was run for %d seconds' % test_settings.test_duration_secs
        break

      # Check if either stress server or clients have failed
      if bq_helper.check_if_any_tests_failed():
        is_success = False
        print 'Some tests failed.'
        break

      # Things seem to be running fine. Wait until next poll time to check the
      # status
      print 'Sleeping for %d seconds..' % test_settings.test_poll_interval_secs
      time.sleep(test_settings.test_poll_interval_secs)

    # Print BigQuery tables
    bq_helper.print_summary_records()
    bq_helper.print_qps_records()

  finally:
    # If is_success is False at this point, it means that the stress tests
    # started successfully but failed while running. In this case we should
    # not delete the pods (since they contain all the failure information)
    if is_success:
      _delete_server_and_client(stress_server_settings, stress_client_settings,
                                test_settings.kubernetes_proxy_port)

  return is_success
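
BigQuerySettings is not defined in this listing. Judging from the single call site above it is a plain value holder, so a minimal sketch could be a namedtuple; the field names are inferred from the constructor arguments.

import collections

# Sketch of BigQuerySettings inferred from the call site above; the real
# class is not shown in this listing.
BigQuerySettings = collections.namedtuple(
    'BigQuerySettings',
    ['run_id', 'dataset_id', 'summary_table_id', 'qps_table_id'])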
Example #6
import argparse
import os
import sys

stress_test_utils_dir = os.path.abspath(os.path.join(
    os.path.dirname(__file__), '../../gcp/stress_test'))
sys.path.append(stress_test_utils_dir)
from stress_test_utils import BigQueryHelper

argp = argparse.ArgumentParser(
    description='Print summary tables',
    formatter_class=argparse.ArgumentDefaultsHelpFormatter)
argp.add_argument('--gcp_project_id',
                  required=True,
                  help='The Google Cloud Platform Project Id')
argp.add_argument('--dataset_id', type=str, required=True)
argp.add_argument('--run_id', type=str, required=True)
argp.add_argument('--summary_table_id', type=str, default='summary')
argp.add_argument('--qps_table_id', type=str, default='qps')
argp.add_argument('--summary_only',
                  action='store_true',
                  default=False,
                  help='If set, print only the summary table and skip the '
                  'qps table')

if __name__ == '__main__':
  args = argp.parse_args()
  bq_helper = BigQueryHelper(args.run_id, '', '', args.gcp_project_id,
                             args.dataset_id, args.summary_table_id,
                             args.qps_table_id)
  bq_helper.initialize()
  if not args.summary_only:
    bq_helper.print_qps_records()
  bq_helper.print_summary_records()
Example #7
def run_server():
    """This is a wrapper around the interop server and performs the following:
      1) Create a 'Summary table' in Big Query to record events like the server
         started, completed successfully or failed. NOTE: This also creates
         another table called the QPS table which is currently NOT needed on the
         server (it is needed on the stress test clients)
      2) Start the server process and add a row in Big Query summary table
      3) Wait for the server process to terminate. The server process does not
         terminate unless there is an error.
         If the server process terminated with a failure, add a row in Big Query
         and wait forever.
         NOTE: This script typically runs inside a GKE pod which means that the
         pod gets destroyed when the script exits. However, in case the server
         process fails, we would not want the pod to be destroyed (since we
         might want to connect to the pod for examining logs). This is the
         reason why the script waits forever in case of failures.
  """
    # Set the 'core file' size to 'unlimited' so that 'core' files are generated
    # if the server crashes (Note: This is not relevant for Java and Go servers)
    resource.setrlimit(resource.RLIMIT_CORE,
                       (resource.RLIM_INFINITY, resource.RLIM_INFINITY))

    # Read the parameters from environment variables
    env = dict(os.environ)

    run_id = env['RUN_ID']  # The unique run id for this test
    image_type = env['STRESS_TEST_IMAGE_TYPE']
    stress_server_cmd = env['STRESS_TEST_CMD'].split()
    args_str = env['STRESS_TEST_ARGS_STR']
    pod_name = env['POD_NAME']
    project_id = env['GCP_PROJECT_ID']
    dataset_id = env['DATASET_ID']
    summary_table_id = env['SUMMARY_TABLE_ID']
    qps_table_id = env['QPS_TABLE_ID']
    # The following parameter tells us whether the server runs forever until
    # forcefully stopped or will naturally stop after some time. If it is
    # expected to run forever, we know the process should not terminate (even
    # with a success exit code) and we flag any termination as a failure.
    will_run_forever = env.get('WILL_RUN_FOREVER', '1')

    logfile_name = env.get('LOGFILE_NAME')

    print('pod_name: %s, project_id: %s, run_id: %s, dataset_id: %s, '
          'summary_table_id: %s, qps_table_id: %s' %
          (pod_name, project_id, run_id, dataset_id, summary_table_id,
           qps_table_id))

    bq_helper = BigQueryHelper(run_id, image_type, pod_name, project_id,
                               dataset_id, summary_table_id, qps_table_id)
    bq_helper.initialize()

    # Create BigQuery Dataset and Tables: Summary Table and Metrics Table
    if not bq_helper.setup_tables():
        print 'Error in creating BigQuery tables'
        return

    start_time = datetime.datetime.now()

    logfile = None
    details = 'Logging to stdout'
    if logfile_name is not None:
        print 'Opening log file: ', logfile_name
        logfile = open(logfile_name, 'w')
        details = 'Logfile: %s' % logfile_name

    # Update status that the test is starting (in the status table)
    bq_helper.insert_summary_row(EventType.STARTING, details)

    stress_cmd = stress_server_cmd + args_str.split()

    print 'Launching process %s ...' % stress_cmd
    stress_p = subprocess.Popen(args=stress_cmd,
                                stdout=logfile,
                                stderr=subprocess.STDOUT)

    returncode = stress_p.wait()

    if will_run_forever == '1' or returncode != 0:
        end_time = datetime.datetime.now().isoformat()
        event_type = EventType.FAILURE
        details = 'Returncode: %d; End time: %s' % (returncode, end_time)
        bq_helper.insert_summary_row(event_type, details)
        print 'Waiting indefinitely..'
        select.select([], [], [])
    return returncode
Example #8
def run_client():
  """This is a wrapper around the stress test client and performs the following:
      1) Create the following two tables in Big Query:
         (i) Summary table: To record events like the test started, completed
                            successfully or failed
        (ii) Qps table: To periodically record the QPS sent by this client
      2) Start the stress test client and add a row in the Big Query summary
         table
      3) Once every few seconds (as specified by poll_interval_secs) poll
         the status of the stress test client process and perform the
         following:
          3.1) If the process is still running, get the current qps by invoking
               the metrics client program and add a row in the Big Query
               Qps table. Sleep for a duration specified by poll_interval_secs
          3.2) If the process exited successfully, add a row in the Big Query
               Summary table and exit
          3.3) If the process failed, add a row in Big Query summary table and
               wait forever.
               NOTE: This script typically runs inside a GKE pod which means
               that the pod gets destroyed when the script exits. However, in
               case the stress test client fails, we would not want the pod to
               be destroyed (since we might want to connect to the pod for
               examining logs). This is the reason why the script waits forever
               in case of failures
  """
  env = dict(os.environ)
  image_type = env['STRESS_TEST_IMAGE_TYPE']
  image_name = env['STRESS_TEST_IMAGE']
  args_str = env['STRESS_TEST_ARGS_STR']
  metrics_client_image = env['METRICS_CLIENT_IMAGE']
  metrics_client_args_str = env['METRICS_CLIENT_ARGS_STR']
  run_id = env['RUN_ID']
  pod_name = env['POD_NAME']
  logfile_name = env.get('LOGFILE_NAME')
  poll_interval_secs = float(env['POLL_INTERVAL_SECS'])
  project_id = env['GCP_PROJECT_ID']
  dataset_id = env['DATASET_ID']
  summary_table_id = env['SUMMARY_TABLE_ID']
  qps_table_id = env['QPS_TABLE_ID']

  bq_helper = BigQueryHelper(run_id, image_type, pod_name, project_id,
                             dataset_id, summary_table_id, qps_table_id)
  bq_helper.initialize()

  # Create BigQuery Dataset and Tables: Summary Table and Metrics Table
  if not bq_helper.setup_tables():
    print 'Error in creating BigQuery tables'
    return

  start_time = datetime.datetime.now()

  logfile = None
  details = 'Logging to stdout'
  if logfile_name is not None:
    print 'Opening logfile: %s ...' % logfile_name
    details = 'Logfile: %s' % logfile_name
    logfile = open(logfile_name, 'w')

  # Update status that the test is starting (in the status table)
  bq_helper.insert_summary_row(EventType.STARTING, details)

  metrics_cmd = [metrics_client_image] + metrics_client_args_str.split()
  stress_cmd = [image_name] + args_str.split()

  print 'Launching process %s ...' % stress_cmd
  stress_p = subprocess.Popen(args=stress_cmd,
                              stdout=logfile,
                              stderr=subprocess.STDOUT)

  qps_history = [1, 1, 1]  # Maintain the last 3 qps readings
  qps_history_idx = 0  # Index into the qps_history list

  is_error = False
  while True:
    # Check if stress_client is still running. If so, collect metrics and upload
    # to BigQuery status table
    if stress_p.poll() is not None:
      end_time = datetime.datetime.now().isoformat()
      event_type = EventType.SUCCESS
      details = 'End time: %s' % end_time
      if stress_p.returncode != 0:
        event_type = EventType.FAILURE
        details = 'Return code = %d. End time: %s' % (stress_p.returncode,
                                                      end_time)
        is_error = True
      bq_helper.insert_summary_row(event_type, details)
      print details
      break

    # Stress client still running. Get metrics
    qps = _get_qps(metrics_cmd)
    qps_recorded_at = datetime.datetime.now().isoformat()
    print 'qps: %d at %s' % (qps, qps_recorded_at)

    # If QPS has been zero for the last 3 iterations, flag it as error and exit
    qps_history[qps_history_idx] = qps
    qps_history_idx = (qps_history_idx + 1) % len(qps_history)
    if sum(qps_history) == 0:
      details = 'QPS has been zero for the last %d seconds - as of: %s' % (
          poll_interval_secs * 3, qps_recorded_at)
      is_error = True
      bq_helper.insert_summary_row(EventType.FAILURE, details)
      print details
      break

    # Upload qps metrics to BigQuery
    bq_helper.insert_qps_row(qps, qps_recorded_at)

    time.sleep(poll_interval_secs)

  if is_error:
    print 'Waiting indefinitely..'
    select.select([], [], [])

  print 'Completed'
  return
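
Both client wrappers call _get_qps, which is not part of this listing. The sketch below assumes the metrics client prints the overall QPS as the last integer of its output; the real output format is not shown here.

import re
import subprocess

def _get_qps(metrics_cmd):
  """Sketch of _get_qps: run the metrics client and parse its output.

  Assumes the overall QPS is the last integer the metrics client prints;
  the real output format is not shown in this listing.
  """
  qps = 0
  try:
    p = subprocess.Popen(args=metrics_cmd,
                         stdout=subprocess.PIPE,
                         stderr=subprocess.STDOUT)
    out_str, _ = p.communicate()
    if p.returncode != 0:
      print 'Error in reading metrics information'
    else:
      m = re.search(r'\d+\s*$', out_str)
      qps = int(m.group()) if m else 0
  except Exception as e:
    print 'Exception while reading metrics information: %s' % e
  return qps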
Example #9
def run_server():
  """This is a wrapper around the interop server and performs the following:
      1) Create a 'Summary table' in Big Query to record events like the server
         started, completed successfully or failed. NOTE: This also creates
         another table called the QPS table which is currently NOT needed on the
         server (it is needed on the stress test clients)
      2) Start the server process and add a row in Big Query summary table
      3) Wait for the server process to terminate. The server process does not
         terminate unless there is an error.
         If the server process terminated with a failure, add a row in Big Query
         and wait forever.
         NOTE: This script typically runs inside a GKE pod which means that the
         pod gets destroyed when the script exits. However, in case the server
         process fails, we would not want the pod to be destroyed (since we
         might want to connect to the pod for examining logs). This is the
         reason why the script waits forever in case of failures.
  """

  # Read the parameters from environment variables
  env = dict(os.environ)

  run_id = env['RUN_ID']  # The unique run id for this test
  image_type = env['STRESS_TEST_IMAGE_TYPE']
  stress_server_cmd = env['STRESS_TEST_CMD'].split()
  args_str = env['STRESS_TEST_ARGS_STR']
  pod_name = env['POD_NAME']
  project_id = env['GCP_PROJECT_ID']
  dataset_id = env['DATASET_ID']
  summary_table_id = env['SUMMARY_TABLE_ID']
  qps_table_id = env['QPS_TABLE_ID']

  logfile_name = env.get('LOGFILE_NAME')

  print('pod_name: %s, project_id: %s, run_id: %s, dataset_id: %s, '
        'summary_table_id: %s, qps_table_id: %s' %
        (pod_name, project_id, run_id, dataset_id, summary_table_id,
         qps_table_id))

  bq_helper = BigQueryHelper(run_id, image_type, pod_name, project_id,
                             dataset_id, summary_table_id, qps_table_id)
  bq_helper.initialize()

  # Create BigQuery Dataset and Tables: Summary Table and Metrics Table
  if not bq_helper.setup_tables():
    print 'Error in creating BigQuery tables'
    return

  start_time = datetime.datetime.now()

  logfile = None
  details = 'Logging to stdout'
  if logfile_name is not None:
    print 'Opening log file: ', logfile_name
    logfile = open(logfile_name, 'w')
    details = 'Logfile: %s' % logfile_name

  # Update status that the test is starting (in the status table)
  bq_helper.insert_summary_row(EventType.STARTING, details)

  stress_cmd = stress_server_cmd + args_str.split()

  print 'Launching process %s ...' % stress_cmd
  stress_p = subprocess.Popen(args=stress_cmd,
                              stdout=logfile,
                              stderr=subprocess.STDOUT)

  returncode = stress_p.wait()
  if returncode != 0:
    end_time = datetime.datetime.now().isoformat()
    event_type = EventType.FAILURE
    details = 'Returncode: %d; End time: %s' % (returncode, end_time)
    bq_helper.insert_summary_row(event_type, details)
    print 'Waiting indefinitely..'
    select.select([], [], [])
  return returncode
Example #10
def run_server():
  """This is a wrapper around the interop server and performs the following:
      1) Create a 'Summary table' in Big Query to record events like the server
         started, completed successfully or failed. NOTE: This also creates
         another table called the QPS table which is currently NOT needed on the
         server (it is needed on the stress test clients)
      2) Start the server process and add a row in Big Query summary table
      3) Wait for the server process to terminate. The server process does not
         terminate unless there is an error.
         If the server process terminated with a failure, add a row in Big Query
         and wait forever.
         NOTE: This script typically runs inside a GKE pod which means that the
         pod gets destroyed when the script exits. However, in case the server
         process fails, we would not want the pod to be destroyed (since we
         might want to connect to the pod for examining logs). This is the
         reason why the script waits forever in case of failures.
  """
  # Set the 'core file' size to 'unlimited' so that 'core' files are generated
  # if the server crashes (Note: This is not relevant for Java and Go servers)
  resource.setrlimit(resource.RLIMIT_CORE,
                     (resource.RLIM_INFINITY, resource.RLIM_INFINITY))

  # Read the parameters from environment variables
  env = dict(os.environ)

  run_id = env['RUN_ID']  # The unique run id for this test
  image_type = env['STRESS_TEST_IMAGE_TYPE']
  stress_server_cmd = env['STRESS_TEST_CMD'].split()
  args_str = env['STRESS_TEST_ARGS_STR']
  pod_name = env['POD_NAME']
  project_id = env['GCP_PROJECT_ID']
  dataset_id = env['DATASET_ID']
  summary_table_id = env['SUMMARY_TABLE_ID']
  qps_table_id = env['QPS_TABLE_ID']
  # The following parameter tells us whether the server runs forever until
  # forcefully stopped or will naturally stop after some time. If it is
  # expected to run forever, we know the process should not terminate (even
  # with a success exit code) and we flag any termination as a failure.
  will_run_forever = env.get('WILL_RUN_FOREVER', '1')

  logfile_name = env.get('LOGFILE_NAME')

  print('pod_name: %s, project_id: %s, run_id: %s, dataset_id: %s, '
        'summary_table_id: %s, qps_table_id: %s' %
        (pod_name, project_id, run_id, dataset_id, summary_table_id,
         qps_table_id))

  bq_helper = BigQueryHelper(run_id, image_type, pod_name, project_id,
                             dataset_id, summary_table_id, qps_table_id)
  bq_helper.initialize()

  # Create BigQuery Dataset and Tables: Summary Table and Metrics Table
  if not bq_helper.setup_tables():
    print 'Error in creating BigQuery tables'
    return

  start_time = datetime.datetime.now()

  logfile = None
  details = 'Logging to stdout'
  if logfile_name is not None:
    print 'Opening log file: ', logfile_name
    logfile = open(logfile_name, 'w')
    details = 'Logfile: %s' % logfile_name

  stress_cmd = stress_server_cmd + args_str.split()

  details = '%s, Stress server command: %s' % (details, str(stress_cmd))
  # Update status that the test is starting (in the status table)
  bq_helper.insert_summary_row(EventType.STARTING, details)

  print 'Launching process %s ...' % stress_cmd
  stress_p = subprocess.Popen(args=stress_cmd,
                              stdout=logfile,
                              stderr=subprocess.STDOUT)

  # Update the status to running if subprocess.Popen launched the server
  if stress_p.poll() is None:
    bq_helper.insert_summary_row(EventType.RUNNING, '')

  # Wait for the server process to terminate
  returncode = stress_p.wait()

  if will_run_forever == '1' or returncode != 0:
    end_time = datetime.datetime.now().isoformat()
    event_type = EventType.FAILURE
    details = 'Returncode: %d; End time: %s' % (returncode, end_time)
    bq_helper.insert_summary_row(event_type, details)
    print 'Waiting indefinitely..'
    select.select([], [], [])
  return returncode
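
These wrappers are configured entirely through environment variables that the GKE pod spec would normally provide. To exercise run_server() outside a pod, the environment could be seeded as below; every value is illustrative and not taken from a real deployment.

import os

# Illustrative environment for a local dry run of run_server(); none of
# these values come from a real deployment.
os.environ.update({
    'RUN_ID': '2016_01_01_12_00_00',
    'STRESS_TEST_IMAGE_TYPE': 'cxx',
    'STRESS_TEST_CMD': '/path/to/interop_server',  # hypothetical binary path
    'STRESS_TEST_ARGS_STR': '--port=8080',
    'POD_NAME': 'stress-server-cxx-0',
    'GCP_PROJECT_ID': 'my-gcp-project',
    'DATASET_ID': 'stress_2016_01_01_12_00_00',
    'SUMMARY_TABLE_ID': 'summary',
    'QPS_TABLE_ID': 'qps',
    'WILL_RUN_FOREVER': '1',
    'LOGFILE_NAME': '/tmp/stress_server.log',
})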
Example #11
def run_test_main(test_settings, gke_settings, stress_server_settings,
                  stress_client_settings):
    is_success = True

    if test_settings.build_docker_image:
        is_success = _build_docker_image(gke_settings.docker_image_name,
                                         gke_settings.tag_name)
        if not is_success:
            return False

        is_success = _push_docker_image_to_gke_registry(gke_settings.tag_name)
        if not is_success:
            return False

    # Create a unique id for this run (Note: using a timestamp instead of a
    # UUID makes it easier to deduce the date/time of the run just by looking
    # at the run id. This is useful when debugging records in BigQuery)
    run_id = datetime.datetime.now().strftime('%Y_%m_%d_%H_%M_%S')
    dataset_id = '%s_%s' % (_DATASET_ID_PREFIX, run_id)

    # Big Query settings (common for both Stress Server and Client)
    bq_settings = BigQuerySettings(run_id, dataset_id, _SUMMARY_TABLE_ID,
                                   _QPS_TABLE_ID)

    bq_helper = BigQueryHelper(run_id, '', '', args.project_id, dataset_id,
                               _SUMMARY_TABLE_ID, _QPS_TABLE_ID)
    bq_helper.initialize()

    try:
        is_success = _launch_server_and_client(
            gke_settings, stress_server_settings, stress_client_settings,
            bq_settings, test_settings.kubernetes_proxy_port)
        if not is_success:
            return False

        start_time = datetime.datetime.now()
        end_time = start_time + datetime.timedelta(
            seconds=test_settings.test_duration_secs)
        print 'Running the test until %s' % end_time.isoformat()

        while True:
            if datetime.datetime.now() > end_time:
                print 'Test was run for %d seconds' % test_settings.test_duration_secs
                break

            # Check if either stress server or clients have failed
            if bq_helper.check_if_any_tests_failed():
                is_success = False
                print 'Some tests failed.'
                break

            # Things seem to be running fine. Wait until next poll time to check the
            # status
            print 'Sleeping for %d seconds..' % test_settings.test_poll_interval_secs
            time.sleep(test_settings.test_poll_interval_secs)

        # Print BigQuery tables
        bq_helper.print_summary_records()
        bq_helper.print_qps_records()

    finally:
        # If is_success is False at this point, it means that the stress tests
        # started successfully but failed while running. In this case we
        # should not delete the pods (since they contain all the failure
        # information)
        if is_success:
            _delete_server_and_client(stress_server_settings,
                                      stress_client_settings,
                                      test_settings.kubernetes_proxy_port)

    return is_success