Example #1
0
def report_outcome(config, archive, summary, fake_ctx):
    """ Reports on the final outcome of the command. """
    status = get_status(summary)
    passed = status == 'pass'

    if not passed and bool(config.get('nuke-on-error')):
        # only unlock if we locked them in the first place
        nuke(fake_ctx, fake_ctx.lock)

    if archive is not None:
        with file(os.path.join(archive, 'summary.yaml'), 'w') as f:
            yaml.safe_dump(summary, f, default_flow_style=False)

    with contextlib.closing(StringIO.StringIO()) as f:
        yaml.safe_dump(summary, f)
        log.info('Summary data:\n%s' % f.getvalue())

    with contextlib.closing(StringIO.StringIO()) as f:
        if ('email-on-error' in config and not passed):
            yaml.safe_dump(summary, f)
            yaml.safe_dump(config, f)
            emsg = f.getvalue()
            subject = "Teuthology error -- %s" % summary['failure_reason']
            email_results(subject, "Teuthology", config['email-on-error'],
                          emsg)

    report.try_push_job_info(config, summary)

    if passed:
        log.info(status)
    else:
        log.info(str(status).upper())
        sys.exit(1)
Example #2
0
def run_with_watchdog(process, job_config):
    job_start_time = datetime.utcnow()

    # Only push the information that's relevant to the watchdog, to save db
    # load
    job_info = dict(
        name=job_config['name'],
        job_id=job_config['job_id'],
    )

    # Sleep once outside of the loop to avoid double-posting jobs
    time.sleep(teuth_config.watchdog_interval)
    hit_max_timeout = False
    while process.poll() is None:
        # Kill jobs that have been running longer than the global max
        run_time = datetime.utcnow() - job_start_time
        total_seconds = run_time.days * 60 * 60 * 24 + run_time.seconds
        if total_seconds > teuth_config.max_job_time:
            hit_max_timeout = True
            log.warning("Job ran longer than {max}s. Killing...".format(
                max=teuth_config.max_job_time))
            try:
                # kill processes but do not unlock yet so we can save
                # the logs, coredumps, etc.
                kill_job(job_info['name'], job_info['job_id'],
                         teuth_config.archive_base, job_config['owner'],
                         save_logs=True)
            except Exception:
                log.exception('Failed to kill job')

            try:
                transfer_archives(job_info['name'], job_info['job_id'],
                                  teuth_config.archive_base, job_config)
            except Exception:
                log.exception('Could not save logs')

            try:
                # this time remove everything and unlock the machines
                kill_job(job_info['name'], job_info['job_id'],
                         teuth_config.archive_base, job_config['owner'])
            except Exception:
                log.exception('Failed to kill job and unlock machines')

        # calling this without a status just updates the job's updated time
        report.try_push_job_info(job_info)
        time.sleep(teuth_config.watchdog_interval)

    # we no longer support testing these old branches
    assert(job_config.get('teuthology_branch') not in ('argonaut', 'bobtail',
                                                       'cuttlefish', 'dumpling'))

    # Let's make sure that paddles knows the job is finished. We don't know
    # the status, but if it was a pass or fail it will have already been
    # reported to paddles. In that case paddles ignores the 'dead' status.
    # If the job was killed, paddles will use the 'dead' status.
    extra_info = dict(status='dead')
    if hit_max_timeout:
        extra_info['failure_reason'] = 'hit max job timeout'
    report.try_push_job_info(job_info, extra_info)
Example #3
0
def lock_machines(job_config):
    report.try_push_job_info(job_config, dict(status='running'))
    fake_ctx = supervisor.create_fake_context(job_config, block=True)
    block_and_lock_machines(fake_ctx,
                            len(job_config['roles']),
                            job_config['machine_type'],
                            reimage=False)
    job_config = fake_ctx.config
    return job_config
Example #4
0
def run_with_watchdog(process, job_config):
    job_start_time = datetime.utcnow()

    # Only push the information that's relevant to the watchdog, to save db
    # load
    job_info = dict(
        name=job_config['name'],
        job_id=job_config['job_id'],
    )

    # Sleep once outside of the loop to avoid double-posting jobs
    time.sleep(teuth_config.watchdog_interval)
    symlink_worker_log(job_config['worker_log'], job_config['archive_path'])
    while process.poll() is None:
        # Kill jobs that have been running longer than the global max
        run_time = datetime.utcnow() - job_start_time
        total_seconds = run_time.days * 60 * 60 * 24 + run_time.seconds
        if total_seconds > teuth_config.max_job_time:
            log.warning("Job ran longer than {max}s. Killing...".format(
                max=teuth_config.max_job_time))
            kill_job(job_info['name'], job_info['job_id'],
                     teuth_config.archive_base, job_config['owner'])

        # calling this without a status just updates the job's updated time
        report.try_push_job_info(job_info)
        time.sleep(teuth_config.watchdog_interval)

    # The job finished. Let's make sure paddles knows.
    branches_sans_reporting = ('argonaut', 'bobtail', 'cuttlefish', 'dumpling')
    if job_config.get('teuthology_branch') in branches_sans_reporting:
        # The job ran with a teuthology branch that may not have the reporting
        # feature. Let's call teuthology-report (which will be from the master
        # branch) to report the job manually.
        cmd = "teuthology-report -v -D -r {run_name} -j {job_id}".format(
            run_name=job_info['name'], job_id=job_info['job_id'])
        try:
            log.info("Executing %s" % cmd)
            report_proc = subprocess.Popen(cmd,
                                           shell=True,
                                           stdout=subprocess.PIPE,
                                           stderr=subprocess.STDOUT)
            while report_proc.poll() is None:
                for line in report_proc.stdout.readlines():
                    log.info(line.strip())
                time.sleep(1)
            log.info("Reported results via the teuthology-report command")
        except Exception:
            log.exception("teuthology-report failed")
    else:
        # Let's make sure that paddles knows the job is finished. We don't know
        # the status, but if it was a pass or fail it will have already been
        # reported to paddles. In that case paddles ignores the 'dead' status.
        # If the job was killed, paddles will use the 'dead' status.
        report.try_push_job_info(job_info, dict(status='dead'))
Example #5
0
def check_packages(ctx, config):
    """
    Checks gitbuilder to determine if there are missing packages for this job.

    If there are missing packages, fail the job.
    """
    for task in ctx.config['tasks']:
        if task.keys()[0] == 'buildpackages':
            log.info("Checking packages skipped because "
                     "the task buildpackages was found.")
            return

    log.info("Checking packages...")
    os_type = ctx.config.get("os_type")
    sha1 = ctx.config.get("sha1")
    # We can only do this check if both sha1 and os_type are defined
    # in the job config.
    if os_type and sha1:
        package = get_builder_project()("ceph", ctx.config)
        template = "Checking packages for os_type '{os}', " \
            "flavor '{flav}' and ceph hash '{ver}'"
        log.info(
            template.format(
                os=package.os_type,
                flav=package.flavor,
                ver=package.sha1,
            )
        )
        if package.version:
            log.info("Found packages for ceph version {ver}".format(
                ver=package.version
            ))
        else:
            msg = "Packages for distro '{d}' and ceph hash '{ver}' not found"
            msg = msg.format(
                d=package.distro,
                ver=package.sha1,
            )
            log.error(msg)
            # set the failure message and update paddles with the status
            ctx.summary["failure_reason"] = msg
            set_status(ctx.summary, "dead")
            report.try_push_job_info(ctx.config, dict(status='dead'))
            raise VersionNotFoundError(package.base_url)
    else:
        log.info(
            "Checking packages skipped, missing os_type '{os}' or ceph hash '{ver}'".format(
                os=os_type,
                ver=sha1,
            )
        )
Example #6
0
def prep_job(job_config, log_file_path, archive_dir):
    job_id = job_config['job_id']
    safe_archive = safepath.munge(job_config['name'])
    job_config['worker_log'] = log_file_path
    archive_path_full = os.path.join(archive_dir, safe_archive, str(job_id))
    job_config['archive_path'] = archive_path_full

    # If the teuthology branch was not specified, default to master and
    # store that value.
    teuthology_branch = job_config.get('teuthology_branch', 'master')
    job_config['teuthology_branch'] = teuthology_branch

    try:
        if teuth_config.teuthology_path is not None:
            teuth_path = teuth_config.teuthology_path
        else:
            teuth_path = fetch_teuthology(branch=teuthology_branch)
        # For the teuthology tasks, we look for suite_branch, and if we
        # don't get that, we look for branch, and fall back to 'master'.
        # last-in-suite jobs don't have suite_branch or branch set.
        ceph_branch = job_config.get('branch', 'master')
        suite_branch = job_config.get('suite_branch', ceph_branch)
        suite_repo = job_config.get('suite_repo')
        if suite_repo:
            teuth_config.ceph_qa_suite_git_url = suite_repo
        job_config['suite_path'] = os.path.normpath(
            os.path.join(
                fetch_qa_suite(suite_branch),
                job_config.get('suite_relpath', ''),
            ))
    except BranchNotFoundError as exc:
        log.exception("Branch not found; marking job as dead")
        report.try_push_job_info(job_config,
                                 dict(status='dead', failure_reason=str(exc)))
        raise SkipJob()
    except MaxWhileTries as exc:
        log.exception("Failed to fetch or bootstrap; marking job as dead")
        report.try_push_job_info(job_config,
                                 dict(status='dead', failure_reason=str(exc)))
        raise SkipJob()

    teuth_bin_path = os.path.join(teuth_path, 'virtualenv', 'bin')
    if not os.path.isdir(teuth_bin_path):
        raise RuntimeError("teuthology branch %s at %s not bootstrapped!" %
                           (teuthology_branch, teuth_bin_path))
    return job_config, teuth_bin_path
Example #7
0
def schedule_job(job_config, num=1):
    """
    Schedule a job.

    :param job_config: The complete job dict
    :param num:      The number of times to schedule the job
    """
    num = int(num)
    job = yaml.safe_dump(job_config)
    tube = job_config.pop('tube')
    beanstalk = teuthology.beanstalk.connect()
    beanstalk.use(tube)
    while num > 0:
        jid = beanstalk.put(
            job,
            ttr=60 * 60 * 24,
            priority=job_config['priority'],
        )
        print 'Job scheduled with name {name} and ID {jid}'.format(
            name=job_config['name'], jid=jid)
        job_config['job_id'] = str(jid)
        report.try_push_job_info(job_config, dict(status='queued'))
        num -= 1
Example #8
0
def schedule_job(job_config, num=1):
    """
    Schedule a job.

    :param job_config: The complete job dict
    :param num:      The number of times to schedule the job
    """
    num = int(num)
    job = yaml.safe_dump(job_config)
    tube = job_config.pop('tube')
    beanstalk = teuthology.beanstalk.connect()
    beanstalk.use(tube)
    while num > 0:
        jid = beanstalk.put(
            job,
            ttr=60 * 60 * 24,
            priority=job_config['priority'],
        )
        print 'Job scheduled with name {name} and ID {jid}'.format(
            name=job_config['name'], jid=jid)
        job_config['job_id'] = str(jid)
        report.try_push_job_info(job_config, dict(status='queued'))
        num -= 1
Example #9
0
def run_with_watchdog(process, job_config):
    job_start_time = datetime.utcnow()

    # Only push the information that's relevant to the watchdog, to save db
    # load
    job_info = dict(
        name=job_config['name'],
        job_id=job_config['job_id'],
    )

    # Sleep once outside of the loop to avoid double-posting jobs
    time.sleep(teuth_config.watchdog_interval)
    symlink_worker_log(job_config['worker_log'], job_config['archive_path'])
    while process.poll() is None:
        # Kill jobs that have been running longer than the global max
        run_time = datetime.utcnow() - job_start_time
        total_seconds = run_time.days * 60 * 60 * 24 + run_time.seconds
        if total_seconds > teuth_config.max_job_time:
            log.warning("Job ran longer than {max}s. Killing...".format(
                max=teuth_config.max_job_time))
            kill_job(job_info['name'], job_info['job_id'],
                     teuth_config.archive_base, job_config['owner'])

        # calling this without a status just updates the job's updated time
        report.try_push_job_info(job_info)
        time.sleep(teuth_config.watchdog_interval)

    # we no longer support testing these old branches
    assert (job_config.get('teuthology_branch')
            not in ('argonaut', 'bobtail', 'cuttlefish', 'dumpling'))

    # Let's make sure that paddles knows the job is finished. We don't know
    # the status, but if it was a pass or fail it will have already been
    # reported to paddles. In that case paddles ignores the 'dead' status.
    # If the job was killed, paddles will use the 'dead' status.
    report.try_push_job_info(job_info, dict(status='dead'))
Example #10
0
def reimage(job_config):
    # Reimage the targets specified in job config
    # and update their keys in config after reimaging
    ctx = create_fake_context(job_config)
    # change the status during the reimaging process
    report.try_push_job_info(ctx.config, dict(status='waiting'))
    targets = job_config['targets']
    try:
        reimaged = reimage_machines(ctx, targets, job_config['machine_type'])
    except Exception as e:
        log.exception('Reimaging error. Nuking machines...')
        # Reimage failures should map to the 'dead' status instead of 'fail'
        report.try_push_job_info(
            ctx.config,
            dict(status='dead',
                 failure_reason='Error reimaging machines: ' + str(e)))
        nuke(ctx, True)
        raise
    ctx.config['targets'] = reimaged
    # change the status to running after the reimaging process
    report.try_push_job_info(ctx.config, dict(status='running'))
Example #11
0
def main(args):
    verbose = args["--verbose"]
    archive = args["--archive"]
    owner = args["--owner"]
    config = args["<config>"]
    name = args["--name"]
    description = args["--description"]
    machine_type = args["--machine-type"]
    block = args["--block"]
    lock = args["--lock"]
    suite_path = args["--suite-path"]
    os_type = args["--os-type"]
    os_version = args["--os-version"]

    set_up_logging(verbose, archive)

    # print the command being run
    log.debug("Teuthology command: {0}".format(get_teuthology_command(args)))

    if owner is None:
        args["--owner"] = owner = get_user()

    config = setup_config(config)

    if archive is not None and 'archive_path' not in config:
        config['archive_path'] = archive

    write_initial_metadata(archive, config, name, description, owner)
    report.try_push_job_info(config, dict(status='running'))

    machine_type = get_machine_type(machine_type, config)
    args["--machine-type"] = machine_type

    if block:
        assert lock, \
            'the --block option is only supported with the --lock option'

    log.info('\n  '.join([
        'Config:',
    ] + yaml.safe_dump(config, default_flow_style=False).splitlines()))

    args["summary"] = get_summary(owner, description)

    ceph_repo = config.get('repo')
    if ceph_repo:
        teuth_config.ceph_git_url = ceph_repo
    suite_repo = config.get('suite_repo')
    if suite_repo:
        teuth_config.ceph_qa_suite_git_url = suite_repo

    # overwrite the config values of os_{type,version} if corresponding
    # command-line arguments are provided
    if os_type:
        config["os_type"] = os_type
    if os_version:
        config["os_version"] = os_version

    config["tasks"] = validate_tasks(config)

    init_tasks = get_initial_tasks(lock, config, machine_type)

    # prepend init_tasks to the front of the task list
    config['tasks'][:0] = init_tasks

    if suite_path is not None:
        config['suite_path'] = suite_path

    # fetches the tasks and returns a new suite_path if needed
    config["suite_path"] = fetch_tasks_if_needed(config)

    # If the job has a 'use_shaman' key, use that value to override the global
    # config's value.
    if config.get('use_shaman') is not None:
        teuth_config.use_shaman = config['use_shaman']

    # Create a FakeNamespace instance that mimics the old argparse way of doing
    # things. We do this so we can pass it to run_tasks without porting those
    # tasks to the new way of doing things right now.
    args["<config>"] = config
    fake_ctx = FakeNamespace(args)

    # store on global config if interactive-on-error, for contextutil.nested()
    # FIXME this should become more generic, and the keys should use
    # '_' uniformly
    if fake_ctx.config.get('interactive-on-error'):
        teuthology.config.config.ctx = fake_ctx

    try:
        run_tasks(tasks=config['tasks'], ctx=fake_ctx)
    finally:
        # print to stdout the results and possibly send an email on any errors
        report_outcome(config, archive, fake_ctx.summary, fake_ctx)
Example #12
0
def main(ctx):
    if ctx.owner is None:
        ctx.owner = 'scheduled_{user}'.format(user=get_user())
    read_config(ctx)

    beanstalk = teuthology.beanstalk.connect()

    tube = ctx.worker
    beanstalk.use(tube)

    if ctx.show:
        for job_id in ctx.show:
            job = beanstalk.peek(job_id)
            if job is None and ctx.verbose:
                print 'job {jid} is not in the queue'.format(jid=job_id)
            else:
                print '--- job {jid} priority {prio} ---\n'.format(
                    jid=job_id,
                    prio=job.stats()['pri']), job.body
        return

    if ctx.delete:
        for job_id in ctx.delete:
            job = beanstalk.peek(job_id)
            if job is None:
                print 'job {jid} is not in the queue'.format(jid=job_id)
            else:
                job.delete()
                name = yaml.safe_load(job.body).get('name')
                if name:
                    report.try_delete_jobs(name, job_id)
        return

    # strip out targets; the worker will allocate new ones when we run
    # the job with --lock.
    if ctx.config.get('targets'):
        del ctx.config['targets']

    job_config = dict(
        name=ctx.name,
        last_in_suite=ctx.last_in_suite,
        email=ctx.email,
        description=ctx.description,
        owner=ctx.owner,
        verbose=ctx.verbose,
        machine_type=ctx.worker,
    )
    # Merge job_config and ctx.config
    job_config.update(ctx.config)
    if ctx.timeout is not None:
        job_config['results_timeout'] = ctx.timeout

    job = yaml.safe_dump(job_config)
    num = ctx.num
    while num > 0:
        jid = beanstalk.put(
            job,
            ttr=60 * 60 * 24,
            priority=ctx.priority,
        )
        print 'Job scheduled with name {name} and ID {jid}'.format(
            name=ctx.name, jid=jid)
        job_config['job_id'] = str(jid)
        report.try_push_job_info(job_config, dict(status='queued'))
        num -= 1
Example #13
0
def main(args):
    # run dispatcher in job supervisor mode if --supervisor passed
    if args["--supervisor"]:
        return supervisor.main(args)

    verbose = args["--verbose"]
    tube = args["--tube"]
    log_dir = args["--log-dir"]
    archive_dir = args["--archive-dir"]

    if archive_dir is None:
        archive_dir = teuth_config.archive_base

    # set up logging for the dispatcher in {log_dir}
    loglevel = logging.INFO
    if verbose:
        loglevel = logging.DEBUG
    log.setLevel(loglevel)
    log_file_path = os.path.join(log_dir, f"dispatcher.{tube}.{os.getpid()}")
    setup_log_file(log_file_path)
    install_except_hook()

    load_config(archive_dir=archive_dir)

    connection = beanstalk.connect()
    beanstalk.watch_tube(connection, tube)
    result_proc = None

    if teuth_config.teuthology_path is None:
        fetch_teuthology('master')
    fetch_qa_suite('master')

    keep_running = True
    while keep_running:
        # Check to see if we have a teuthology-results process hanging around
        # and if so, read its return code so that it can exit.
        if result_proc is not None and result_proc.poll() is not None:
            log.debug("teuthology-results exited with code: %s",
                      result_proc.returncode)
            result_proc = None

        if sentinel(restart_file_path):
            restart()
        elif sentinel(stop_file_path):
            stop()

        load_config()

        job = connection.reserve(timeout=60)
        if job is None:
            continue

        # bury the job so it won't be re-run if it fails
        job.bury()
        job_id = job.jid
        log.info('Reserved job %d', job_id)
        log.info('Config is: %s', job.body)
        job_config = yaml.safe_load(job.body)
        job_config['job_id'] = str(job_id)

        if job_config.get('stop_worker'):
            keep_running = False

        try:
            job_config, teuth_bin_path = prep_job(
                job_config,
                log_file_path,
                archive_dir,
            )
        except SkipJob:
            continue

        # lock machines but do not reimage them
        if 'roles' in job_config:
            job_config = lock_machines(job_config)

        run_args = [
            os.path.join(teuth_bin_path, 'teuthology-dispatcher'),
            '--supervisor',
            '-v',
            '--bin-path',
            teuth_bin_path,
            '--archive-dir',
            archive_dir,
        ]

        # Create the run archive directory (if it does not already exist) and
        # the job's archive directory
        create_job_archive(job_config['name'], job_config['archive_path'],
                           archive_dir)
        job_config_path = os.path.join(job_config['archive_path'],
                                       'orig.config.yaml')

        # Write initial job config in job archive dir
        with open(job_config_path, 'w') as f:
            yaml.safe_dump(job_config, f, default_flow_style=False)

        run_args.extend(["--job-config", job_config_path])

        try:
            job_proc = subprocess.Popen(run_args)
            log.info('Job supervisor PID: %s', job_proc.pid)
        except Exception:
            error_message = "Saw error while trying to spawn supervisor."
            log.exception(error_message)
            if 'targets' in job_config:
                nuke(supervisor.create_fake_context(job_config), True)
            report.try_push_job_info(
                job_config, dict(status='fail', failure_reason=error_message))

        # This try/except block is to keep the worker from dying when
        # beanstalkc throws a SocketError
        try:
            job.delete()
        except Exception:
            log.exception("Saw exception while trying to delete job")
Example #14
0
def lock_machines(ctx, config):
    """
    Lock machines.  Called when the teuthology run finds and locks
    new machines.  This is not called if one has already teuthology-locked
    machines and placed those keys in the targets section of a yaml file.
    """
    # It's OK for os_type and os_version to be None here.  If we're trying
    # to lock a bare metal machine, we'll take whatever is available.  If
    # we want a vps, defaults will be provided by misc.get_distro and
    # misc.get_distro_version in provision.create_if_vm
    os_type = ctx.config.get("os_type")
    os_version = ctx.config.get("os_version")
    arch = ctx.config.get('arch')
    log.info('Locking machines...')
    assert isinstance(config[0], int), 'config[0] must be an integer'
    machine_type = config[1]
    total_requested = config[0]
    # We want to make sure there are always this many machines available
    reserved = teuth_config.reserve_machines
    assert isinstance(reserved, int), 'reserve_machines must be integer'
    assert (reserved >= 0), 'reserve_machines should be >= 0'

    # change the status during the locking process
    report.try_push_job_info(ctx.config, dict(status='waiting'))

    all_locked = dict()
    requested = total_requested
    while True:
        # get a candidate list of machines
        machines = teuthology.lock.query.list_locks(machine_type=machine_type,
                                                    up=True,
                                                    locked=False,
                                                    count=requested + reserved)
        if machines is None:
            if ctx.block:
                log.error('Error listing machines, trying again')
                time.sleep(20)
                continue
            else:
                raise RuntimeError('Error listing machines')

        # make sure there are machines for non-automated jobs to run
        if len(machines) < reserved + requested and ctx.owner.startswith(
                'scheduled'):
            if ctx.block:
                log.info(
                    'waiting for more %s machines to be free (need %s + %s, have %s)...',
                    machine_type,
                    reserved,
                    requested,
                    len(machines),
                )
                time.sleep(10)
                continue
            else:
                assert 0, ('not enough machines free; need %s + %s, have %s' %
                           (reserved, requested, len(machines)))

        try:
            newly_locked = teuthology.lock.ops.lock_many(
                ctx, requested, machine_type, ctx.owner, ctx.archive, os_type,
                os_version, arch)
        except Exception:
            # Lock failures should map to the 'dead' status instead of 'fail'
            set_status(ctx.summary, 'dead')
            raise
        all_locked.update(newly_locked)
        log.info('{newly_locked} {mtype} machines locked this try, '
                 '{total_locked}/{total_requested} locked so far'.format(
                     newly_locked=len(newly_locked),
                     mtype=machine_type,
                     total_locked=len(all_locked),
                     total_requested=total_requested,
                 ))
        if len(all_locked) == total_requested:
            vmlist = []
            for lmach in all_locked:
                if teuthology.lock.query.is_vm(lmach):
                    vmlist.append(lmach)
            if vmlist:
                log.info('Waiting for virtual machines to come up')
                keys_dict = dict()
                loopcount = 0
                while len(keys_dict) != len(vmlist):
                    loopcount += 1
                    time.sleep(10)
                    keys_dict = misc.ssh_keyscan(vmlist)
                    log.info('virtual machine is still unavailable')
                    if loopcount == 40:
                        loopcount = 0
                        log.info('virtual machine(s) still not up, ' +
                                 'recreating unresponsive ones.')
                        for guest in vmlist:
                            if guest not in keys_dict.keys():
                                log.info('recreating: ' + guest)
                                full_name = misc.canonicalize_hostname(guest)
                                provision.destroy_if_vm(ctx, full_name)
                                provision.create_if_vm(ctx, full_name)
                if teuthology.lock.ops.do_update_keys(keys_dict)[0]:
                    log.info("Error in virtual machine keys")
                newscandict = {}
                for dkey in all_locked.keys():
                    stats = teuthology.lock.query.get_status(dkey)
                    newscandict[dkey] = stats['ssh_pub_key']
                ctx.config['targets'] = newscandict
            else:
                ctx.config['targets'] = all_locked
            locked_targets = yaml.safe_dump(
                ctx.config['targets'], default_flow_style=False).splitlines()
            log.info('\n  '.join([
                'Locked targets:',
            ] + locked_targets))
            # successfully locked machines, change status back to running
            report.try_push_job_info(ctx.config, dict(status='running'))
            break
        elif not ctx.block:
            assert 0, 'not enough machines are available'
        else:
            requested = requested - len(newly_locked)
            assert requested > 0, "lock_machines: requested counter went " \
                                  "negative, this shouldn't happen"

        log.info(
            "{total} machines locked ({new} new); need {more} more".format(
                total=len(all_locked), new=len(newly_locked), more=requested))
        log.warning('Could not lock enough machines, waiting...')
        time.sleep(10)
    try:
        yield
    finally:
        # If both unlock_on_failure and nuke-on-error are set, don't unlock now
        # because we're just going to nuke (and unlock) later.
        unlock_on_failure = (ctx.config.get('unlock_on_failure', False)
                             and not ctx.config.get('nuke-on-error', False))
        if get_status(ctx.summary) == 'pass' or unlock_on_failure:
            log.info('Unlocking machines...')
            for machine in ctx.config['targets'].keys():
                teuthology.lock.ops.unlock_one(ctx, machine, ctx.owner,
                                               ctx.archive)
Example #15
0
def lock_machines(ctx, config):
    """
    Lock machines.  Called when the teuthology run finds and locks
    new machines.  This is not called if one has already teuthology-locked
    machines and placed those keys in the targets section of a yaml file.
    """
    # It's OK for os_type and os_version to be None here.  If we're trying
    # to lock a bare metal machine, we'll take whatever is available.  If
    # we want a vps, defaults will be provided by misc.get_distro and
    # misc.get_distro_version in provision.create_if_vm
    os_type = ctx.config.get("os_type")
    os_version = ctx.config.get("os_version")
    arch = ctx.config.get('arch')
    log.info('Locking machines...')
    assert isinstance(config[0], int), 'config[0] must be an integer'
    machine_type = config[1]
    total_requested = config[0]
    # We want to make sure there are always this many machines available
    reserved = teuth_config.reserve_machines
    assert isinstance(reserved, int), 'reserve_machines must be integer'
    assert (reserved >= 0), 'reserve_machines should be >= 0'

    # change the status during the locking process
    report.try_push_job_info(ctx.config, dict(status='waiting'))

    all_locked = dict()
    requested = total_requested
    while True:
        # get a candidate list of machines
        machines = teuthology.lock.query.list_locks(machine_type=machine_type, up=True,
                                                    locked=False, count=requested + reserved)
        if machines is None:
            if ctx.block:
                log.error('Error listing machines, trying again')
                time.sleep(20)
                continue
            else:
                raise RuntimeError('Error listing machines')

        # make sure there are machines for non-automated jobs to run
        if len(machines) < reserved + requested and ctx.owner.startswith('scheduled'):
            if ctx.block:
                log.info(
                    'waiting for more %s machines to be free (need %s + %s, have %s)...',
                    machine_type,
                    reserved,
                    requested,
                    len(machines),
                )
                time.sleep(10)
                continue
            else:
                assert 0, ('not enough machines free; need %s + %s, have %s' %
                           (reserved, requested, len(machines)))

        try:
            newly_locked = teuthology.lock.ops.lock_many(ctx, requested, machine_type,
                                                         ctx.owner, ctx.archive, os_type,
                                                         os_version, arch)
        except Exception:
            # Lock failures should map to the 'dead' status instead of 'fail'
            set_status(ctx.summary, 'dead')
            raise
        all_locked.update(newly_locked)
        log.info(
            '{newly_locked} {mtype} machines locked this try, '
            '{total_locked}/{total_requested} locked so far'.format(
                newly_locked=len(newly_locked),
                mtype=machine_type,
                total_locked=len(all_locked),
                total_requested=total_requested,
            )
        )
        if len(all_locked) == total_requested:
            vmlist = []
            for lmach in all_locked:
                if teuthology.lock.query.is_vm(lmach):
                    vmlist.append(lmach)
            if vmlist:
                log.info('Waiting for virtual machines to come up')
                keys_dict = dict()
                loopcount = 0
                while len(keys_dict) != len(vmlist):
                    loopcount += 1
                    time.sleep(10)
                    keys_dict = misc.ssh_keyscan(vmlist)
                    log.info('virtual machine is still unavailable')
                    if loopcount == 40:
                        loopcount = 0
                        log.info('virtual machine(s) still not up, ' +
                                 'recreating unresponsive ones.')
                        for guest in vmlist:
                            if guest not in keys_dict.keys():
                                log.info('recreating: ' + guest)
                                full_name = misc.canonicalize_hostname(guest)
                                provision.destroy_if_vm(ctx, full_name)
                                provision.create_if_vm(ctx, full_name)
                if teuthology.lock.keys.do_update_keys(keys_dict)[0]:
                    log.info("Error in virtual machine keys")
                newscandict = {}
                for dkey in all_locked.iterkeys():
                    stats = teuthology.lock.query.get_status(dkey)
                    newscandict[dkey] = stats['ssh_pub_key']
                ctx.config['targets'] = newscandict
            else:
                ctx.config['targets'] = all_locked
            locked_targets = yaml.safe_dump(
                ctx.config['targets'],
                default_flow_style=False
            ).splitlines()
            log.info('\n  '.join(['Locked targets:', ] + locked_targets))
            # successfully locked machines, change status back to running
            report.try_push_job_info(ctx.config, dict(status='running'))
            break
        elif not ctx.block:
            assert 0, 'not enough machines are available'
        else:
            requested = requested - len(newly_locked)
            assert requested > 0, "lock_machines: requested counter went " \
                                  "negative, this shouldn't happen"

        log.info(
            "{total} machines locked ({new} new); need {more} more".format(
                total=len(all_locked), new=len(newly_locked), more=requested)
        )
        log.warning('Could not lock enough machines, waiting...')
        time.sleep(10)
    try:
        yield
    finally:
        # If both unlock_on_failure and nuke-on-error are set, don't unlock now
        # because we're just going to nuke (and unlock) later.
        unlock_on_failure = (
            ctx.config.get('unlock_on_failure', False)
            and not ctx.config.get('nuke-on-error', False)
        )
        if get_status(ctx.summary) == 'pass' or unlock_on_failure:
            log.info('Unlocking machines...')
            for machine in ctx.config['targets'].iterkeys():
                teuthology.lock.ops.unlock_one(ctx, machine, ctx.owner, ctx.archive)
Example #16
0
def main(ctx):
    if ctx.owner is None:
        ctx.owner = 'scheduled_{user}'.format(user=get_user())
    read_config(ctx)

    beanstalk = teuthology.beanstalk.connect()

    tube = ctx.worker
    beanstalk.use(tube)

    if ctx.show:
        for job_id in ctx.show:
            job = beanstalk.peek(job_id)
            if job is None and ctx.verbose:
                print 'job {jid} is not in the queue'.format(jid=job_id)
            else:
                print '--- job {jid} priority {prio} ---\n'.format(
                    jid=job_id, prio=job.stats()['pri']), job.body
        return

    if ctx.delete:
        for job_id in ctx.delete:
            job = beanstalk.peek(job_id)
            if job is None:
                print 'job {jid} is not in the queue'.format(jid=job_id)
            else:
                job.delete()
                name = yaml.safe_load(job.body).get('name')
                if name:
                    report.try_delete_jobs(name, job_id)
        return

    # strip out targets; the worker will allocate new ones when we run
    # the job with --lock.
    if ctx.config.get('targets'):
        del ctx.config['targets']

    job_config = dict(
        name=ctx.name,
        last_in_suite=ctx.last_in_suite,
        email=ctx.email,
        description=ctx.description,
        owner=ctx.owner,
        verbose=ctx.verbose,
        machine_type=ctx.worker,
    )
    # Merge job_config and ctx.config
    job_config.update(ctx.config)
    if ctx.timeout is not None:
        job_config['results_timeout'] = ctx.timeout

    job = yaml.safe_dump(job_config)
    num = ctx.num
    while num > 0:
        jid = beanstalk.put(
            job,
            ttr=60 * 60 * 24,
            priority=ctx.priority,
        )
        print 'Job scheduled with name {name} and ID {jid}'.format(
            name=ctx.name, jid=jid)
        job_config['job_id'] = str(jid)
        report.try_push_job_info(job_config, dict(status='queued'))
        num -= 1