Example #1
  async def remove(self, param_ids, hash_ids):
    """Remove job output data that match given IDs."""
    async with self.lock:
      trash_dir = os.path.join(self.path, 'trash')
      if not os.path.exists(trash_dir):
        os.mkdir(trash_dir)

      queue_state = await self.queue.get_state()

      # Keep symlinks related to started/queued jobs
      jobs = queue_state['started_jobs'] + queue_state['queued_jobs']

      param_ids = set(param_ids)
      param_ids -= set([get_param_id(job['param']) for job in jobs])
      param_ids -= set([get_param_id(job['param']) + '_tmp' for job in jobs])

      hash_ids = set(hash_ids)
      hash_ids -= set([get_hash_id(job['param']) for job in jobs])
      hash_ids -= set([get_hash_id(job['param']) + '_tmp' for job in jobs])

      removed_output = self._remove_job_output(trash_dir, param_ids, hash_ids)
      removed_output += self._remove_dangling_noref(trash_dir)
      # Second pass so that symlinks (e.g. "last") left dangling by the first pass are also removed
      removed_output += self._remove_dangling_noref(trash_dir)
      return removed_output
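The set subtraction above is what protects outputs belonging to started or queued jobs: only IDs that survive it reach _remove_job_output. A minimal, self-contained sketch of that step, using made-up ID strings in place of real get_param_id() results:

requested_param_ids = {'p-0001', 'p-0002', 'p-0003'}
live_param_ids = {'p-0002'}  # param IDs of started/queued jobs

# Protect both the final and the temporary ('_tmp') variants of live IDs
protected = live_param_ids | {i + '_tmp' for i in live_param_ids}
removable = requested_param_ids - protected
print(sorted(removable))  # ['p-0001', 'p-0003']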
Example #2
  def _get_state_fast(self):
    """Return the state. Parameter details are removed."""
    assert self.lock.locked()

    state = {
      'finished_jobs': None,
      'started_jobs': None,
      'queued_jobs': None,
      'concurrency': self._state['concurrency'],
      'next_job_id': self._state['next_job_id'],
    }

    for key in ['finished_jobs', 'started_jobs', 'queued_jobs']:
      jobs = []
      for job in self._state[key].values():
        job = dict(job)
        param = job['param']
        job['param'] = {
          '_': {
            'param_id': get_param_id(param),
            'hash_id': get_hash_id(param),
          },
          '_name': get_name(param),
        }
        jobs.append(job)
      state[key] = jobs

    return state
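Each job record in the returned state keeps its original fields, but 'param' is collapsed to the identifying metadata shown above. A hypothetical trimmed entry (all values made up) would look like this:

trimmed_job = {
    'job_id': 'j-42',  # other job fields are copied through unchanged
    'param': {
        '_': {
            'param_id': 'p-0001',
            'hash_id': 'h-9f2c',
        },
        '_name': 'train lr=0.1',
    },
}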
Example #3
    async def filter_augment(self, params):
        """Augment parameters with their history."""
        hash_ids = [get_hash_id(param) for param in params]

        history_list = await self.history.history_list(hash_ids)

        for param, history in zip(params, history_list):
            param['_'].update(history)

        return params
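The update above merges the per-parameter history into the '_' metadata dictionary in place. A small sketch of the effect on a single parameter; the history field names ('succeeded', 'duration') follow the other examples here, and the values are made up:

param = {'_': {'hash_id': 'h-9f2c'}, 'lr': 0.1}
history = {'succeeded': True, 'duration': 12.5}

param['_'].update(history)
# param == {'_': {'hash_id': 'h-9f2c', 'succeeded': True, 'duration': 12.5}, 'lr': 0.1}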
Example #4
  async def _paramset_migrate(self, old_paramset, new_paramset):
    old_param_ids = await self.client.registry.paramset(old_paramset)
    new_param_ids = await self.client.registry.paramset(new_paramset)

    old_hash_ids = [get_hash_id(param) for param
                    in await self.client.registry.params(old_param_ids)]
    new_hash_ids = [get_hash_id(param) for param
                    in await self.client.registry.params(new_param_ids)]

    if len(old_param_ids) != len(new_param_ids):
      raise RuntimeError('Two parameter sets must have the same number of parameters')

    param_id_pairs = list(zip(old_param_ids, new_param_ids))
    hash_id_pairs = list(zip(old_hash_ids, new_hash_ids))

    # Keep the runner (output) migration result separate from the history migration result
    migrated_param_id_pairs, migrated_hash_id_pairs = (
      await self.client.output.migrate(param_id_pairs, hash_id_pairs))
    migrated_history_hash_id_pairs = await self.client.history.migrate(hash_id_pairs)
    print('Migrated: ' +
          f'{len(migrated_param_id_pairs) + len(migrated_hash_id_pairs)} runner data, ' +
          f'{len(migrated_history_hash_id_pairs)} history data')
Example #5
    async def update(self, job):
        """Record the job execution result."""
        hash_id = get_hash_id(job['param'])

        hist_data = {}
        for key in HISTORY_STUB:
            hist_data[key] = job[key]

        async with self.lock:
            self._state[hash_id] = hist_data

            self.logger.info(f'Updated history for parameter {hash_id}')
            self.lock.notify_all()
            self._schedule_dump()
Example #6
  def _construct_env(job, job_paths):
    """Construct environment variables."""
    param = job['param']

    env = dict(os.environ)

    env['EXPTOOLS_JOB_DIR'] = job_paths['job_dir']
    env['EXPTOOLS_JOB_ID'] = job['job_id']
    env['EXPTOOLS_PARAM_ID'] = get_param_id(param)
    env['EXPTOOLS_HASH_ID'] = get_hash_id(param)
    env['EXPTOOLS_NAME'] = get_name(param)
    env['EXPTOOLS_CWD'] = get_cwd(param) or os.getcwd()
    env['EXPTOOLS_RETRY'] = str(get_retry(param))
    env['EXPTOOLS_RETRY_DELAY'] = str(get_retry_delay(param))
    env['EXPTOOLS_TIME_LIMIT'] = str(get_time_limit(param))

    env['EXPTOOLS_JOB_JSON_PATH'] = job_paths['job.json']
    env['EXPTOOLS_PARAM_JSON_PATH'] = job_paths['param.json']
    env['EXPTOOLS_RESOURCES_JSON_PATH'] = job_paths['resources.json']

    env['EXPTOOLS_STATUS_JSON_PATH'] = job_paths['status.json']
    return env
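These variables are what the job's command sees at run time. A sketch of a hypothetical job script consuming them, assuming param.json holds the job's parameter dictionary:

#!/usr/bin/env python3
"""Hypothetical job script reading the variables set by _construct_env."""
import json
import os

job_id = os.environ['EXPTOOLS_JOB_ID']
param_id = os.environ['EXPTOOLS_PARAM_ID']
hash_id = os.environ['EXPTOOLS_HASH_ID']

with open(os.environ['EXPTOOLS_PARAM_JSON_PATH']) as f:
    param = json.load(f)

print(f'{job_id}: running {param_id} ({hash_id}) with {param}')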
Example #7
    async def _remove(self, paramset, param_ids):
        """Remove parameters from a parameter set."""
        assert self.lock.locked()

        if paramset not in self._state['paramsets']:
            raise RuntimeError(f'Parameter set does not exist: {paramset}')

        # Ensure not to delete parameters in a different parameter set
        paramset_param_ids = set(self._state['paramsets'][paramset])
        for param_id in param_ids:
            if param_id not in paramset_param_ids:
                raise RuntimeError(
                    f'Parameter {param_id} is not in {paramset}')

        for param_id in param_ids:
            param = self._state['params'][param_id]

            hash_id = get_hash_id(param)
            # list.remove() raises ValueError if the ID is missing, so no sentinel check is needed
            self._hash_id_index[hash_id].remove(param_id)
            if not self._hash_id_index[hash_id]:
                del self._hash_id_index[hash_id]

            del self._state['params'][param_id]

        param_ids_set = set(param_ids)
        self._state['paramsets'][paramset] = [
            param_id for param_id in self._state['paramsets'][paramset]
            if param_id not in param_ids_set
        ]

        self.logger.info(
            f'Removed {len(param_ids)} parameters from {paramset}')
        self.lock.notify_all()
        self._schedule_dump()
        return param_ids
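The _hash_id_index bookkeeping above maps each hash ID to the list of param IDs that share it; an entry disappears only when its last param ID is removed. A worked example with made-up IDs:

hash_id_index = {'h-9f2c': ['p-0001', 'p-0002']}

hash_id_index['h-9f2c'].remove('p-0001')
assert hash_id_index == {'h-9f2c': ['p-0002']}

hash_id_index['h-9f2c'].remove('p-0002')
if not hash_id_index['h-9f2c']:
    del hash_id_index['h-9f2c']
assert hash_id_index == {}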
Example #8
    async def _make_fid_mapping(self, history_map, hash_ids):
        """Make a value to feature index mapping,
    and make a feature vector list for existig parameters."""
        param_ids_list = await self.registry.param_ids_by_hash_ids(hash_ids)
        param_ids = [param_ids[0] for param_ids in param_ids_list if param_ids]
        params = await self.registry.params(param_ids)

        fid_mapping = {}
        fvec_map = {}

        succeeded_params = []
        for param in params:
            hash_id = get_hash_id(param)

            assert hash_id in history_map
            assert history_map[hash_id]['succeeded']
            assert history_map[hash_id]['duration'] is not None

            for key, value in param.items():
                if key.startswith('_'):
                    continue
                if isinstance(value, dict):
                    continue

                value_hash = hash(key + '___' + str(value))
                if value_hash not in fid_mapping:
                    fid_mapping[value_hash] = len(fid_mapping)

            succeeded_params.append((hash_id, param))

        for hash_id, param in succeeded_params:
            fvec_map[hash_id] = self._extract_fvec(fid_mapping, param)

        return fid_mapping, fvec_map
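A small worked example of the feature-index mapping built above, with made-up parameters: keys starting with '_' and dict-valued keys are skipped, and each distinct key/value pair receives the next feature index in order of first appearance.

params = [
    {'_name': 'a', 'lr': 0.1, 'model': 'resnet'},
    {'_name': 'b', 'lr': 0.1, 'model': 'vgg'},
]

fid_mapping = {}
for param in params:
    for key, value in param.items():
        if key.startswith('_') or isinstance(value, dict):
            continue
        value_hash = hash(key + '___' + str(value))
        if value_hash not in fid_mapping:
            fid_mapping[value_hash] = len(fid_mapping)

# Three distinct (key, value) pairs -> feature indices 0, 1, 2
assert sorted(fid_mapping.values()) == [0, 1, 2]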
Example #9
    async def get_hash_ids(self, params):
        """Return hash IDs."""
        return [get_hash_id(param) for param in params]
Example #10
    async def filter_omit(self, params, types):
        """Omit parameters of specified types"""
        valid_types = [
            'succeeded', 'failed', 'finished', 'started', 'queued',
            'identical', 'duplicate', 'has_output'
        ]
        for type_ in types:
            assert type_ in valid_types

        hash_ids = [get_hash_id(param) for param in params]

        if 'succeeded' in types:
            params = [
                param for param in params if param['_']['succeeded'] is not True
            ]

        if 'failed' in types:
            params = [
                param for param in params if param['_']['succeeded'] is not False
            ]

        if 'finished' in types:
            params = [
                param for param in params if param['_']['finished'] is None
            ]

        if 'started' in types:
            started_jobs = await self.queue.jobs(await self.queue.job_ids(
                ['started']))
            started_job_hash_ids = set(
                [get_hash_id(job['param']) for job in started_jobs])

            new_params = []
            for param, hash_id in zip(params, hash_ids):
                if hash_id in started_job_hash_ids:
                    continue
                new_params.append(param)
            params = new_params

        if 'queued' in types:
            queued_jobs = await self.queue.jobs(
                await self.queue.job_ids(['queued']))
            queued_job_hash_ids = set(
                [get_hash_id(job['param']) for job in queued_jobs])

            new_params = []
            for param, hash_id in zip(params, hash_ids):
                if hash_id in queued_job_hash_ids:
                    continue
                new_params.append(param)
            params = new_params

        if 'identical' in types:
            seen_unique_ids = set()
            new_params = []
            for param in params:
                unique_id = make_unique_id(param)
                if unique_id not in seen_unique_ids:
                    seen_unique_ids.add(unique_id)
                    new_params.append(param)
            params = new_params

        if 'duplicate' in types:
            seen_hash_ids = set()
            new_params = []
            for param in params:
                hash_id = get_hash_id(param)
                if hash_id not in seen_hash_ids:
                    seen_hash_ids.add(hash_id)
                    new_params.append(param)
            params = new_params

        if 'has_output' in types:
            output_hash_ids = await self.runner.hash_ids()
            new_params = []
            for param in params:
                hash_id = get_hash_id(param)
                if hash_id not in output_hash_ids:
                    new_params.append(param)
            params = new_params

        return params
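A hypothetical usage sketch, assuming `flt` is an instance of the class above and `params` came from the registry: drop parameters whose jobs already finished, then collapse entries that share a hash ID.

async def pending_unique(flt, params):
    return await flt.filter_omit(params, ['finished', 'duplicate'])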
Example #11
    async def estimate_remaining_time(self, state, oneshot, use_similar):
        """Estimate the remaining time using the queue state."""

        now = utcnow()

        epsilon = 0.001  # Potential underestimation until the progress reaches 0.1%

        # Future parallelism cannot be higher than the remaining job count
        concurrency = max(
            1.,
            min(state['concurrency'],
                len(state['started_jobs']) + len(state['queued_jobs'])))

        hash_ids = await self.history.hash_ids()
        history_list = await self.history.history_list(hash_ids)
        history_map = dict(zip(hash_ids, history_list))

        if use_similar:
            fid_mapping, fvec_map = await self._make_fid_mapping(
                history_map, hash_ids)

        # Estimate average per-job duration
        known_hash_ids = set()
        known_duration = 0.
        known_count = 0

        # Consider recent jobs first (in case some jobs have duplicate hash_id)
        for job in reversed(state['started_jobs']):
            hash_id = get_hash_id(job['param'])
            if hash_id in known_hash_ids:
                continue
            known_hash_ids.add(hash_id)

            if job['started'] is None:
                started = now
            else:
                started = parse_utc(job['started'])

            if job.get('status') and job['status'].get('progress', 0) >= epsilon:
                known_duration += diff_sec(now,
                                           started) / job['status']['progress']
                known_count += 1

        for hash_id, history in history_map.items():
            if hash_id in known_hash_ids:
                continue
            known_hash_ids.add(hash_id)

            if history['duration'] is not None:
                known_duration += history['duration']
                known_count += 1

        avg_duration = known_duration / max(known_count, 1)

        remaining_time_map = {}

        for job in state['finished_jobs']:
            remaining_time_map[job['job_id']] = 0.

        # Calculate started jobs' remaining time
        remaining_duration = 0.
        for job in state['started_jobs']:
            hash_id = get_hash_id(job['param'])
            history = history_map.get(hash_id, None)

            if job['started'] is None:
                started = now
            else:
                started = parse_utc(job['started'])

            if job.get('status') and job['status'].get('progress', 0) >= epsilon:
                exp_duration = diff_sec(now,
                                        started) / job['status']['progress']
                remaining_duration += max(
                    exp_duration - diff_sec(now, started), 0.)
            elif history and history['duration'] is not None:
                remaining_duration += max(
                    history['duration'] - diff_sec(now, started), 0.)
            else:
                if use_similar:
                    exp_duration = (await self._find_closest_duration(
                        history_map, fid_mapping, fvec_map, job['param']))
                    if exp_duration is None:
                        exp_duration = avg_duration
                else:
                    exp_duration = avg_duration

                remaining_duration += max(
                    exp_duration - diff_sec(now, started), 0.)

            # Take into account concurrency
            remaining_time_map[
                job['job_id']] = remaining_duration / concurrency

        # Calculate queued jobs' remaining time
        if not oneshot:
            for job in state['queued_jobs']:
                hash_id = get_hash_id(job['param'])
                history = history_map.get(hash_id, None)

                if history and history['duration'] is not None:
                    remaining_duration += history['duration']
                else:
                    if use_similar:
                        exp_duration = (await self._find_closest_duration(
                            history_map, fid_mapping, fvec_map, job['param']))
                        if exp_duration is None:
                            exp_duration = avg_duration
                    else:
                        exp_duration = avg_duration
                    remaining_duration += exp_duration

                # Take into account concurrency
                remaining_time_map[
                    job['job_id']] = remaining_duration / concurrency
        else:
            for job in state['queued_jobs']:
                remaining_time_map[
                    job['job_id']] = remaining_duration / concurrency

        # Take into account concurrency
        remaining_time = remaining_duration / concurrency

        return remaining_time, remaining_time_map
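The progress-based branch extrapolates a started job's total duration from its reported progress. A quick numeric check with made-up values:

elapsed = 30.0    # diff_sec(now, started)
progress = 0.25   # job['status']['progress']

exp_duration = elapsed / progress            # 120.0 s expected in total
remaining = max(exp_duration - elapsed, 0.)  # 90.0 s still to go
assert (exp_duration, remaining) == (120.0, 90.0)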
Example #12
  async def _handle_status(self):
    """show the queue state"""
    limit = self.args.limit
    use_similar = self.args.similar

    if self.args.job_types == 'all':
      job_types = {'finished', 'started', 'queued'}
    else:
      job_types = set(self.args.job_types)

    estimator = Estimator(self.client.registry, self.client.history)
    use_color = self.common_args.color == 'yes'

    if use_color:
      colored = termcolor.colored
    else:
      def colored(s, *args, **kwargs):  # pylint: disable=invalid-name,unused-argument
        """Use no color."""
        return s

    async for queue_state in self._get_queue_state():
      oneshot = await self.client.scheduler.is_oneshot()

      output = ''

      all_jobs = (queue_state['finished_jobs'] +
                  queue_state['started_jobs'] +
                  queue_state['queued_jobs'])
      all_params = [job['param'] for job in all_jobs]

      job_id_max_len = self._get_job_id_max_len(all_jobs)
      param_id_max_len = self._get_param_id_max_len(all_params)

      if 'finished' in job_types:
        succeeded_count = len([job for job in queue_state['finished_jobs'] if job['succeeded']])
        failed_count = len(queue_state['finished_jobs']) - succeeded_count
        finished_jobs_color = 'red' if failed_count else 'green'
        output += colored(
          f"Finished jobs (S:{succeeded_count} / F:{failed_count})",
          finished_jobs_color, attrs=['reverse']) + '\n'

        if limit and len(queue_state['finished_jobs']) > limit:
          line = colored('  ', finished_jobs_color, attrs=['reverse'])
          output += line + ' ...\n'

        jobs = queue_state['finished_jobs']
        if limit:
          jobs = jobs[-limit:]

        for job in jobs:
          if job['succeeded']:
            line = colored('  ', 'green', attrs=['reverse'])
          else:
            line = colored('  ', 'red', attrs=['reverse'])

          param_id = get_param_id(job['param'])
          hash_id = get_hash_id(job['param'])
          name = get_name(job['param'])

          line += f" {job['job_id']:{job_id_max_len}} {param_id:{param_id_max_len}} {hash_id}"
          line += f' [{format_sec_short(job_elapsed_time(job)):>7}]'
          if job['succeeded']:
            line += ' succeeded  '
          else:
            line += ' FAILED     '
          line += f"{name}"
          output += line + '\n'

        output += '\n'

      remaining_time, rem_map = (
        await estimator.estimate_remaining_time(queue_state, False, use_similar))
      last_rem = 0.

      if 'started' in job_types:
        output += colored(
          f"Started jobs (A:{len(queue_state['started_jobs'])})",
          'cyan', attrs=['reverse']) + '\n'

        if limit and len(queue_state['started_jobs']) > limit:
          line = colored('  ', 'cyan', attrs=['reverse'])
          output += line + ' ...\n'

        jobs = queue_state['started_jobs']
        if limit:
          jobs = jobs[-limit:]

        for job in jobs:
          rem = rem_map[job['job_id']]

          param_id = get_param_id(job['param'])
          hash_id = get_hash_id(job['param'])
          name = get_name(job['param'])

          line = colored('  ', 'cyan', attrs=['reverse'])
          line += f" {job['job_id']:{job_id_max_len}} {param_id:{param_id_max_len}} {hash_id}"
          line += f' [{format_sec_short(job_elapsed_time(job)):>7}]'
          line += f'+[{format_sec_short(max(rem - last_rem, 0)):>7}]'
          line += '  '
          last_rem = rem
          line += f"{name}"
          output += line + '\n'

        output += '\n'

      if 'queued' in job_types:
        output += colored(
          f"Queued jobs (Q:{len(queue_state['queued_jobs'])})",
          'blue', attrs=['reverse']) + '  '

        output += 'Scheduler: '
        if oneshot:
          output += colored('Oneshot', 'blue')
        elif await self.client.scheduler.is_running():
          output += colored('Running', 'cyan')
        else:
          output += colored('Stopped', 'red')
        output += '\n'

        jobs = queue_state['queued_jobs']
        if limit:
          jobs = jobs[:limit]

        for job in jobs:
          rem = rem_map[job['job_id']]

          param_id = get_param_id(job['param'])
          hash_id = get_hash_id(job['param'])
          name = get_name(job['param'])

          line = colored('  ', 'blue', attrs=['reverse'])
          line += f" {job['job_id']:{job_id_max_len}} {param_id:{param_id_max_len}} {hash_id}"
          line += f'           [{format_sec_short(max(rem - last_rem, 0)):>7}]'
          line += '  '
          last_rem = rem
          line += f"{name}"
          output += line + '\n'

        if limit and len(queue_state['queued_jobs']) > limit:
          line = colored('  ', 'blue', attrs=['reverse'])
          output += line + ' ...\n'

        output += '\n'

      # output += f"Concurrency: {queue_state['concurrency']}"

      if oneshot:
        remaining_time, _ = await estimator.estimate_remaining_time(queue_state, True, use_similar)
      # Otherwise reuse remaining_time computed above
      output += await format_estimated_time(remaining_time, queue_state, use_color) + '\n'

      if self.args.clear_screen:
        os.system('clear')
      print(output)

      if (self.args.stop_empty and
          not queue_state['started_jobs'] and (oneshot or not queue_state['queued_jobs'])):
        break
Example #13
  async def _try(self, job, job_id, param, current_retry):
    """Run a job."""

    param_id = get_param_id(param)
    hash_id = get_hash_id(param)

    name = get_name(param)
    expanded_command = [arg.format(**param) for arg in get_command(param)]
    cwd = get_cwd(param) or os.getcwd()
    time_limit = get_time_limit(param)

    succeeded = False

    try:
      self.logger.info(f'Launching job {job_id}: {name}')

      job_paths = await self.output.make_job_directory(job, current_retry)
      job_paths = await self.output.create_job_files(job, job_paths)

      env = self._construct_env(job, job_paths)

      with self.output.open_job_stdio(job_paths) as stdio:
        stdout, stderr = stdio

        await self.output.make_tmp_symlinks(param_id, hash_id, job_paths)

        # Launch process
        proc = await asyncio.create_subprocess_exec(
          *expanded_command,
          cwd=cwd,
          stdin=asyncio.subprocess.DEVNULL,
          stdout=stdout,
          stderr=stderr,
          env=env,
          loop=self.loop)

        await self.queue.set_pid(job_id, proc.pid)

        # Watch status changes
        status_task = asyncio.ensure_future(
          self._watch_status(job_id, job_paths), loop=self.loop)

        try:
          if time_limit <= 0:
            await proc.communicate()
          else:
            await asyncio.wait_for(proc.communicate(), time_limit, loop=self.loop)

        except asyncio.TimeoutError:
          self.logger.error(f'Timeout while waiting for job {job_id}')

        finally:
          status_task.cancel()
          try:
            await status_task
          except concurrent.futures.CancelledError:
            # Ignore CancelledError because we caused it
            pass

          if proc.returncode is None:
            try:
              proc.send_signal(signal.SIGTERM)
            except Exception:  # pylint: disable=broad-except
              self.logger.exception('Exception while killing process')

            try:
              await asyncio.wait_for(proc.wait(), 10, loop=self.loop)
            except Exception:  # pylint: disable=broad-except
              self.logger.exception('Exception while waiting for process')

          if proc.returncode is None:
            try:
              proc.send_signal(signal.SIGKILL)
            except Exception:  # pylint: disable=broad-except
              self.logger.exception('Exception while killing process')

            try:
              await proc.wait()
            except Exception:  # pylint: disable=broad-except
              self.logger.exception('Exception while waiting for process')

      # Read status before making the job finished
      await self._read_status(job_id, job_paths)

      if proc.returncode == 0:
        await self.output.make_symlinks(param_id, hash_id, job_paths)

        succeeded = True

    except concurrent.futures.CancelledError:
      # Pass through
      raise

    except Exception:  # pylint: disable=broad-except
      self.logger.exception(f'Exception while running job {job_id}')

    finally:
      await self.output.remove_tmp_symlinks(param_id, hash_id)

    return succeeded