예제 #1
0
def handle_restarted_experiment(experiment):
    """Seed a restarted experiment with the outputs of its original experiment.

    A persisted build-phase log line announcing the copy is published, then
    the outputs of ``experiment.original_experiment`` are copied into
    ``experiment``. An ``OSError`` raised along the way is not propagated:
    the failure is published as a log line and reported as a logger warning.
    """

    def _render(template):
        # Fill the template with the (source, destination) experiment names.
        return template.format(experiment.original_experiment.unique_name,
                               experiment.unique_name)

    def _publish(message):
        # Every line published from here shares the same routing metadata.
        publisher.publish_log(
            log_line=message,
            status=ExperimentLifeCycle.BUILDING,
            experiment_uuid=experiment.uuid.hex,
            experiment_name=experiment.unique_name,
            job_uuid='all',
            persist=True)

    try:
        _publish(_render(
            'Copying outputs from experiment `{}` into experiment `{}`'))
        copy_experiment_outputs(experiment.original_experiment.unique_name,
                                experiment.unique_name)
    except OSError:
        failure = _render(
            'Could not copy the outputs of experiment `{}` into experiment `{}`')
        _publish(failure)
        logger.warning(failure)
예제 #2
0
 def _handle_logs(self, log_line):
     """Publish ``log_line`` as a BUILDING-status log entry for this experiment."""
     # The entry is addressed to the whole experiment (job_uuid='all').
     payload = dict(
         log_line=log_line,
         status=ExperimentLifeCycle.BUILDING,
         experiment_uuid=self.experiment_uuid,
         experiment_name=self.experiment_name,
         job_uuid='all',
     )
     publisher.publish_log(**payload)
예제 #3
0
 def _handle_logs(self, log_line):
     """Forward ``log_line`` to the publisher with status BUILDING, unpersisted."""
     publish_kwargs = {
         'log_line': log_line,
         'status': ExperimentLifeCycle.BUILDING,
         'experiment_uuid': self.experiment_uuid,
         'experiment_name': self.experiment_name,
         'job_uuid': 'all',
         'persist': False,  # TODO: ADD log persistence
     }
     publisher.publish_log(**publish_kwargs)
예제 #4
0
    def build(self, memory_limit=None):
        """Build the docker image for this experiment while streaming its logs.

        The repo is checked out at ``self.image_tag``, the rendered Dockerfile
        is written to ``self.dockerfile_path`` and the image is built and
        tagged ``<image_name>:<image_tag>``. Each build log line is published
        with status ``BUILDING``. Roughly every ``self.CHECK_INTERVAL`` log
        lines the experiment is checked, and the build is abandoned if it is
        no longer running.

        The repo checkout is always restored afterwards, including when the
        build is abandoned early or an exception is raised.

        Args:
            memory_limit: optional memory limit forwarded to docker through
                ``container_limits``; left out when falsy.

        Returns:
            ``True`` when the build stream was fully consumed, ``False`` when
            the build was abandoned because the experiment stopped running.
        """
        # Checkout to the correct commit
        git.checkout_commit(repo_path=self.repo_path, commit=self.image_tag)

        try:
            limits = {
                # Always disable memory swap for building, since mostly
                # nothing good can come of that.
                'memswap': -1
            }
            if memory_limit:
                limits['memory'] = memory_limit

            # Create DockerFile
            with open(self.dockerfile_path, 'w') as dockerfile:
                dockerfile.write(self.render())

            self.connect()
            check_pulse = 0
            for log_line in self.docker.build(
                    path=self.build_path,
                    tag='{}:{}'.format(self.image_name, self.image_tag),
                    buildargs={},
                    decode=True,
                    forcerm=True,
                    rm=True,
                    pull=True,
                    nocache=False,
                    container_limits=limits,
                    stream=True,
            ):
                check_pulse += 1
                publisher.publish_log(
                    log_line=log_line,
                    status=ExperimentLifeCycle.BUILDING,
                    experiment_uuid=self.experiment_uuid,
                    experiment_name=self.experiment_name,
                    job_uuid='all',
                    persist=False  # TODO: ADD log persistence
                )
                # Check if experiment is not stopped in the meanwhile
                if check_pulse > self.CHECK_INTERVAL:
                    if not experiment_still_running(self.experiment_uuid):
                        logger.info(
                            'Experiment `{}` is not running, stopping build'.
                            format(self.experiment_uuid))
                        return False
                    check_pulse = 0

            return True
        finally:
            # Checkout back to master. Done in `finally` so the repo is not
            # left on the image-tag commit when the build stops early or fails.
            git.checkout_commit(repo_path=self.repo_path)
예제 #5
0
def run(k8s_manager, pod_id, experiment_uuid, experiment_name, job_uuid,
        task_type, task_idx, container_job_name):
    """Follow a pod container's log and publish every streamed chunk.

    The log of ``container_job_name`` in pod ``pod_id`` is read from the
    manager's namespace in follow mode, and each chunk yielded by the
    response stream is published with status ``RUNNING`` together with the
    experiment, job and task identifiers passed in.
    """
    log_response = k8s_manager.k8s_api.read_namespaced_pod_log(
        pod_id,
        k8s_manager.namespace,
        container=container_job_name,
        follow=True,
        _preload_content=False)

    # Metadata attached to every published chunk; it does not vary per chunk.
    context = dict(
        status=ExperimentLifeCycle.RUNNING,
        experiment_uuid=experiment_uuid,
        experiment_name=experiment_name,
        job_uuid=job_uuid,
        task_type=task_type,
        task_idx=task_idx,
    )
    for chunk in log_response.stream():
        publisher.publish_log(log_line=chunk, **context)
예제 #6
0
    def push(self):
        """Push ``image_name:image_tag`` and publish the push progress logs.

        Each chunk of the docker push stream is decoded as UTF-8 and split
        into CRLF-separated JSON documents. Per-layer progress is tracked in
        a dict and emitted through the debug logger at most every 1.5s.
        Roughly every ``self.CHECK_INTERVAL`` chunks the experiment is checked,
        and the push is abandoned if it is no longer running.

        Returns:
            ``True`` when the push stream was fully consumed; ``False`` when
            the stream reports an error or the experiment stopped running.
        """
        # Build a progress setup for each layer, and only emit per-layer info every 1.5s
        layers = {}
        last_emit_time = time.time()
        self.connect()
        check_pulse = 0
        for log_line in self.docker.push(self.image_name,
                                         tag=self.image_tag,
                                         stream=True):
            # One chunk may carry several JSON documents; skip empty pieces.
            entries = [json.loads(entry)
                       for entry in log_line.decode('utf-8').split('\r\n')
                       if entry]
            for progress in entries:
                if 'error' in progress:
                    logger.error(progress['error'], extra=dict(phase='failed'))
                    # Explicit False (previously a bare return) so every exit
                    # of this method yields a bool; still falsy for callers.
                    return False
                if 'id' not in progress:
                    continue
                if 'progressDetail' in progress and progress['progressDetail']:
                    layers[progress['id']] = progress['progressDetail']
                else:
                    layers[progress['id']] = progress['status']
                if time.time() - last_emit_time > 1.5:
                    logger.debug('Pushing image\n',
                                 extra=dict(progress=layers, phase='pushing'))
                    last_emit_time = time.time()

                # NOTE(review): the raw chunk is published once per parsed
                # entry that carries an 'id' -- kept as-is; confirm intent.
                publisher.publish_log(
                    log_line=log_line,
                    status=ExperimentLifeCycle.BUILDING,
                    experiment_uuid=self.experiment_uuid,
                    experiment_name=self.experiment_name,
                    job_uuid='all',
                    persist=False  # TODO: ADD log persistence
                )

            # Check if experiment is not stopped in the meanwhile
            check_pulse += 1
            if check_pulse > self.CHECK_INTERVAL:
                if not experiment_still_running(self.experiment_uuid):
                    logger.info(
                        'Experiment `{}` is not running, stopping build'.
                        format(self.experiment_uuid))
                    return False
                check_pulse = 0

        return True