Пример #1
0
    def _update_train_data_for_next_iteration(self, task_declaration: TaskDeclaration):
        """Move ``task_declaration`` from verification on to its next iteration.

        Bumps ``current_iteration`` (resetting the retry counter), recomputes
        ``progress``, re-keys the train data of every FINISHED task assignment
        with its worker's encryption key and puts that assignment back into
        TRAINING, then switches the declaration to EPOCH_IN_PROGRESS and saves.

        The declaration must be in VERIFY_IN_PROGRESS, and the number of
        restarted assignments must equal ``workers_requested``; both are
        enforced with ``assert``.
        """
        assert task_declaration.state == TaskDeclaration.State.VERIFY_IN_PROGRESS

        task_declaration.current_iteration += 1
        task_declaration.current_iteration_retry = 0

        # Progress in percent: iterations counted so far, times epochs per
        # iteration, relative to the total number of epochs.
        epochs_covered = task_declaration.current_iteration * task_declaration.epochs_in_iteration
        task_declaration.progress = epochs_covered * 100 / task_declaration.epochs

        finished_only = (TaskAssignment.State.FINISHED,)
        restarted = 0
        for assignment in task_declaration.get_task_assignments(states=finished_only):
            # Share the data with the worker by setting the worker's key on it.
            shared_data = assignment.train_data
            shared_data.set_encryption_key(assignment.worker.enc_key)
            shared_data.save()

            assignment.state = TaskAssignment.State.TRAINING
            assignment.save()
            restarted += 1

        # Every requested worker is expected to have a finished assignment.
        assert task_declaration.workers_requested == restarted

        task_declaration.state = TaskDeclaration.State.EPOCH_IN_PROGRESS
        task_declaration.save()
Пример #2
0
    def _process_verify_in_progress(self, task_declaration: TaskDeclaration):
        """Advance a task declaration that is waiting for verification results.

        Inspects the VERIFYING and FINISHED verification assignments and then
        takes the first step that applies:

        * at least one assignment timed out: raise ``verifiers_needed`` by the
          number of timeouts and republish the task for verification;
        * a finished verification reported an error: mark the task FAILED;
        * fewer error-free finished verifications than ``verifiers_requested``:
          log and keep waiting;
        * fake workers were reported: reject them and republish for training;
        * this is not the last iteration: prepare the next iteration;
        * otherwise: set progress to 100 and mark the task COMPLETED.

        The declaration must be in VERIFY_IN_PROGRESS (enforced with
        ``assert``).
        """
        assert task_declaration.state == TaskDeclaration.State.VERIFY_IN_PROGRESS
        verification_assignments = task_declaration.get_verification_assignments(
            states=(
                VerificationAssignment.State.VERIFYING,
                VerificationAssignment.State.FINISHED
            )
        )

        failed = False
        finished_verification_assignments = []
        count_timeout = 0
        with async_commit():
            for va in verification_assignments:
                # Promote assignments whose iteration has completed so they are
                # handled as FINISHED in this same pass.
                if va.state == VerificationAssignment.State.VERIFYING:
                    if va.iteration_is_finished:
                        va.state = VerificationAssignment.State.FINISHED
                        va.save()

                if va.state == VerificationAssignment.State.FINISHED:
                    if va.verification_result.error:
                        failed = True
                    else:
                        finished_verification_assignments.append(va)
                    continue

                # Still verifying: check how long ago the result was last modified.
                verify_timeout = settings.WAIT_VERIFY_TIMEOUT
                modified_at = va.verification_result.modified_at
                # Use a real UTC "now". Stamping UTC wall-clock time with the
                # timestamp's own tzinfo would skew the elapsed time by the UTC
                # offset whenever that tzinfo is not UTC.
                now = datetime.datetime.now(datetime.timezone.utc)
                if modified_at.utcoffset() is None:
                    # Naive timestamp: keep the previous behaviour of treating
                    # it as UTC, and make both operands the same kind so the
                    # subtraction is valid.
                    now = now.replace(tzinfo=modified_at.tzinfo)
                if (now - modified_at).total_seconds() > verify_timeout:
                    va.state = VerificationAssignment.State.TIMEOUT
                    va.save()

                    logger.info('Timeout of waiting for {}'.format(va))
                    count_timeout += 1

        # Timeouts take precedence over failures: ask for replacement verifiers.
        if count_timeout:
            task_declaration.verifiers_needed += count_timeout
            self._republish_for_verify(task_declaration)
            return

        if failed:
            logger.info('{} is failed'.format(task_declaration))
            task_declaration.state = TaskDeclaration.State.FAILED
            task_declaration.save()
            return

        if len(finished_verification_assignments) < task_declaration.verifiers_requested:
            # verification is not ready
            logger.info('Wait for finish of verification for {} iteration {}'.format(
                task_declaration, task_declaration.current_iteration))
            return

        # Iterated below as worker id -> number of detections.
        fake_workers = self._parse_verification_results(
            task_declaration, finished_verification_assignments)

        if fake_workers:
            logger.info('Fake workers detected')
            fake_worker_ids = []
            for worker_id, count_detections in fake_workers.items():
                logger.info('Fake worker_id: {}, count detections: {}'.format(worker_id, count_detections))
                fake_worker_ids.append(worker_id)
            self._reject_fake_workers(task_declaration, fake_worker_ids)
            self._republish_for_train(task_declaration)
            return

        if not task_declaration.last_iteration:
            self._update_train_data_for_next_iteration(task_declaration)
            return

        task_declaration.progress = 100.0
        task_declaration.state = TaskDeclaration.State.COMPLETED
        task_declaration.save()
        logger.info('{} is finished tflops: {} estimated: {}'.format(
            task_declaration, task_declaration.tflops, task_declaration.estimated_tflops))