def _process_deployment(self, task_declaration: TaskDeclaration):
    """Accept or reject READY performers for a task in the DEPLOYMENT state.

    Each READY task assignment and verification assignment is checked with
    the matching ``_is_*_allowed`` helper and saved as ACCEPTED or REJECTED.
    Every acceptance decrements ``workers_needed`` / ``verifiers_needed``.

    When both counters are zero the initial train data is assigned.
    Otherwise, if nothing was accepted during this pass, the ACCEPTED
    assignments are counted again; when the counts equal the requested
    numbers the counters are reset to zero and training is started.
    """
    assert task_declaration.state == TaskDeclaration.State.DEPLOYMENT

    with async_commit():
        changed = False

        ready_task_states = (TaskAssignment.State.READY,)
        for task_assignment in task_declaration.get_task_assignments(states=ready_task_states):
            accepted = self._is_task_assignment_allowed(task_declaration, task_assignment)
            task_assignment.state = (
                TaskAssignment.State.ACCEPTED if accepted else TaskAssignment.State.REJECTED
            )
            task_assignment.save()
            if accepted:
                task_declaration.workers_needed -= 1
                changed = True

        ready_verification_states = (VerificationAssignment.State.READY,)
        for verification_assignment in task_declaration.get_verification_assignments(
                states=ready_verification_states):
            accepted = self._is_verification_assignment_allowed(
                task_declaration, verification_assignment)
            verification_assignment.state = (
                VerificationAssignment.State.ACCEPTED
                if accepted else VerificationAssignment.State.REJECTED
            )
            verification_assignment.save()
            if accepted:
                task_declaration.verifiers_needed -= 1
                changed = True

        # persist the declaration only when a counter was actually changed
        if changed:
            task_declaration.save()

    ready_to_start = task_declaration.workers_needed == 0 and task_declaration.verifiers_needed == 0
    status_line = '{} ready: {} workers_needed: {} verifiers_needed: {}'.format(
        task_declaration,
        ready_to_start,
        task_declaration.workers_needed,
        task_declaration.verifiers_needed,
    )
    logger.info(status_line)

    if ready_to_start:
        self._assign_initial_train_data(task_declaration)
        return

    if changed:
        # something was accepted in this pass; the recount is only done for idle passes
        return

    # recheck how many workers and verifiers are really accepted
    accepted_workers = task_declaration.get_task_assignments(
        states=(TaskAssignment.State.ACCEPTED,))
    accepted_verifiers = task_declaration.get_verification_assignments(
        states=(VerificationAssignment.State.ACCEPTED,))
    accepted_workers_count = len(accepted_workers)
    accepted_verifiers_count = len(accepted_verifiers)

    if accepted_workers_count != task_declaration.workers_requested:
        return
    if accepted_verifiers_count != task_declaration.verifiers_requested:
        return

    logger.info('All performers are accepted, start train')
    task_declaration.workers_needed = 0
    task_declaration.verifiers_needed = 0
    self._assign_initial_train_data(task_declaration)
def _reassign_verification_data(self, task_declaration: TaskDeclaration):
    """Hand verification data to replacement verifiers after timeouts.

    Loads the ACCEPTED and TIMEOUT verification assignments of the task and
    pairs them one-to-one. For each pair a new ``VerificationData`` is
    created for the accepted assignment, which is moved to VERIFYING, and
    the timed-out assignment is marked FORGOTTEN. Finally the task is moved
    to VERIFY_IN_PROGRESS and saved.

    Raises:
        AssertionError: if the query returns an assignment in an unexpected
            state, if the number of accepted and timed-out assignments
            differs, or if an accepted assignment already has verification
            data attached.
    """
    verification_assignments = task_declaration.get_verification_assignments(
        states=(VerificationAssignment.State.ACCEPTED, VerificationAssignment.State.TIMEOUT)
    )

    # split accepted and overdue
    accepted_verification_assignments = []
    timeout_verification_assignments = []
    for va in verification_assignments:
        if va.state == VerificationAssignment.State.ACCEPTED:
            accepted_verification_assignments.append(va)
        elif va.state == VerificationAssignment.State.TIMEOUT:
            timeout_verification_assignments.append(va)
        else:
            # The original `assert False and 'Check query!'` dropped the message
            # (the expression is just `False`) and vanished under `python -O`.
            raise AssertionError('Check query!')

    assert len(accepted_verification_assignments) == len(timeout_verification_assignments), \
        'Each accepted verification assignment must replace exactly one timed out assignment'

    train_results = [
        {
            'worker_id': ta.worker_id,
            'result': ta.train_result.weights_ipfs
        }
        for ta in task_declaration.get_task_assignments(states=(TaskAssignment.State.FINISHED,))
    ]

    # lists have equal length (asserted above), so zip pairs every replacement
    # with the timed out assignment at the same position
    for va, failed_va in zip(accepted_verification_assignments, timeout_verification_assignments):
        assert va.verification_data_id is None

        verification_data = VerificationData.create(
            verification_assignment_id=va.asset_id,
            # share data with verifier
            public_key=va.verifier.enc_key,
            test_dir_ipfs=task_declaration.dataset.test_dir_ipfs,
            model_code_ipfs=task_declaration.train_model.code_ipfs,
            train_results=train_results,
            db=self.db,
            encryption=self.encryption
        )

        va.verification_data_id = verification_data.asset_id
        va.state = VerificationAssignment.State.VERIFYING
        va.save()

        failed_va.state = VerificationAssignment.State.FORGOTTEN
        failed_va.save()

    task_declaration.state = TaskDeclaration.State.VERIFY_IN_PROGRESS
    task_declaration.save()
def _republish_for_verify(self, task_declaration: TaskDeclaration):
    """Move the task to DEPLOYMENT_VERIFICATION and release rejected verifiers.

    Requires ``task_declaration.verifiers_needed`` to be positive. The task
    is saved in its new state first; then every REJECTED verification
    assignment is switched to REASSIGN and saved with the verifier's address
    passed as ``recipients``.
    """
    assert task_declaration.verifiers_needed > 0

    task_declaration.state = TaskDeclaration.State.DEPLOYMENT_VERIFICATION
    task_declaration.save()

    rejected_states = (VerificationAssignment.State.REJECTED,)
    for rejected in task_declaration.get_verification_assignments(states=rejected_states):
        rejected.state = VerificationAssignment.State.REASSIGN
        # return back ownership
        verifier_address = rejected.verifier.address
        rejected.save(recipients=verifier_address)
def _assign_verification_data(self, task_declaration: TaskDeclaration, task_assignments: ListTaskAssignments):
    """Send the given train results to the task's verifiers.

    Collects ``worker_id`` / ``weights_ipfs`` of every passed task assignment
    and adds each assignment's ``tflops`` to ``task_declaration.tflops``.
    ACCEPTED verification assignments get a newly created
    ``VerificationData``; FINISHED ones have the ``train_results`` of their
    existing verification data replaced. Both kinds are then moved to
    VERIFYING, and the task is saved as VERIFY_IN_PROGRESS.
    """
    train_results = []
    for task_assignment in task_assignments:
        entry = {
            'worker_id': task_assignment.worker_id,
            'result': task_assignment.train_result.weights_ipfs
        }
        train_results.append(entry)
        task_declaration.tflops += task_assignment.train_result.tflops

    wanted_states = (
        VerificationAssignment.State.ACCEPTED,
        VerificationAssignment.State.FINISHED,
    )
    for va in task_declaration.get_verification_assignments(states=wanted_states):
        if va.state == VerificationAssignment.State.ACCEPTED:
            # first hand-out for this verifier: nothing may be attached yet
            assert va.verification_data_id is None

            created = VerificationData.create(
                verification_assignment_id=va.asset_id,
                # share data with verifier
                public_key=va.verifier.enc_key,
                test_dir_ipfs=task_declaration.dataset.test_dir_ipfs,
                model_code_ipfs=task_declaration.train_model.code_ipfs,
                train_results=train_results,
                db=self.db,
                encryption=self.encryption
            )
            va.verification_data_id = created.asset_id
        elif va.state == VerificationAssignment.State.FINISHED:
            # verifier already has verification data: only refresh the results
            existing = va.verification_data
            existing.train_results = train_results
            existing.save()
        else:
            continue

        va.state = VerificationAssignment.State.VERIFYING
        va.save()

    task_declaration.state = TaskDeclaration.State.VERIFY_IN_PROGRESS
    task_declaration.save()
def _process_verify_in_progress(self, task_declaration: TaskDeclaration):
    """Advance a task that is in the VERIFY_IN_PROGRESS state.

    Walks over the VERIFYING and FINISHED verification assignments:

    * a VERIFYING assignment whose iteration is finished becomes FINISHED;
    * a FINISHED assignment either marks the task as failed (its result has
      an error) or is collected as a finished verification;
    * an assignment that is still VERIFYING and whose verification result
      was not modified for more than ``settings.WAIT_VERIFY_TIMEOUT``
      seconds is marked TIMEOUT.

    Afterwards, in this order: timeouts cause the task to be republished
    for verification; a failed verification marks the task FAILED; too few
    finished verifications means "wait"; detected fake workers are rejected
    and the task is republished for training; a non-final iteration moves
    on to the next iteration; otherwise the task is marked COMPLETED.
    """
    assert task_declaration.state == TaskDeclaration.State.VERIFY_IN_PROGRESS

    verification_assignments = task_declaration.get_verification_assignments(
        states=(
            VerificationAssignment.State.VERIFYING,
            VerificationAssignment.State.FINISHED
        )
    )

    failed = False
    finished_verification_assignments = []
    count_timeout = 0
    with async_commit():
        for va in verification_assignments:
            if va.state == VerificationAssignment.State.VERIFYING and va.iteration_is_finished:
                va.state = VerificationAssignment.State.FINISHED
                va.save()

            if va.state == VerificationAssignment.State.FINISHED:
                if va.verification_result.error:
                    failed = True
                else:
                    finished_verification_assignments.append(va)
                continue

            # Still verifying: check how long ago the result was last touched.
            modified_at = va.verification_result.modified_at
            if modified_at.tzinfo is None:
                # naive timestamp; treated as UTC, as the previous code did
                # NOTE(review): confirm stored naive timestamps are UTC
                now = datetime.datetime.utcnow()
            else:
                # Take the current time in the timestamp's own zone. Stamping
                # the UTC wall clock with that tzinfo (the previous behaviour)
                # is off by the UTC offset for any non-UTC zone.
                now = datetime.datetime.now(tz=modified_at.tzinfo)

            if (now - modified_at).total_seconds() > settings.WAIT_VERIFY_TIMEOUT:
                va.state = VerificationAssignment.State.TIMEOUT
                va.save()

                logger.info('Timeout of waiting for {}'.format(va))
                count_timeout += 1

    if count_timeout:
        # every timed out verifier has to be replaced
        task_declaration.verifiers_needed += count_timeout
        self._republish_for_verify(task_declaration)
        return

    if failed:
        logger.info('{} is failed'.format(task_declaration))
        task_declaration.state = TaskDeclaration.State.FAILED
        task_declaration.save()
        return

    if len(finished_verification_assignments) < task_declaration.verifiers_requested:
        # verification is not ready
        logger.info('Wait for finish of verification for {} iteration {}'.format(
            task_declaration, task_declaration.current_iteration))
        return

    fake_workers = self._parse_verification_results(
        task_declaration, finished_verification_assignments)

    if fake_workers:
        logger.info('Fake workers detected')
        fake_worker_ids = []
        for worker_id, count_detections in fake_workers.items():
            logger.info('Fake worker_id: {}, count detections: {}'.format(worker_id, count_detections))
            fake_worker_ids.append(worker_id)

        self._reject_fake_workers(task_declaration, fake_worker_ids)
        self._republish_for_train(task_declaration)
        return

    if not task_declaration.last_iteration:
        self._update_train_data_for_next_iteration(task_declaration)
        return

    task_declaration.progress = 100.0
    task_declaration.state = TaskDeclaration.State.COMPLETED
    task_declaration.save()
    logger.info('{} is finished tflops: {} estimated: {}'.format(
        task_declaration, task_declaration.tflops, task_declaration.estimated_tflops))