Exemplo n.º 1
0
    def create(cls, data, metadata, recipients, db):
        """Create an asset with a CREATE transaction, or load it if it exists.

        The CREATE transaction is prepared and fulfilled locally first; its id
        is used as the asset id. If transactions for that asset id are already
        stored, nothing is sent and the existing asset is returned instead.

        Args:
            data: Asset payload, stored under the ``data`` key of the asset.
                ``data['asset_name']`` is read for the debug log.
            metadata: Metadata attached to the CREATE transaction.
            recipients: Recipients passed to ``transactions.prepare``.
            db: Object exposing ``bdb`` (the transactions API) and ``kp``
                (key pair with ``public_key`` / ``private_key``).

        Returns:
            A ``(asset, created)`` tuple. ``created`` is ``False`` when the
            asset was already present (in that case
            ``_update_if_were_changes`` is applied to it), ``True`` when the
            CREATE transaction was sent by this call.
        """
        prepared_create_tx = db.bdb.transactions.prepare(
            operation='CREATE',
            signers=db.kp.public_key,
            asset={'data': data},
            recipients=recipients,
            metadata=metadata)

        fulfilled_create_tx = db.bdb.transactions.fulfill(
            transaction=prepared_create_tx, private_keys=db.kp.private_key)

        asset_id = fulfilled_create_tx['id']
        logger.debug('Fulfill CREATE tx {} for asset {}'.format(
            asset_id, data['asset_name']))

        # check is asset already created
        logger.debug("Check is asset already created: {}".format(asset_id))
        txs = db.bdb.transactions.get(asset_id=asset_id)
        if txs:
            logger.debug("Asset already exists: {}".format(asset_id))
            asset = cls(asset_id=asset_id, transactions=txs, db=db)
            asset._update_if_were_changes(metadata, recipients)
            return asset, False

        from tatau_core.db.db import async_commit
        ac = async_commit()
        # ``async`` is a reserved keyword since Python 3.7, so the attribute
        # cannot be spelled ``ac.async`` any more; getattr reads the same
        # attribute without tripping the parser.
        if getattr(ac, 'async'):
            db.bdb.transactions.send_async(fulfilled_create_tx)
            ac.add_tx_id(asset_id)
        else:
            db.bdb.transactions.send_commit(fulfilled_create_tx)

        return cls(asset_id=asset_id,
                   transactions=[fulfilled_create_tx],
                   db=db), True
Exemplo n.º 2
0
    def _process_epoch_in_progress(self, task_declaration: TaskDeclaration):
        """Advance a task declaration that is in the EPOCH_IN_PROGRESS state.

        Walks the TRAINING/FINISHED task assignments: a TRAINING assignment
        whose iteration is finished is marked FINISHED; a FINISHED assignment
        either flags the whole task as failed (its train result has an error)
        or is collected; any other assignment is marked TIMEOUT once its train
        result has not been modified for more than
        ``settings.WAIT_TRAIN_TIMEOUT`` seconds.

        Afterwards, in this order: a failure moves the declaration to FAILED;
        timeouts raise ``workers_needed`` and republish for training; too few
        finished assignments means keep waiting; otherwise verification data
        is assigned (loss/accuracy is saved first when the current iteration
        is greater than 1).
        """
        assert task_declaration.state == TaskDeclaration.State.EPOCH_IN_PROGRESS

        tracked_states = (
            TaskAssignment.State.TRAINING,
            TaskAssignment.State.FINISHED,
        )
        assignments = task_declaration.get_task_assignments(states=tracked_states)

        has_failure = False
        completed = []
        timed_out = 0
        with async_commit():
            for assignment in assignments:
                still_training = assignment.state == TaskAssignment.State.TRAINING
                if still_training and assignment.iteration_is_finished:
                    assignment.state = TaskAssignment.State.FINISHED
                    assignment.save()

                if assignment.state == TaskAssignment.State.FINISHED:
                    if assignment.train_result.error:
                        has_failure = True
                    else:
                        completed.append(assignment)
                    continue

                # Not finished yet: check how long the train result has been idle.
                train_timeout = settings.WAIT_TRAIN_TIMEOUT
                now = datetime.datetime.utcnow().replace(
                    tzinfo=assignment.train_result.modified_at.tzinfo)
                idle_seconds = (now - assignment.train_result.modified_at).total_seconds()
                if idle_seconds > train_timeout:
                    assignment.state = TaskAssignment.State.TIMEOUT
                    assignment.save()

                    logger.info('Timeout of waiting for {}'.format(assignment))
                    timed_out += 1

        if has_failure:
            logger.info('{} is failed'.format(task_declaration))
            task_declaration.state = TaskDeclaration.State.FAILED
            task_declaration.save()
        elif timed_out:
            task_declaration.workers_needed += timed_out
            self._republish_for_train(task_declaration)
        elif len(completed) < task_declaration.workers_requested:
            logger.info('Wait for finish of training for {} iteration {}'.format(
                task_declaration, task_declaration.current_iteration))
        else:
            if task_declaration.current_iteration > 1:
                self._save_loss_and_accuracy(task_declaration, completed)

            self._assign_verification_data(task_declaration, completed)
Exemplo n.º 3
0
    def _process_deployment(self, task_declaration: TaskDeclaration):
        """Accept or reject READY performers of a declaration in DEPLOYMENT.

        Every READY task assignment and verification assignment is either
        ACCEPTED (decrementing ``workers_needed`` / ``verifiers_needed``) or
        REJECTED, depending on the corresponding ``_is_*_allowed`` check.
        When nobody else is needed, initial train data is assigned. If this
        pass changed nothing, the ACCEPTED assignments are recounted and,
        when they match the requested numbers, training is started as well.
        """
        assert task_declaration.state == TaskDeclaration.State.DEPLOYMENT

        with async_commit():
            changed = False

            ready_workers = task_declaration.get_task_assignments(
                states=(TaskAssignment.State.READY,))
            for ta in ready_workers:
                allowed = self._is_task_assignment_allowed(task_declaration, ta)
                ta.state = TaskAssignment.State.ACCEPTED if allowed else TaskAssignment.State.REJECTED
                ta.save()
                if allowed:
                    task_declaration.workers_needed -= 1
                    changed = True

            ready_verifiers = task_declaration.get_verification_assignments(
                states=(VerificationAssignment.State.READY,))
            for va in ready_verifiers:
                allowed = self._is_verification_assignment_allowed(task_declaration, va)
                va.state = (VerificationAssignment.State.ACCEPTED if allowed
                            else VerificationAssignment.State.REJECTED)
                va.save()
                if allowed:
                    task_declaration.verifiers_needed -= 1
                    changed = True

            # persist the declaration only when a counter was touched
            if changed:
                task_declaration.save()

        ready_to_start = task_declaration.workers_needed == 0 and task_declaration.verifiers_needed == 0
        logger.info('{} ready: {} workers_needed: {} verifiers_needed: {}'.format(
            task_declaration, ready_to_start, task_declaration.workers_needed, task_declaration.verifiers_needed))

        if ready_to_start:
            self._assign_initial_train_data(task_declaration)
            return

        if changed:
            return

        # nothing changed in this pass: recheck how many workers and
        # verifiers are really accepted
        accepted_workers = task_declaration.get_task_assignments(
            states=(TaskAssignment.State.ACCEPTED,))
        accepted_workers_count = len(accepted_workers)

        accepted_verifiers = task_declaration.get_verification_assignments(
            states=(VerificationAssignment.State.ACCEPTED,))
        accepted_verifiers_count = len(accepted_verifiers)

        everyone_accepted = (
            accepted_workers_count == task_declaration.workers_requested
            and accepted_verifiers_count == task_declaration.verifiers_requested
        )
        if everyone_accepted:
            logger.info('All performers are accepted, start train')
            task_declaration.workers_needed = 0
            task_declaration.verifiers_needed = 0
            self._assign_initial_train_data(task_declaration)
Exemplo n.º 4
0
    def _process_estimate_is_in_progress(self, task_declaration: TaskDeclaration):
        """Advance a declaration that is in the ESTIMATE_IS_IN_PROGRESS state.

        ESTIMATING assignments whose estimation result is FINISHED are marked
        FINISHED; the others are marked TIMEOUT once their estimation result
        has not been modified for more than ``settings.WAIT_ESTIMATE_TIMEOUT``
        seconds. Timeouts raise ``estimators_needed`` and republish the task
        for estimation. When the number of finished assignments equals
        ``estimators_requested`` the estimate is computed and the declaration
        is saved as ESTIMATED, or FAILED if the estimator reports a failure.
        """
        assert task_declaration.state == TaskDeclaration.State.ESTIMATE_IS_IN_PROGRESS

        tracked_states = (
            EstimationAssignment.State.ESTIMATING,
            EstimationAssignment.State.FINISHED
        )
        assignments = task_declaration.get_estimation_assignments(states=tracked_states)

        done = []
        timed_out = 0
        with async_commit():
            for assignment in assignments:
                if assignment.state == EstimationAssignment.State.ESTIMATING:
                    if assignment.estimation_result.state == EstimationResult.State.FINISHED:
                        assignment.state = EstimationAssignment.State.FINISHED
                        assignment.save()
                    else:
                        estimate_timeout = settings.WAIT_ESTIMATE_TIMEOUT
                        now = datetime.datetime.utcnow().replace(
                            tzinfo=assignment.estimation_result.modified_at.tzinfo)
                        idle_seconds = (now - assignment.estimation_result.modified_at).total_seconds()
                        if idle_seconds > estimate_timeout:
                            assignment.state = EstimationAssignment.State.TIMEOUT
                            assignment.save()

                            logger.info('Timeout of waiting for {}'.format(assignment))
                            timed_out += 1

                if assignment.state == EstimationAssignment.State.FINISHED:
                    done.append(assignment)

        if timed_out:
            task_declaration.estimators_needed += timed_out
            self._republish_for_estimation(task_declaration)
        elif len(done) == task_declaration.estimators_requested:
            # state is switched before the estimator runs, as the estimator
            # receives the declaration itself
            task_declaration.state = TaskDeclaration.State.ESTIMATED
            estimated_tflops, failed = Estimator.estimate(task_declaration, done)
            task_declaration.estimated_tflops = estimated_tflops
            if failed:
                logger.info('{} is failed'.format(task_declaration))
                task_declaration.state = TaskDeclaration.State.FAILED
            task_declaration.save()
        else:
            logger.info('Wait of finish for estimation {}, finished: {}, requested: {}'.format(
                task_declaration, len(done), task_declaration.estimators_requested
            ))
Exemplo n.º 5
0
    def save(self, metadata, recipients):
        """Persist a new state of the asset with a TRANSFER transaction.

        The transfer spends output 0 of ``self.last_tx``. Before building it,
        the method polls once per second (with no upper bound) until a block
        containing the previous transaction is found.

        Args:
            metadata: Metadata attached to the TRANSFER transaction.
            recipients: Recipients of the transfer; when falsy, the public key
                of ``self.db.kp`` is used instead.

        The fulfilled transaction is appended to ``self._transactions``.
        """
        previous_tx = self.last_tx

        # we cant create tx if previous tx was not committed
        while not self.db.bdb.blocks.get(txid=previous_tx['id']):
            logger.debug('Previous tx is not committed, waiting...')
            time.sleep(1)

        output_index = 0
        output = previous_tx['outputs'][output_index]

        transfer_input = {
            'fulfillment': output['condition']['details'],
            'fulfills': {
                'output_index': output_index,
                'transaction_id': previous_tx['id'],
            },
            'owners_before': output['public_keys'],
        }

        prepared_transfer_tx = self.db.bdb.transactions.prepare(
            operation='TRANSFER',
            asset={'id': self.asset_id},
            inputs=transfer_input,
            recipients=recipients or self.db.kp.public_key,
            metadata=metadata,
        )

        fulfilled_transfer_tx = self.db.bdb.transactions.fulfill(
            prepared_transfer_tx,
            private_keys=self.db.kp.private_key,
        )

        logger.debug('Fulfill TRANSFER tx {} for asset {}'.format(
            fulfilled_transfer_tx['id'], self.data['asset_name']))

        from tatau_core.db.db import async_commit
        ac = async_commit()
        # ``async`` is a reserved keyword since Python 3.7, so the attribute
        # cannot be spelled ``ac.async`` any more; getattr reads the same
        # attribute without tripping the parser.
        if getattr(ac, 'async'):
            self.db.bdb.transactions.send_async(fulfilled_transfer_tx)
            ac.add_tx_id(fulfilled_transfer_tx['id'])
        else:
            self.db.bdb.transactions.send_commit(fulfilled_transfer_tx)

        self._transactions.append(fulfilled_transfer_tx)
Exemplo n.º 6
0
    def _process_estimate_is_required(self, task_declaration: TaskDeclaration):
        """Accept or reject READY estimators of a declaration awaiting estimation.

        Each READY estimation assignment is ACCEPTED (decrementing
        ``estimators_needed``) or REJECTED according to
        ``_is_estimation_assignment_allowed``. Once no more estimators are
        needed, estimate data is assigned.
        """
        assert task_declaration.state == TaskDeclaration.State.ESTIMATE_IS_REQUIRED

        with async_commit():
            changed = False

            ready_assignments = task_declaration.get_estimation_assignments(
                states=(EstimationAssignment.State.READY,))
            for assignment in ready_assignments:
                allowed = self._is_estimation_assignment_allowed(task_declaration, assignment)
                assignment.state = (EstimationAssignment.State.ACCEPTED if allowed
                                    else EstimationAssignment.State.REJECTED)
                assignment.save()
                if allowed:
                    task_declaration.estimators_needed -= 1
                    changed = True

            # persist the declaration only when the counter was touched
            if changed:
                task_declaration.save()

        if task_declaration.estimators_needed != 0:
            return

        # in assign changes will be saved
        self._assign_estimate_data(task_declaration)
Exemplo n.º 7
0
    def _process_verify_in_progress(self, task_declaration: TaskDeclaration):
        """Advance a task declaration that is in the VERIFY_IN_PROGRESS state.

        Walks the VERIFYING/FINISHED verification assignments: a VERIFYING
        assignment whose iteration is finished is marked FINISHED; a FINISHED
        assignment either flags the task as failed (its verification result
        has an error) or is collected; any other assignment is marked TIMEOUT
        once its verification result has not been modified for more than
        ``settings.WAIT_VERIFY_TIMEOUT`` seconds.

        Afterwards, in this order: timeouts raise ``verifiers_needed`` and
        republish for verification; a failure moves the declaration to
        FAILED; too few finished assignments means keep waiting; detected
        fake workers are rejected and the task is republished for training;
        a non-last iteration prepares train data for the next iteration;
        otherwise the declaration is saved as COMPLETED with progress 100.0.
        """
        assert task_declaration.state == TaskDeclaration.State.VERIFY_IN_PROGRESS

        tracked_states = (
            VerificationAssignment.State.VERIFYING,
            VerificationAssignment.State.FINISHED
        )
        assignments = task_declaration.get_verification_assignments(states=tracked_states)

        has_failure = False
        completed = []
        timed_out = 0
        with async_commit():
            for assignment in assignments:
                still_verifying = assignment.state == VerificationAssignment.State.VERIFYING
                if still_verifying and assignment.iteration_is_finished:
                    assignment.state = VerificationAssignment.State.FINISHED
                    assignment.save()

                if assignment.state == VerificationAssignment.State.FINISHED:
                    if assignment.verification_result.error:
                        has_failure = True
                    else:
                        completed.append(assignment)
                    continue

                # Not finished yet: check how long the result has been idle.
                verify_timeout = settings.WAIT_VERIFY_TIMEOUT
                now = datetime.datetime.utcnow().replace(
                    tzinfo=assignment.verification_result.modified_at.tzinfo)
                idle_seconds = (now - assignment.verification_result.modified_at).total_seconds()
                if idle_seconds > verify_timeout:
                    assignment.state = VerificationAssignment.State.TIMEOUT
                    assignment.save()

                    logger.info('Timeout of waiting for {}'.format(assignment))
                    timed_out += 1

        if timed_out:
            task_declaration.verifiers_needed += timed_out
            self._republish_for_verify(task_declaration)
            return

        if has_failure:
            logger.info('{} is failed'.format(task_declaration))
            task_declaration.state = TaskDeclaration.State.FAILED
            task_declaration.save()
            return

        if len(completed) < task_declaration.verifiers_requested:
            # verification is not ready
            logger.info('Wait for finish of verification for {} iteration {}'.format(
                task_declaration, task_declaration.current_iteration))
            return

        fake_workers = self._parse_verification_results(task_declaration, completed)
        if fake_workers:
            logger.info('Fake workers detected')
            fake_worker_ids = []
            for worker_id, detections in fake_workers.items():
                logger.info('Fake worker_id: {}, count detections: {}'.format(worker_id, detections))
                fake_worker_ids.append(worker_id)
            self._reject_fake_workers(task_declaration, fake_worker_ids)
            self._republish_for_train(task_declaration)
            return

        if not task_declaration.last_iteration:
            self._update_train_data_for_next_iteration(task_declaration)
            return

        # last iteration verified without problems: the task is done
        task_declaration.progress = 100.0
        task_declaration.state = TaskDeclaration.State.COMPLETED
        task_declaration.save()
        logger.info('{} is finished tflops: {} estimated: {}'.format(
            task_declaration, task_declaration.tflops, task_declaration.estimated_tflops))
Exemplo n.º 8
0
    def _assign_initial_train_data(self, task_declaration: TaskDeclaration):
        """Create and hand out the first portion of train data to accepted workers.

        Bumps ``current_iteration`` and resets ``current_iteration_retry``,
        splits the entries of the dataset's train and test directories into
        ``workers_requested`` chunks, creates one ``TrainData`` per ACCEPTED
        task assignment, then links each train data to its assignment, moves
        the assignment to TRAINING and the declaration to EPOCH_IN_PROGRESS.
        """
        assert task_declaration.state == TaskDeclaration.State.DEPLOYMENT
        # start of train
        task_declaration.current_iteration += 1
        task_declaration.current_iteration_retry = 0

        accepted_assignments = task_declaration.get_task_assignments(
            states=(TaskAssignment.State.ACCEPTED,))

        # only the directory entries are used; the second element is ignored
        train_dirs_ipfs, _ = Directory(multihash=task_declaration.dataset.train_dir_ipfs).ls()
        test_dirs_ipfs, _ = Directory(multihash=task_declaration.dataset.test_dir_ipfs).ls()

        train_multihashes = [entry.multihash for entry in train_dirs_ipfs]
        train_chunks_per_worker = self._chunk_it(
            iterable=train_multihashes,
            count=task_declaration.workers_requested
        )
        assert len(train_chunks_per_worker) == task_declaration.workers_requested

        test_multihashes = [entry.multihash for entry in test_dirs_ipfs]
        test_chunks_per_worker = self._chunk_it(
            iterable=test_multihashes,
            count=task_declaration.workers_requested
        )
        assert len(test_chunks_per_worker) == task_declaration.workers_requested

        pairs = []
        with async_commit():
            # create TrainData
            for position, assignment in enumerate(accepted_assignments):
                train_chunks_ipfs = train_chunks_per_worker[position]
                test_chunks_ipfs = test_chunks_per_worker[position]

                created = TrainData.create(
                    model_code_ipfs=task_declaration.train_model.code_ipfs,
                    train_chunks_ipfs=train_chunks_ipfs,
                    test_chunks_ipfs=test_chunks_ipfs,
                    data_index=position,
                    db=self.db,
                    encryption=self.encryption
                )

                pairs.append((created, assignment))
                logger.debug('Created {}, train chunks: {}, count:{}, test chunks: {}, count:{}'.format(
                    created, train_chunks_ipfs, len(train_chunks_ipfs), test_chunks_ipfs, len(test_chunks_ipfs)))

        # one train data must have been created for every requested worker
        assert task_declaration.workers_requested == len(pairs)

        with async_commit():
            # share to worker
            for created, assignment in pairs:
                created.task_assignment_id = assignment.asset_id
                created.set_encryption_key(assignment.worker.enc_key)
                created.save()

                assignment.train_data_id = created.asset_id
                assignment.state = TaskAssignment.State.TRAINING
                assignment.save()

            task_declaration.state = TaskDeclaration.State.EPOCH_IN_PROGRESS
            task_declaration.save()