    def run(self, config: Dict[Text, Any]):
        # Extract the paths to create the tar
        logger.info('Orchestrating pipeline on Kubernetes...')

        repo: Repository = Repository.get_instance()
        repo_path = repo.path
        config_dir = repo.zenml_config.config_dir
        tar_file_name = \
            f'{EXTRACTED_TAR_DIR_NAME}_{str(int(time.time()))}.tar.gz'
        path_to_tar = os.path.join(config_dir, tar_file_name)

        # Create the tarfile, excluding the .zenml folder if it exists
        path_utils.create_tarfile(repo_path, path_to_tar)
        logger.info(f'Created tar of current repository at: {path_to_tar}')

        # Upload tar to artifact store
        store_path = config[keys.GlobalKeys.ARTIFACT_STORE]
        store_staging_area = os.path.join(store_path, STAGING_AREA)
        store_path_to_tar = os.path.join(store_staging_area, tar_file_name)
        path_utils.copy(path_to_tar, store_path_to_tar)
        logger.info(f'Copied tar to artifact store at: {store_path_to_tar}')

        # Remove tar
        path_utils.rm_dir(path_to_tar)
        logger.info(f'Removed tar at: {path_to_tar}')

        # Record the tar path in the backend args of the config
        config[keys.GlobalKeys.BACKEND][
            keys.BackendKeys.ARGS][TAR_PATH_ARG] = store_path_to_tar

        # Launch the instance
        self.launch_job(config)
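
The counterpart of this staging step runs inside the launched job: the tar is fetched from the artifact store and the repository is unpacked before the pipeline executes. A minimal sketch of that step, assuming `path_utils.copy` can also copy from the remote store to a local path; the helper name `fetch_and_extract_tar` is hypothetical:

import os
import tarfile
import tempfile

def fetch_and_extract_tar(store_path_to_tar: str) -> str:
    # Hypothetical worker-side helper: copy the staged tar into a local
    # temp directory and extract the repository there.
    workdir = tempfile.mkdtemp()
    local_tar = os.path.join(workdir, os.path.basename(store_path_to_tar))
    path_utils.copy(store_path_to_tar, local_tar)  # remote -> local (assumed)
    with tarfile.open(local_tar, 'r:gz') as tar:
        tar.extractall(path=workdir)
    return workdir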
Example #2
    def run_fn(self):
        train_dataset = self.input_fn(self.train_files,
                                      self.tf_transform_output)

        eval_dataset = self.input_fn(self.eval_files, self.tf_transform_output)

        device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
        model = self.model_fn(train_dataset, eval_dataset)

        model.to(device)
        criterion = nn.BCEWithLogitsLoss()
        optimizer = optim.Adam(model.parameters(), lr=0.001)

        model.train()
        for e in range(1, self.epoch + 1):
            epoch_loss = 0
            epoch_acc = 0
            step_count = 0
            for x, y in train_dataset:
                step_count += 1
                X_batch, y_batch = x.to(device), y.to(device)
                optimizer.zero_grad()
                y_pred = model(X_batch)

                loss = criterion(y_pred, y_batch)
                acc = binary_acc(y_pred, y_batch)

                loss.backward()
                optimizer.step()

                epoch_loss += loss.item()
                epoch_acc += acc.item()

            print(f'Epoch {e:03}: | Loss: '
                  f'{epoch_loss / step_count:.5f} | Acc: '
                  f'{epoch_acc / step_count:.3f}')

        path_utils.create_dir_if_not_exists(self.serving_model_dir)
        if path_utils.is_remote(self.serving_model_dir):
            temp_model_dir = '__temp_model_dir__'
            temp_path = os.path.join(os.getcwd(), temp_model_dir)
            if path_utils.is_dir(temp_path):
                raise PermissionError(f'{temp_path} is used as a temp path '
                                      f'but it already exists. Please '
                                      f'remove it to continue.')
            # Save into a fresh local directory so copy_dir can mirror it
            path_utils.create_dir_if_not_exists(temp_path)
            torch.save(model, os.path.join(temp_path, 'model.pt'))
            path_utils.copy_dir(temp_path, self.serving_model_dir)
            path_utils.rm_dir(temp_path)
        else:
            torch.save(model, os.path.join(self.serving_model_dir, 'model.pt'))
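
`binary_acc` is used above but not defined in the snippet. A minimal sketch of a plausible implementation, assuming logits as predictions and 0/1 labels (an assumption, not the original helper):

import torch

def binary_acc(y_pred, y_true):
    # Assumed: threshold the sigmoid of the logits at 0.5 and report
    # the fraction of correct predictions as a percentage.
    y_pred_tag = torch.round(torch.sigmoid(y_pred))
    correct = (y_pred_tag == y_true).float().sum()
    return correct / y_true.shape[0] * 100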
Example #3
    def run(self, config: Dict[Text, Any]):
        """
        This run function essentially calls an underlying TFX orchestrator run.
        However, it is meant as a higher-level abstraction that makes
        some opinionated decisions.

        Args:
            config: a ZenML config dict
        """
        # Extract the paths to create the tar
        logger.info('Orchestrating pipeline on GCP...')

        repo: Repository = Repository.get_instance()
        repo_path = repo.path
        config_dir = repo.zenml_config.config_dir
        tar_file_name = \
            f'{EXTRACTED_TAR_DIR_NAME}_{str(int(time.time()))}.tar.gz'
        path_to_tar = os.path.join(config_dir, tar_file_name)

        # Create the tarfile, excluding the .zenml folder if it exists
        path_utils.create_tarfile(repo_path, path_to_tar)
        logger.info(f'Created tar of current repository at: {path_to_tar}')

        # Upload tar to artifact store
        store_path = config[keys.GlobalKeys.ARTIFACT_STORE]
        store_staging_area = os.path.join(store_path, STAGING_AREA)
        store_path_to_tar = os.path.join(store_staging_area, tar_file_name)
        path_utils.copy(path_to_tar, store_path_to_tar)
        logger.info(f'Copied tar to artifact store at: {store_path_to_tar}')

        # Remove tar
        path_utils.rm_dir(path_to_tar)
        logger.info(f'Removed tar at: {path_to_tar}')

        # Record the tar path in the backend args of the config
        config[keys.GlobalKeys.BACKEND][
            keys.BackendKeys.ARGS][TAR_PATH_ARG] = store_path_to_tar

        # Launch the instance
        self.launch_instance(config)
Example #4
logger = get_logger(__name__)

# Reset the pipeline root to the tests directory so the YAML files are
# written there
ZENML_ROOT = str(Path(zenml.__path__[0]).parent)
TEST_ROOT = os.path.join(ZENML_ROOT, "tests")
Repository.init_repo(TEST_ROOT, analytics_opt_in=False)

pipeline_root = os.path.join(TEST_ROOT, "pipelines")
csv_root = os.path.join(TEST_ROOT, "test_data")
image_root = os.path.join(csv_root, "images")


repo: Repository = Repository.get_instance()
if path_utils.is_dir(pipeline_root):
    path_utils.rm_dir(pipeline_root)
repo.zenml_config.set_pipelines_dir(pipeline_root)

try:
    for i in range(1, 6):
        training_pipeline = TrainingPipeline(name=f'csvtest{i}')

        try:
            # Add a datasource. This will automatically track and version it.
            ds = CSVDatasource(name='my_csv_datasource',
                               path=os.path.join(csv_root, "my_dataframe.csv"))
        except AlreadyExistsException:
            ds = repo.get_datasource_by_name("my_csv_datasource")

        training_pipeline.add_datasource(ds)
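
        # Hypothetical continuation (the original snippet is truncated
        # here): each pipeline would typically be configured with its
        # remaining steps (split, preprocessing, trainer, evaluator)
        # before being run, e.g.:
        training_pipeline.run()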
Example #5
    def run_fn(self):
        train_dataset = self.input_fn(self.train_files,
                                      self.tf_transform_output)

        eval_dataset = self.input_fn(self.eval_files, self.tf_transform_output)

        class LitModel(pl.LightningModule):
            def __init__(self):
                super().__init__()
                self.l1 = torch.nn.Linear(8, 64)
                self.layer_out = torch.nn.Linear(64, 1)

            def forward(self, x):
                x = torch.relu(self.l1(x))
                x = self.layer_out(x)
                return x

            def training_step(self, batch, batch_idx):
                x, y = batch
                y_hat = self(x)
                loss = F.binary_cross_entropy_with_logits(y_hat, y)
                tensorboard_logs = {'train_loss': loss}
                return {'loss': loss, 'log': tensorboard_logs}

            def configure_optimizers(self):
                return torch.optim.Adam(self.parameters(), lr=0.001)

            def train_dataloader(self):
                return train_dataset

            def validation_step(self, batch, batch_idx):
                x, y = batch
                y_hat = self(x)
                return {
                    'val_loss': F.binary_cross_entropy_with_logits(y_hat, y)
                }

            def validation_epoch_end(self, outputs):
                avg_loss = torch.stack([x['val_loss'] for x in outputs]).mean()
                tensorboard_logs = {'val_loss': avg_loss}
                return {'avg_val_loss': avg_loss, 'log': tensorboard_logs}

            def val_dataloader(self):
                return eval_dataset

        model = LitModel()

        # most basic trainer, uses good defaults
        trainer = Trainer(
            default_root_dir=self.log_dir,
            max_epochs=self.epoch,
        )
        trainer.fit(model)

        path_utils.create_dir_if_not_exists(self.serving_model_dir)
        if path_utils.is_remote(self.serving_model_dir):
            temp_model_dir = '__temp_model_dir__'
            temp_path = os.path.join(os.getcwd(), temp_model_dir)
            if path_utils.is_dir(temp_path):
                raise PermissionError(f'{temp_path} is used as a temp path '
                                      f'but it already exists. Please '
                                      f'remove it to continue.')
            path_utils.create_dir_if_not_exists(temp_path)
            trainer.save_checkpoint(os.path.join(temp_path, 'model.ckpt'))
            path_utils.copy_dir(temp_path, self.serving_model_dir)
            path_utils.rm_dir(temp_path)
        else:
            trainer.save_checkpoint(
                os.path.join(self.serving_model_dir, 'model.ckpt'))
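
A checkpoint saved this way can later be restored with Lightning's standard loading API. A minimal usage sketch, where `serving_model_dir` stands in for the directory used above and `LitModel` must be importable in the serving process:

model = LitModel.load_from_checkpoint(
    os.path.join(serving_model_dir, 'model.ckpt'))
model.eval()  # switch to inference mode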
Example #6
    def run_fn(self):
        train_dataset = self.input_fn(self.train_files,
                                      self.tf_transform_output)

        eval_dataset = self.input_fn(self.eval_files,
                                     self.tf_transform_output)

        device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
        model = self.model_fn(train_dataset, eval_dataset)

        model.to(device)
        criterion = nn.BCEWithLogitsLoss()
        optimizer = optim.Adam(model.parameters(), lr=0.001)

        writer = SummaryWriter(self.log_dir)

        model.train()

        total_count = 0

        for e in range(1, self.epochs + 1):
            epoch_loss = 0
            epoch_acc = 0
            step_count = 0
            for x, y, _ in train_dataset:
                step_count += 1
                total_count += 1

                x_batch = torch.cat([v.to(device) for v in x.values()], dim=-1)
                y_batch = torch.cat([v.to(device) for v in y.values()], dim=-1)
                optimizer.zero_grad()

                y_pred = model(x_batch)

                loss = criterion(y_pred, y_batch)
                acc = binary_acc(y_pred, y_batch)

                loss.backward()
                optimizer.step()

                epoch_loss += loss.item()
                epoch_acc += acc.item()

                if e == 1 and step_count == 1:
                    writer.add_graph(model, x_batch)

                writer.add_scalar('training_loss', loss, total_count)
                writer.add_scalar('training_accuracy', acc, total_count)

            print(f'Epoch {e:03}: | Loss: '
                  f'{epoch_loss / step_count:.5f} | Acc: '
                  f'{epoch_acc / step_count:.3f}')

        # test
        test_results = self.test_fn(model, eval_dataset)
        utils.save_test_results(test_results, self.test_results)

        path_utils.create_dir_if_not_exists(self.serving_model_dir)
        if path_utils.is_remote(self.serving_model_dir):
            temp_model_dir = '__temp_model_dir__'
            temp_path = os.path.join(os.getcwd(), temp_model_dir)
            if path_utils.is_dir(temp_path):
                raise PermissionError(f'{temp_path} is used as a temp path '
                                      f'but it already exists. Please '
                                      f'remove it to continue.')
            # Save into a fresh local directory so copy_dir can mirror it
            path_utils.create_dir_if_not_exists(temp_path)
            torch.save(model, os.path.join(temp_path, 'model.pt'))
            path_utils.copy_dir(temp_path, self.serving_model_dir)
            path_utils.rm_dir(temp_path)
        else:
            torch.save(model, os.path.join(self.serving_model_dir, 'model.pt'))
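
Since `torch.save(model, ...)` pickles the entire module rather than just a state_dict, serving code can restore it without rebuilding the architecture first. A minimal usage sketch, where `serving_model_dir` is a stand-in for the directory used above:

model = torch.load(os.path.join(serving_model_dir, 'model.pt'))
model.eval()  # disable dropout / batch-norm updates for inference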