    def fit(self, data):
        """Fit the model to the given data.

        Args:
            data (pandas.DataFrame): Dataset to fit the model to.

        Returns:
            None

        """

        self.preprocessor = Preprocessor(
            continuous_columns=self.continuous_columns)
        data = self.preprocessor.fit_transform(data)
        self.metadata = self.preprocessor.metadata
        dataflow = TGANDataFlow(data, self.metadata)
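        # BatchData stacks datapoints from the dataflow into fixed-size
        # batches, and QueueInput prefetches those batches into a
        # TensorFlow queue on a background thread (tensorpack's standard
        # input pipeline).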
        batch_data = BatchData(dataflow, self.batch_size)
        input_queue = QueueInput(batch_data)

        self.model = self.get_model(training=True)

        # Imported lazily so that comet_ml is only needed when this code
        # path actually runs.
        from tensorpack.callbacks import CometMLMonitor

        trainer = SeparateGANTrainer(
            model=self.model,
            input_queue=input_queue,
            # Run the generator optimizer only once every 6 steps, giving
            # the discriminator several updates per generator update
            # (WGAN-style training schedule).
            g_period=6,
        )

        self.restore_path = os.path.join(self.model_dir, 'checkpoint')

        if os.path.isfile(self.restore_path) and self.restore_session:
            session_init = SaverRestore(self.restore_path)
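            # stats.json is the epoch-level statistics file written by
            # tensorpack's JSONWriter monitor: a JSON list with one dict per
            # epoch, so the last entry's 'epoch_num' is where training
            # stopped.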
            with open(os.path.join(self.log_dir, 'stats.json')) as f:
                starting_epoch = json.load(f)[-1]['epoch_num'] + 1

        else:
            session_init = None
            starting_epoch = 1

        # Keep ('k') or delete ('d') an existing log dir; the call that
        # would use `action` is currently disabled in this example.
        action = 'k' if self.restore_session else 'd'
        # logger.set_logger_dir(self.log_dir, action=action)

        callbacks = []
        monitors = []
        if self.save_checkpoints:
            callbacks.append(ModelSaver(checkpoint_dir=self.model_dir))
        callbacks.append(MergeAllSummaries(period=10))

        if self.experiment is not None:
            monitors.append(CometMLMonitor(experiment=self.experiment))

        trainer.train_with_defaults(callbacks=callbacks,
                                    monitors=monitors,
                                    steps_per_epoch=self.steps_per_epoch,
                                    max_epoch=self.max_epoch,
                                    session_init=session_init,
                                    starting_epoch=starting_epoch)

        self.prepare_sampling()

Example #2

    def fit(self, data):
        """Fit the model to the given data.

        Args:
            data (pandas.DataFrame): Dataset to fit the model to.

        Returns:
            None

        """
        self.preprocessor = Preprocessor(
            continuous_columns=self.continuous_columns)
        data = self.preprocessor.fit_transform(data)
        self.metadata = self.preprocessor.metadata
        dataflow = TGANDataFlow(data, self.metadata)
        batch_data = BatchData(dataflow, self.batch_size)
        input_queue = QueueInput(batch_data)

        self.model = self.get_model(training=True)

        trainer = self.get_trainer(
            model=self.model,
            input_queue=input_queue,
        )

        self.restore_path = os.path.join(self.model_dir, "checkpoint")

        if os.path.isfile(self.restore_path) and self.restore_session:
            session_init = SaverRestore(self.restore_path)
            with open(os.path.join(self.log_dir, "stats.json")) as f:
                starting_epoch = json.load(f)[-1]["epoch_num"] + 1

        else:
            session_init = None
            starting_epoch = 1

        action = "k" if self.restore_session else None
        logger.set_logger_dir(self.log_dir, action=action)

        callbacks = []
        if self.save_checkpoints:
            callbacks.append(ModelSaver(checkpoint_dir=self.model_dir))

        trainer.train_with_defaults(callbacks=callbacks,
                                    steps_per_epoch=self.steps_per_epoch,
                                    max_epoch=self.max_epoch,
                                    session_init=session_init,
                                    starting_epoch=starting_epoch)

        self.prepare_sampling()

Example #3

class TGANModel:
    """Main model from TGAN.

    Args:
        continuous_columns (list[int]): 0-indexed list of the column indices to be considered
            continuous.
        output (str, optional): Path to store the model and its artifacts. Defaults to
            :attr:`output`.
        gpu (str, optional): Comma-separated list of GPU(s) to use. Defaults to :attr:`None`.
        max_epoch (int, optional): Number of epochs to use during training. Defaults to :attr:`5`.
        steps_per_epoch (int, optional): Number of steps to run on each epoch. Defaults to
            :attr:`10000`.
        save_checkpoints (bool, optional): Whether or not to store checkpoints of the model after
            each training epoch. Defaults to :attr:`True`.
        restore_session (bool, optional): Whether or not to continue training from the last
            checkpoint. Defaults to :attr:`True`.
        batch_size (int, optional): Size of the batch to feed the model at each step. Defaults to
            :attr:`200`.
        z_dim (int, optional): Number of dimensions in the noise input for the generator.
            Defaults to :attr:`200`.
        noise (float, optional): Upper bound to the gaussian noise added to categorical columns.
            Defaults to :attr:`0.2`.
        l2norm (float, optional): L2 regularization coefficient when computing losses. Defaults
            to :attr:`0.00001`.
        learning_rate (float, optional): Learning rate for the optimizer. Defaults to
            :attr:`0.001`.
        num_gen_rnn (int, optional): Number of units in the generator RNN cells. Defaults to
            :attr:`100`.
        num_gen_feature (int, optional): Number of features in the generator. Defaults to
            :attr:`100`.
        num_dis_layers (int, optional): Number of layers in the discriminator. Defaults to
            :attr:`1`.
        num_dis_hidden (int, optional): Number of hidden units in each discriminator layer.
            Defaults to :attr:`100`.
        optimizer (str, optional): Name of the optimizer to use during `fit`; possible values are
            [`GradientDescentOptimizer`, `AdamOptimizer`, `AdadeltaOptimizer`]. Defaults to
            :attr:`AdamOptimizer`.
        comet_ml_key (str, optional): API key used to create a comet.ml experiment for logging.
            Defaults to :attr:`None`.
        experiment (comet_ml.Experiment, optional): Already created comet.ml experiment to log
            to; takes precedence over `comet_ml_key`. Defaults to :attr:`None`.
        ds (str, optional): Name of the dataset, logged to the comet.ml experiment when given.
            Defaults to :attr:`None`.
    """
    def __init__(self,
                 continuous_columns,
                 output='output',
                 gpu=None,
                 max_epoch=5,
                 steps_per_epoch=10000,
                 save_checkpoints=True,
                 restore_session=True,
                 batch_size=200,
                 z_dim=200,
                 noise=0.2,
                 l2norm=0.00001,
                 learning_rate=0.001,
                 num_gen_rnn=100,
                 num_gen_feature=100,
                 num_dis_layers=1,
                 num_dis_hidden=100,
                 optimizer='AdamOptimizer',
                 comet_ml_key=None,
                 experiment=None,
                 ds=None):
        """Initialize object."""
        # Output
        self.continuous_columns = continuous_columns
        self.log_dir = os.path.join(output, 'logs')
        self.model_dir = os.path.join(output, 'model')
        self.output = output

        # Training params
        self.max_epoch = max_epoch
        self.steps_per_epoch = steps_per_epoch
        self.save_checkpoints = save_checkpoints
        self.restore_session = restore_session

        # Model params
        self.model = None
        self.batch_size = batch_size
        self.z_dim = z_dim
        self.noise = noise
        self.l2norm = l2norm
        self.learning_rate = learning_rate
        self.num_gen_rnn = num_gen_rnn
        self.num_gen_feature = num_gen_feature
        self.num_dis_layers = num_dis_layers
        self.num_dis_hidden = num_dis_hidden
        self.optimizer = optimizer

        if gpu:
            os.environ['CUDA_VISIBLE_DEVICES'] = gpu

        if experiment is not None:
            self.experiment = experiment
        elif comet_ml_key is not None:
            self.comet_ml_key = comet_ml_key
            self.experiment = Experiment(api_key=comet_ml_key,
                                         project_name='tgan-wgan-gp',
                                         workspace="baukebrenninkmeijer")
        else:
            # Make sure the attribute always exists; `fit` checks it.
            self.experiment = None

        if ds is not None and self.experiment is not None:
            self.experiment.log_dataset_info(name=ds)

        self.gpu = gpu

    def get_model(self, training=True):
        """Return a new instance of the model."""
        return GraphBuilder(metadata=self.metadata,
                            batch_size=self.batch_size,
                            z_dim=self.z_dim,
                            noise=self.noise,
                            l2norm=self.l2norm,
                            learning_rate=self.learning_rate,
                            num_gen_rnn=self.num_gen_rnn,
                            num_gen_feature=self.num_gen_feature,
                            num_dis_layers=self.num_dis_layers,
                            num_dis_hidden=self.num_dis_hidden,
                            optimizer=self.optimizer,
                            training=training)

    def prepare_sampling(self):
        """Prepare model for generate samples."""
        if self.model is None:
            self.model = self.get_model(training=False)

        else:
            self.model.training = False

        predict_config = PredictConfig(
            session_init=SaverRestore(self.restore_path),
            model=self.model,
            input_names=['z'],
            output_names=['gen/gen', 'z'],
        )

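        # RandomZData is an infinite dataflow that yields random noise
        # batches of shape (batch_size, z_dim); the predictor runs each
        # batch through the restored generator graph.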
        self.simple_dataset_predictor = SimpleDatasetPredictor(
            predict_config, RandomZData((self.batch_size, self.z_dim)))

    def fit(self, data):
        """Fit the model to the given data.

        Args:
            data (pandas.DataFrame): Dataset to fit the model to.

        Returns:
            None

        """
        self.preprocessor = Preprocessor(
            continuous_columns=self.continuous_columns)
        data = self.preprocessor.fit_transform(data)
        self.metadata = self.preprocessor.metadata
        dataflow = TGANDataFlow(data, self.metadata)
        batch_data = BatchData(dataflow, self.batch_size)
        input_queue = QueueInput(batch_data)

        self.model = self.get_model(training=True)

        trainer = GANTrainer(
            model=self.model,
            input_queue=input_queue,
        )

        self.restore_path = os.path.join(self.model_dir, 'checkpoint')

        if os.path.isfile(self.restore_path) and self.restore_session:
            session_init = SaverRestore(self.restore_path)
            with open(os.path.join(self.log_dir, 'stats.json')) as f:
                starting_epoch = json.load(f)[-1]['epoch_num'] + 1

        else:
            session_init = None
            starting_epoch = 1

        action = 'k' if self.restore_session else None
        # logger.set_logger_dir(self.log_dir, action=action)

        callbacks = []
        monitors = []
        if self.save_checkpoints:
            callbacks.append(ModelSaver(checkpoint_dir=self.model_dir))
        callbacks.append(MergeAllSummaries(period=10))

        if self.experiment is not None:
            monitors.append(CometMLMonitor(experiment=self.experiment))

        trainer.train_with_defaults(callbacks=callbacks,
                                    monitors=monitors,
                                    steps_per_epoch=self.steps_per_epoch,
                                    max_epoch=self.max_epoch,
                                    session_init=session_init,
                                    starting_epoch=starting_epoch)

        self.prepare_sampling()

    def sample(self, num_samples):
        """Generate samples from the model.

        Args:
            num_samples (int): Number of rows to sample.

        Returns:
            pandas.DataFrame: Generated dataset with `num_samples` rows.

        Raises:
            ValueError: If an unexpected column type is found in the metadata.

        """
        # Ceiling division: enough batches to cover num_samples rows. Plain
        # floor division would yield too few rows when num_samples is not a
        # multiple of batch_size, and can fail to terminate when
        # num_samples < batch_size (idx + 1 never equals 0).
        max_iters = -(-num_samples // self.batch_size)

        results = []
        for idx, o in enumerate(self.simple_dataset_predictor.get_result()):
            results.append(o[0])
            if idx + 1 == max_iters:
                break

        results = np.concatenate(results, axis=0)

        ptr = 0
        features = {}
        for col_id, col_info in enumerate(self.metadata['details']):
            if col_info['type'] == 'category':
                features['f%02d' % col_id] = results[:, ptr:ptr + 1]
                ptr += 1

            elif col_info['type'] == 'value':
                gaussian_components = col_info['n']
                val = results[:, ptr:ptr + 1]
                ptr += 1
                pro = results[:, ptr:ptr + gaussian_components]
                ptr += gaussian_components
                features['f%02d' % col_id] = np.concatenate([val, pro], axis=1)

            else:
                raise ValueError(
                    "self.metadata['details'][{}]['type'] must be either `category` or "
                    "`value`. Instead it was {}.".format(
                        col_id, col_info['type']))

        return self.preprocessor.reverse_transform(
            features)[:num_samples].copy()
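
    # Illustration of the column layout unpacked above, assuming a
    # hypothetical metadata with one categorical column and one continuous
    # column modeled by a 5-component GMM:
    #
    #   metadata['details'] = [
    #       {'type': 'category'},       # consumes 1 column
    #       {'type': 'value', 'n': 5},  # consumes 1 + 5 columns:
    #   ]                               #   scalar value, then GMM probabilities
    #
    # Each generated row is then 7 columns wide, walked left to right by
    # the `ptr` cursor.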

    def tar_folder(self, tar_name):
        """Generate a gzipped tar of :attr:`self.output`."""
        with tarfile.open(tar_name, 'w:gz') as tar_handle:
            for root, dirs, files in os.walk(self.output):
                for file_ in files:
                    tar_handle.add(os.path.join(root, file_))

    @classmethod
    def load(cls, path):
        """Load a pretrained model from a given path."""
        with tarfile.open(path, 'r:gz') as tar_handle:
            destination_dir = os.path.dirname(tar_handle.getmembers()[0].name)
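            # Note: extractall trusts the archive contents; only load model
            # tars from trusted sources.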
            tar_handle.extractall()

        with open('{}/TGANModel'.format(destination_dir), 'rb') as f:
            instance = pickle.load(f)

        instance.prepare_sampling()
        return instance

    def save(self, path, force=False):
        """Save the fitted model in the given path."""
        if os.path.exists(path) and not force:
            logger.info(
                'The indicated path already exists. Use `force=True` to overwrite.'
            )
            return

        base_path = os.path.dirname(path)
        if base_path and not os.path.exists(base_path):
            os.makedirs(base_path)

        model = self.model
        dataset_predictor = self.simple_dataset_predictor

        self.model = None
        self.simple_dataset_predictor = None

        with open('{}/TGANModel'.format(self.output), 'wb') as f:
            pickle.dump(self, f)

        self.model = model
        self.simple_dataset_predictor = dataset_predictor

        self.tar_folder(path)

        logger.info('Model saved successfully.')
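
A minimal end-to-end usage sketch of the class above; the input file name and the choice of continuous column indices are hypothetical:

import pandas as pd

df = pd.read_csv('data.csv')                 # hypothetical input data
model = TGANModel(continuous_columns=[0, 2])
model.fit(df)                                # train and prepare sampling
samples = model.sample(1000)                 # pandas.DataFrame of synthetic rows
model.save('trained/model.tar.gz')           # tars the whole output folder

# Later, restore the fitted model and keep sampling:
model = TGANModel.load('trained/model.tar.gz')
more_samples = model.sample(500)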

Example #4

class TGANModel:
    """Main model from TGAN.

    Args:
        continuous_columns (list[int]): 0-indexed list of the column indices to be considered
            continuous.
        output (str, optional): Path to store the model and its artifacts. Defaults to
            :attr:`output`.
        gpus (list, optional): List of GPU identifiers to use. Defaults to :attr:`None`, in
            which case the locally available GPUs are autodetected.
        max_epoch (int, optional): Number of epochs to use during training. Defaults to :attr:`5`.
        steps_per_epoch (int, optional): Number of steps to run on each epoch. Defaults to
            :attr:`10000`.
        save_checkpoints (bool, optional): Whether or not to store checkpoints of the model after
            each training epoch. Defaults to :attr:`True`.
        restore_session (bool, optional): Whether or not to continue training from the last
            checkpoint. Defaults to :attr:`True`.
        batch_size (int, optional): Size of the batch to feed the model at each step. Defaults to
            :attr:`200`.
        z_dim (int, optional): Number of dimensions in the noise input for the generator.
            Defaults to :attr:`200`.
        noise (float, optional): Upper bound to the gaussian noise added to categorical columns.
            Defaults to :attr:`0.2`.
        l2norm (float, optional): L2 regularization coefficient when computing losses. Defaults
            to :attr:`0.00001`.
        learning_rate (float, optional): Learning rate for the optimizer. Defaults to
            :attr:`0.001`.
        num_gen_rnn (int, optional): Number of units in the generator RNN cells. Defaults to
            :attr:`100`.
        num_gen_feature (int, optional): Number of features in the generator. Defaults to
            :attr:`100`.
        num_dis_layers (int, optional): Number of layers in the discriminator. Defaults to
            :attr:`1`.
        num_dis_hidden (int, optional): Number of hidden units in each discriminator layer.
            Defaults to :attr:`100`.
        optimizer (str, optional): Name of the optimizer to use during `fit`; possible values are
            [`GradientDescentOptimizer`, `AdamOptimizer`, `AdadeltaOptimizer`]. Defaults to
            :attr:`AdamOptimizer`.
    """
    def __init__(
        self,
        continuous_columns,
        output="output",
        gpus=None,
        max_epoch=5,
        steps_per_epoch=10000,
        save_checkpoints=True,
        restore_session=True,
        batch_size=200,
        z_dim=200,
        noise=0.2,
        l2norm=0.00001,
        learning_rate=0.001,
        num_gen_rnn=100,
        num_gen_feature=100,
        num_dis_layers=1,
        num_dis_hidden=100,
        optimizer="AdamOptimizer",
    ):
        """Initialize object."""
        # Output
        self.continuous_columns = continuous_columns
        self.log_dir = os.path.join(output, "logs")
        self.model_dir = os.path.join(output, "model")
        self.output = output

        # Training params
        self.max_epoch = max_epoch
        self.steps_per_epoch = steps_per_epoch
        self.save_checkpoints = save_checkpoints
        self.restore_session = restore_session

        # Model params
        self.model = None
        self.batch_size = batch_size
        self.z_dim = z_dim
        self.noise = noise
        self.l2norm = l2norm
        self.learning_rate = learning_rate
        self.num_gen_rnn = num_gen_rnn
        self.num_gen_feature = num_gen_feature
        self.num_dis_layers = num_dis_layers
        self.num_dis_hidden = num_dis_hidden
        self.optimizer = optimizer
        self.gpus = gpus or self.get_gpus()
        if self.gpus:
            os.environ["CUDA_VISIBLE_DEVICES"] = ",".join(
                list(map(str, self.gpus)))

    def get_gpus(self):
        """Return identifiers of the locally available GPUs."""
        # Note: `locality.bus_id` is a PCI bus identifier and does not
        # necessarily match the CUDA device index that CUDA_VISIBLE_DEVICES
        # expects.
        return [
            x.locality.bus_id for x in device_lib.list_local_devices()
            if x.device_type == "GPU"
        ]

    def get_model(self, training=True):
        """Return a new instance of the model."""
        return GraphBuilder(metadata=self.metadata,
                            batch_size=self.batch_size,
                            z_dim=self.z_dim,
                            noise=self.noise,
                            l2norm=self.l2norm,
                            learning_rate=self.learning_rate,
                            num_gen_rnn=self.num_gen_rnn,
                            num_gen_feature=self.num_gen_feature,
                            num_dis_layers=self.num_dis_layers,
                            num_dis_hidden=self.num_dis_hidden,
                            optimizer=self.optimizer,
                            training=training)

    def prepare_sampling(self):
        """Prepare model for generate samples."""
        if self.model is None:
            self.model = self.get_model(training=False)

        else:
            self.model.training = False

        predict_config = PredictConfig(
            session_init=SaverRestore(self.restore_path),
            model=self.model,
            input_names=["z"],
            output_names=["gen/gen", "z"],
        )

        self.simple_dataset_predictor = SimpleDatasetPredictor(
            predict_config, RandomZData((self.batch_size, self.z_dim)))

    def get_trainer(self, model, input_queue):
        """Return a trainer suited to the number of available GPUs."""
        if len(self.gpus) > 1:
            # Presumably the multi-GPU trainer also needs the model and the
            # input queue; the exact signature depends on the
            # MultiGPUGANTrainer implementation in use.
            return MultiGPUGANTrainer(self.gpus)

        return GANTrainer(
            model=model,
            input_queue=input_queue,
        )

    def fit(self, data):
        """Fit the model to the given data.

        Args:
            data (pandas.DataFrame): Dataset to fit the model to.

        Returns:
            None

        """
        self.preprocessor = Preprocessor(
            continuous_columns=self.continuous_columns)
        data = self.preprocessor.fit_transform(data)
        self.metadata = self.preprocessor.metadata
        dataflow = TGANDataFlow(data, self.metadata)
        batch_data = BatchData(dataflow, self.batch_size)
        input_queue = QueueInput(batch_data)

        self.model = self.get_model(training=True)

        trainer = self.get_trainer(
            model=self.model,
            input_queue=input_queue,
        )

        self.restore_path = os.path.join(self.model_dir, "checkpoint")

        if os.path.isfile(self.restore_path) and self.restore_session:
            session_init = SaverRestore(self.restore_path)
            with open(os.path.join(self.log_dir, "stats.json")) as f:
                starting_epoch = json.load(f)[-1]["epoch_num"] + 1

        else:
            session_init = None
            starting_epoch = 1

        action = "k" if self.restore_session else None
        logger.set_logger_dir(self.log_dir, action=action)

        callbacks = []
        if self.save_checkpoints:
            callbacks.append(ModelSaver(checkpoint_dir=self.model_dir))

        trainer.train_with_defaults(callbacks=callbacks,
                                    steps_per_epoch=self.steps_per_epoch,
                                    max_epoch=self.max_epoch,
                                    session_init=session_init,
                                    starting_epoch=starting_epoch)

        self.prepare_sampling()

    def sample(self, num_samples):
        """Generate samples from the model.

        Args:
            num_samples (int): Number of rows to sample.

        Returns:
            pandas.DataFrame: Generated dataset with `num_samples` rows.

        Raises:
            ValueError: If an unexpected column type is found in the metadata.

        """
        # Ceiling division: enough batches to cover num_samples rows. Plain
        # floor division would yield too few rows when num_samples is not a
        # multiple of batch_size, and can fail to terminate when
        # num_samples < batch_size (idx + 1 never equals 0).
        max_iters = -(-num_samples // self.batch_size)

        results = []
        for idx, o in enumerate(self.simple_dataset_predictor.get_result()):
            results.append(o[0])
            if idx + 1 == max_iters:
                break

        results = np.concatenate(results, axis=0)

        ptr = 0
        features = {}
        for col_id, col_info in enumerate(self.metadata["details"]):
            if col_info["type"] == "category":
                features["f%02d" % col_id] = results[:, ptr:ptr + 1]
                ptr += 1

            elif col_info["type"] == "value":
                gaussian_components = col_info["n"]
                val = results[:, ptr:ptr + 1]
                ptr += 1
                pro = results[:, ptr:ptr + gaussian_components]
                ptr += gaussian_components
                features["f%02d" % col_id] = np.concatenate([val, pro], axis=1)

            else:
                raise ValueError(
                    "self.metadata['details'][{}]['type'] must be either `category` or "
                    "`value`. Instead it was {}.".format(
                        col_id, col_info["type"]))

        return self.preprocessor.reverse_transform(
            features)[:num_samples].copy()

    def tar_folder(self, tar_name):
        """Generate a gzipped tar of :attr:`self.output`."""
        with tarfile.open(tar_name, "w:gz") as tar_handle:
            for root, dirs, files in os.walk(self.output):
                for file_ in files:
                    tar_handle.add(os.path.join(root, file_))

    @classmethod
    def load(cls, path):
        """Load a pretrained model from a given path."""
        with tarfile.open(path, "r:gz") as tar_handle:
            destination_dir = os.path.dirname(tar_handle.getmembers()[0].name)
            tar_handle.extractall()

        with open("{}/TGANModel".format(destination_dir), "rb") as f:
            instance = pickle.load(f)

        instance.prepare_sampling()
        return instance

    def save(self, path, force=False):
        """Save the fitted model in the given path."""
        if os.path.exists(path) and not force:
            logger.info(
                "The indicated path already exists. Use `force=True` to overwrite."
            )
            return

        base_path = os.path.dirname(path)
        if base_path and not os.path.exists(base_path):
            os.makedirs(base_path)

        model = self.model
        dataset_predictor = self.simple_dataset_predictor

        self.model = None
        self.simple_dataset_predictor = None

        with open("{}/TGANModel".format(self.output), "wb") as f:
            pickle.dump(self, f)

        self.model = model
        self.simple_dataset_predictor = dataset_predictor

        self.tar_folder(path)

        logger.info("Model saved successfully.")