Example #1
    def _get_prior_parameters_observations(self):
        self._has_been_used = True

        parameters = np.load(
            os.path.join(utils.get_data_root(), "lotka-volterra",
                         "prior-parameters.npy"))

        observations = np.load(
            os.path.join(utils.get_data_root(), "lotka-volterra",
                         "prior-observations.npy"))

        ix = np.random.permutation(range(parameters.shape[0]))

        return parameters[ix], observations[ix]
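
Every snippet on this page resolves its files relative to utils.get_data_root(), and assumes imports such as os, numpy as np, torch and pickle from the surrounding module. The utils module is project-specific and not shown here; a minimal sketch of what its get_data_root() might look like (an assumption for illustration, not the actual code of any of these projects) is:

import os

def get_data_root():
    # Let an environment variable override the location of the shared data
    # directory, falling back to a "data" folder in the current working tree.
    return os.environ.get("DATA_ROOT", os.path.join(os.getcwd(), "data"))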
Example #2
    def _create_data(self):
        root = utils.get_data_root()
        path = os.path.join(root, 'faces', self.name + '.jpg')
        try:
            image = io.imread(path)
        except FileNotFoundError:
            raise RuntimeError('Unknown face name: {}'.format(self.name))
        image = color.rgb2gray(image)
        self.image = transform.resize(image, [512, 512])

        grid = np.array([(x, y) for x in range(self.image.shape[0])
                         for y in range(self.image.shape[1])])

        rotation_matrix = np.array([[0, -1], [1, 0]])
        p = self.image.reshape(-1) / sum(self.image.reshape(-1))
        ix = np.random.choice(range(len(grid)),
                              size=self.num_points,
                              replace=True,
                              p=p)
        points = grid[ix].astype(np.float32)
        points += np.random.rand(self.num_points, 2)  # dequantize
        points /= (self.image.shape[0])  # scale to [0, 1]
        # assert 0 <= min(points) <= max(points) <= 1

        self.data = torch.tensor(points @ rotation_matrix).float()  # (row, col) -> (col, -row)
        self.data[:, 1] += 1  # shift the second coordinate from [-1, 0] back into [0, 1]
Example #3
    def get_ground_truth_posterior_samples(self, num_samples=None):
        """
        We have pre-generated posterior samples using MCMC on the product of the analytic
        likelihood and a uniform prior on [-3, 3]^5.
        Thus they are ground truth as long as MCMC has behaved well.
        We load these once if samples have not been loaded before, store them for future use,
        and return as many as are requested.

        :param num_samples: int
            Number of samples to return.
        :return: torch.Tensor [num_samples, parameter_dim]
            Batch of posterior samples.
        """
        if self._posterior_samples is None:
            self._posterior_samples = torch.Tensor(
                np.load(
                    os.path.join(
                        utils.get_data_root(),
                        "nonlinear-gaussian",
                        "true-posterior-samples.npy",
                    )))
        if num_samples is not None:
            return self._posterior_samples[:num_samples]
        else:
            return self._posterior_samples
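
A hypothetical call site for the method above (the task object here is a stand-in; only the method itself appears in the snippet):

# Hypothetical usage; `task` stands for whichever simulator class defines the
# method above. The parameter dimension of 5 follows from the [-3, 3]^5 prior.
samples = task.get_ground_truth_posterior_samples(num_samples=1000)
assert samples.shape[0] == 1000 and samples.shape[1] == 5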
Example #4
    def __init__(self, split='train', frac=None):
        path = os.path.join(utils.get_data_root(), 'miniboone',
                            '{}.npy'.format(split))
        self.data = np.load(path).astype(np.float32)
        self.n, self.dim = self.data.shape
        if frac is not None:
            self.n = int(frac * self.n)
Example #5
def get_data_path(data_root=None):
    if data_root is None:
        data_root = get_data_root()

    data_path = data_root + 'Train_Sets/'  # plain concatenation; assumes data_root ends with a path separator

    return data_path
Example #6
def save_splits():
    train, val, test = load_miniboone()
    splits = (('train', train), ('val', val), ('test', test))
    for split in splits:
        name, data = split
        file = os.path.join(utils.get_data_root(), 'miniboone',
                            '{}.npy'.format(name))
        np.save(file, data)
Example #7
File: mg1.py Project: yyht/lfi
    def __init__(self):

        n_percentiles = 5
        self.perc = np.linspace(0.0, 100.0, n_percentiles)

        path = os.path.join(utils.get_data_root(), "mg1",
                            "pilot_run_results.pkl")
        with open(path, "rb") as file:
            self.whiten_params = pickle.load(file, encoding="bytes")
Example #8
    def _get_prior_parameters_observations(self):
        """
        The Lotka-Volterra simulator is expensive, so we load prior simulations from disk.
        :return: np.array, np.array
        """
        self._has_been_used = True

        file = ("prior-parameters.npy" if not self._gaussian_prior else
                "prior-parameters-gaussian.npy")
        parameters = np.load(
            os.path.join(utils.get_data_root(), "lotka-volterra", file))

        file = ("prior-observations.npy" if not self._gaussian_prior else
                "prior-observations-gaussian.npy")
        observations = np.load(
            os.path.join(utils.get_data_root(), "lotka-volterra", file))

        ix = np.random.permutation(range(parameters.shape[0]))

        return parameters[ix], observations[ix]
Example #9
    def __init__(self, split='train', transform=None):
        self.transform = transform
        path = os.path.join(utils.get_data_root(), 'omniglot', 'omniglot.mat')
        rawdata = loadmat(path)

        if split == 'train':
            self.data = rawdata['data'].T.reshape(-1, 28, 28)
            self.targets = rawdata['target'].T
        elif split == 'test':
            self.data = rawdata['testdata'].T.reshape(-1, 28, 28)
            self.targets = rawdata['testtarget'].T
        else:
            raise ValueError
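
Example #18 further down this page wraps this dataset in a PyTorch DataLoader; a condensed version of that usage (the binarising transform is copied from Example #18, the batch size is illustrative):

import torch
from torch.utils import data
from torchvision import transforms as tvtransforms

# Binarise each 28x28 image, as done in Example #18.
data_transform = tvtransforms.Compose(
    [tvtransforms.ToTensor(), tvtransforms.Lambda(torch.bernoulli)])
train_dataset = OmniglotDataset(split='train', transform=data_transform)
train_loader = data.DataLoader(train_dataset, batch_size=128, shuffle=True)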
Example #10
def load_miniboone():
    def load_data(path):
        # NOTE: To remember how the pre-processing was done; a modernised sketch follows this example.
        # data_ = pd.read_csv(root_path, names=[str(x) for x in range(50)], delim_whitespace=True)
        # print data_.head()
        # data_ = data_.as_matrix()
        # # Remove some random outliers
        # indices = (data_[:, 0] < -100)
        # data_ = data_[~indices]
        #
        # i = 0
        # # Remove any features that have too many re-occurring real values.
        # features_to_remove = []
        # for feature in data_.T:
        #     c = Counter(feature)
        #     max_count = np.array([v for k, v in sorted(c.iteritems())])[0]
        #     if max_count > 5:
        #         features_to_remove.append(i)
        #     i += 1
        # data_ = data_[:, np.array([i for i in range(data_.shape[1]) if i not in features_to_remove])]
        # np.save("~/data_/miniboone/data_.npy", data_)

        data = np.load(path)
        N_test = int(0.1 * data.shape[0])
        data_test = data[-N_test:]
        data = data[0:-N_test]
        N_validate = int(0.1 * data.shape[0])
        data_validate = data[-N_validate:]
        data_train = data[0:-N_validate]

        return data_train, data_validate, data_test

    def load_data_normalised(path):
        data_train, data_validate, data_test = load_data(path)
        data = np.vstack((data_train, data_validate))
        mu = data.mean(axis=0)
        s = data.std(axis=0)
        data_train = (data_train - mu) / s
        data_validate = (data_validate - mu) / s
        data_test = (data_test - mu) / s

        return data_train, data_validate, data_test

    return load_data_normalised(
        path=os.path.join(utils.get_data_root(), 'miniboone', 'data.npy'))
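
The commented-out block inside load_data is a Python 2-era record of how the raw MiniBooNE file was pre-processed into data.npy. A rough modern equivalent of that recipe (file paths hypothetical; the feature filter follows the stated intent of dropping columns whose most frequent value repeats more than 5 times):

import numpy as np
import pandas as pd
from collections import Counter

def preprocess_miniboone(raw_path, out_path):
    # Raw file is whitespace-delimited with 50 columns.
    data = pd.read_csv(raw_path, names=[str(x) for x in range(50)],
                       sep=r'\s+').to_numpy()
    # Remove the outlier rows flagged by the first feature.
    data = data[data[:, 0] >= -100]
    # Drop features whose most common value re-occurs more than 5 times.
    keep = [i for i, feature in enumerate(data.T)
            if max(Counter(feature).values()) <= 5]
    np.save(out_path, data[:, keep])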
Example #11
def get_output_folder(data_root=None, modality=None):
    if data_root is None:
        data_root = get_data_root()

    output_folder_structure = get_output_folder_structure()

    if modality is None:
        folder = ''
    elif modality == get_ground_truth_keyword():
        folder = output_folder_structure['training']['ground_truth']
    else:
        folder = output_folder_structure['training']['images']

    output_folder = data_root + get_output_save_folder() + folder

    Path(output_folder).mkdir(parents=True, exist_ok=True)

    return output_folder
Example #12
def load_gas():
    def load_data(file):
        data = pd.read_pickle(file)
        data.drop("Meth", axis=1, inplace=True)
        data.drop("Eth", axis=1, inplace=True)
        data.drop("Time", axis=1, inplace=True)
        return data

    def get_correlation_numbers(data):
        # For each column, count how many columns (including itself) it
        # correlates with above 0.98.
        C = data.corr()
        A = C > 0.98
        B = A.sum(axis=1)
        return B

    def load_data_and_clean(file):
        data = load_data(file)
        B = get_correlation_numbers(data)

        while np.any(B > 1):
            col_to_remove = np.where(B > 1)[0][0]
            col_name = data.columns[col_to_remove]
            data.drop(col_name, axis=1, inplace=True)
            B = get_correlation_numbers(data)
        data = (data - data.mean()) / data.std()

        return data.values

    def load_data_and_clean_and_split(file):
        data = load_data_and_clean(file)
        N_test = int(0.1 * data.shape[0])
        data_test = data[-N_test:]
        data_train = data[0:-N_test]
        N_validate = int(0.1 * data_train.shape[0])
        data_validate = data_train[-N_validate:]
        data_train = data_train[0:-N_validate]

        return data_train, data_validate, data_test

    return load_data_and_clean_and_split(
        file=os.path.join(utils.get_data_root(), 'gas', 'ethylene_CO.pickle'))
Example #13
    def __init__(self):

        path = os.path.join(utils.get_data_root(), "lotka-volterra",
                            "pilot_run_results.pkl")
        with open(path, "rb") as file:
            self.means, self.stds = pickle.load(file, encoding="bytes")
Example #14
def get_ground_truth_observation():
    path = os.path.join(utils.get_data_root(), "lotka-volterra",
                        "obs_stats.pkl")
    with open(path, "rb") as file:
        true_observation = pickle.load(file, encoding="bytes")
    return np.array(true_observation)
Example #15
File: mg1.py Project: yyht/lfi
    def get_ground_truth_observation(self):
        path = os.path.join(utils.get_data_root(), "mg1", "observed_data.pkl")
        with open(path, "rb") as file:
            _, true_observation = pickle.load(file, encoding="bytes")
        return torch.Tensor(true_observation)
Example #16
    def load_data():
        file = os.path.join(utils.get_data_root(), 'power', 'data.npy')
        return np.load(file)
Example #17
import os

import torch
import sacred
from sacred import Experiment, observers

from torchvision.utils import make_grid, save_image

from nde import distributions, transforms, flows
import utils
import optim
import nn as nn_
import autils  # project-local naming-observer helpers; exact import path assumed

import matplotlib
matplotlib.use('Agg')

import matplotlib.pyplot as plt

# Capture job id on the cluster
sacred.SETTINGS.HOST_INFO.CAPTURED_ENV.append('SLURM_JOB_ID')

runs_dir = os.path.join(utils.get_data_root(), 'runs/images')
ex = Experiment('decomposition-flows-images')

fso = observers.FileStorageObserver.create(runs_dir, priority=1)
# I don't like how sacred names run folders.
ex.observers.extend([fso, autils.NamingObserver(runs_dir, priority=2)])

# For num_workers > 0 and tensor datasets, bad things happen otherwise.
torch.multiprocessing.set_start_method("spawn", force=True)


# noinspection PyUnusedLocal
@ex.config
def config():
    # Dataset
    dataset = 'fashion-mnist'
Example #18
def run(seed):

    assert torch.cuda.is_available()
    device = torch.device('cuda')
    torch.set_default_tensor_type('torch.cuda.FloatTensor')

    np.random.seed(seed)
    torch.manual_seed(seed)

    # Create training data.
    data_transform = tvtransforms.Compose(
        [tvtransforms.ToTensor(),
         tvtransforms.Lambda(torch.bernoulli)])

    if args.dataset_name == 'mnist':
        dataset = datasets.MNIST(root=os.path.join(utils.get_data_root(),
                                                   'mnist'),
                                 train=True,
                                 download=True,
                                 transform=data_transform)
        test_dataset = datasets.MNIST(root=os.path.join(
            utils.get_data_root(), 'mnist'),
                                      train=False,
                                      download=True,
                                      transform=data_transform)
    elif args.dataset_name == 'fashion-mnist':
        dataset = datasets.FashionMNIST(root=os.path.join(
            utils.get_data_root(), 'fashion-mnist'),
                                        train=True,
                                        download=True,
                                        transform=data_transform)
        test_dataset = datasets.FashionMNIST(root=os.path.join(
            utils.get_data_root(), 'fashion-mnist'),
                                             train=False,
                                             download=True,
                                             transform=data_transform)
    elif args.dataset_name == 'omniglot':
        dataset = data_.OmniglotDataset(split='train',
                                        transform=data_transform)
        test_dataset = data_.OmniglotDataset(split='test',
                                             transform=data_transform)
    elif args.dataset_name == 'emnist':
        rotate = partial(tvF.rotate, angle=-90)
        hflip = tvF.hflip
        data_transform = tvtransforms.Compose([
            tvtransforms.Lambda(rotate),
            tvtransforms.Lambda(hflip),
            tvtransforms.ToTensor(),
            tvtransforms.Lambda(torch.bernoulli)
        ])
        dataset = datasets.EMNIST(root=os.path.join(utils.get_data_root(),
                                                    'emnist'),
                                  split='letters',
                                  train=True,
                                  transform=data_transform,
                                  download=True)
        test_dataset = datasets.EMNIST(root=os.path.join(
            utils.get_data_root(), 'emnist'),
                                       split='letters',
                                       train=False,
                                       transform=data_transform,
                                       download=True)
    else:
        raise ValueError

    if args.dataset_name == 'omniglot':
        split = -1345
    elif args.dataset_name == 'emnist':
        split = -20000
    else:
        split = -10000
    indices = np.arange(len(dataset))
    np.random.shuffle(indices)
    train_indices, val_indices = indices[:split], indices[split:]
    train_sampler = SubsetRandomSampler(train_indices)
    val_sampler = SubsetRandomSampler(val_indices)
    train_loader = data.DataLoader(
        dataset=dataset,
        batch_size=args.batch_size,
        sampler=train_sampler,
        num_workers=4 if args.dataset_name == 'emnist' else 0)
    train_generator = data_.batch_generator(train_loader)
    val_loader = data.DataLoader(dataset=dataset,
                                 batch_size=1024,
                                 sampler=val_sampler,
                                 shuffle=False,
                                 drop_last=False)
    val_batch = next(iter(val_loader))[0]
    test_loader = data.DataLoader(
        test_dataset,
        batch_size=16,
        shuffle=False,
        drop_last=False,
    )

    # from matplotlib import pyplot as plt
    # from experiments import cutils
    # from torchvision.utils import make_grid
    # fig, ax = plt.subplots(1, 1, figsize=(5, 5))
    # cutils.gridimshow(make_grid(val_batch[:64], nrow=8), ax)
    # plt.show()
    # quit()

    def create_linear_transform():
        if args.linear_type == 'lu':
            return transforms.CompositeTransform([
                transforms.RandomPermutation(args.latent_features),
                transforms.LULinear(args.latent_features, identity_init=True)
            ])
        elif args.linear_type == 'svd':
            return transforms.SVDLinear(args.latent_features,
                                        num_householder=4,
                                        identity_init=True)
        elif args.linear_type == 'perm':
            return transforms.RandomPermutation(args.latent_features)
        else:
            raise ValueError

    def create_base_transform(i, context_features=None):
        if args.prior_type == 'affine-coupling':
            return transforms.AffineCouplingTransform(
                mask=utils.create_alternating_binary_mask(
                    features=args.latent_features, even=(i % 2 == 0)),
                transform_net_create_fn=lambda in_features, out_features: nn_.
                ResidualNet(in_features=in_features,
                            out_features=out_features,
                            hidden_features=args.hidden_features,
                            context_features=context_features,
                            num_blocks=args.num_transform_blocks,
                            activation=F.relu,
                            dropout_probability=args.dropout_probability,
                            use_batch_norm=args.use_batch_norm))
        elif args.prior_type == 'rq-coupling':
            return transforms.PiecewiseRationalQuadraticCouplingTransform(
                mask=utils.create_alternating_binary_mask(
                    features=args.latent_features, even=(i % 2 == 0)),
                transform_net_create_fn=lambda in_features, out_features: nn_.
                ResidualNet(in_features=in_features,
                            out_features=out_features,
                            hidden_features=args.hidden_features,
                            context_features=context_features,
                            num_blocks=args.num_transform_blocks,
                            activation=F.relu,
                            dropout_probability=args.dropout_probability,
                            use_batch_norm=args.use_batch_norm),
                num_bins=args.num_bins,
                tails='linear',
                tail_bound=args.tail_bound,
                apply_unconditional_transform=args.
                apply_unconditional_transform,
            )
        elif args.prior_type == 'affine-autoregressive':
            return transforms.MaskedAffineAutoregressiveTransform(
                features=args.latent_features,
                hidden_features=args.hidden_features,
                context_features=context_features,
                num_blocks=args.num_transform_blocks,
                use_residual_blocks=True,
                random_mask=False,
                activation=F.relu,
                dropout_probability=args.dropout_probability,
                use_batch_norm=args.use_batch_norm)
        elif args.prior_type == 'rq-autoregressive':
            return transforms.MaskedPiecewiseRationalQuadraticAutoregressiveTransform(
                features=args.latent_features,
                hidden_features=args.hidden_features,
                context_features=context_features,
                num_bins=args.num_bins,
                tails='linear',
                tail_bound=args.tail_bound,
                num_blocks=args.num_transform_blocks,
                use_residual_blocks=True,
                random_mask=False,
                activation=F.relu,
                dropout_probability=args.dropout_probability,
                use_batch_norm=args.use_batch_norm)
        else:
            raise ValueError

    # ---------------
    # prior
    # ---------------
    def create_prior():
        if args.prior_type == 'standard-normal':
            prior = distributions_.StandardNormal((args.latent_features, ))

        else:
            distribution = distributions_.StandardNormal(
                (args.latent_features, ))
            transform = transforms.CompositeTransform([
                transforms.CompositeTransform(
                    [create_linear_transform(),
                     create_base_transform(i)])
                for i in range(args.num_flow_steps)
            ])
            transform = transforms.CompositeTransform(
                [transform, create_linear_transform()])
            prior = flows.Flow(transform, distribution)

        return prior

    # ---------------
    # inputs encoder
    # ---------------
    def create_inputs_encoder():
        if args.approximate_posterior_type == 'diagonal-normal':
            inputs_encoder = None
        else:
            inputs_encoder = nn_.ConvEncoder(
                context_features=args.context_features,
                channels_multiplier=16,
                dropout_probability=args.dropout_probability_encoder_decoder)
        return inputs_encoder

    # ---------------
    # approximate posterior
    # ---------------
    def create_approximate_posterior():
        if args.approximate_posterior_type == 'diagonal-normal':
            context_encoder = nn_.ConvEncoder(
                context_features=args.context_features,
                channels_multiplier=16,
                dropout_probability=args.dropout_probability_encoder_decoder)
            approximate_posterior = distributions_.ConditionalDiagonalNormal(
                shape=[args.latent_features], context_encoder=context_encoder)

        else:
            context_encoder = nn.Linear(args.context_features,
                                        2 * args.latent_features)
            distribution = distributions_.ConditionalDiagonalNormal(
                shape=[args.latent_features], context_encoder=context_encoder)

            transform = transforms.CompositeTransform([
                transforms.CompositeTransform([
                    create_linear_transform(),
                    create_base_transform(
                        i, context_features=args.context_features)
                ]) for i in range(args.num_flow_steps)
            ])
            transform = transforms.CompositeTransform(
                [transform, create_linear_transform()])
            approximate_posterior = flows.Flow(
                transforms.InverseTransform(transform), distribution)

        return approximate_posterior

    # ---------------
    # likelihood
    # ---------------
    def create_likelihood():
        latent_decoder = nn_.ConvDecoder(
            latent_features=args.latent_features,
            channels_multiplier=16,
            dropout_probability=args.dropout_probability_encoder_decoder)

        likelihood = distributions_.ConditionalIndependentBernoulli(
            shape=[1, 28, 28], context_encoder=latent_decoder)

        return likelihood

    prior = create_prior()
    approximate_posterior = create_approximate_posterior()
    likelihood = create_likelihood()
    inputs_encoder = create_inputs_encoder()

    model = vae.VariationalAutoencoder(
        prior=prior,
        approximate_posterior=approximate_posterior,
        likelihood=likelihood,
        inputs_encoder=inputs_encoder)

    # with torch.no_grad():
    #     # elbo = model.stochastic_elbo(val_batch[:16].to(device)).mean()
    #     # print(elbo)
    #     elbo = model.stochastic_elbo(val_batch[:16].to(device), num_samples=100).mean()
    #     print(elbo)
    #     log_prob = model.log_prob_lower_bound(val_batch[:16].to(device), num_samples=1200).mean()
    #     print(log_prob)
    # quit()

    n_params = utils.get_num_parameters(model)
    print('There are {} trainable parameters in this model.'.format(n_params))

    optimizer = optim.Adam(model.parameters(), lr=args.learning_rate)
    scheduler = optim.lr_scheduler.CosineAnnealingLR(
        optimizer=optimizer, T_max=args.num_training_steps, eta_min=0)

    def get_kl_multiplier(step):
        if args.kl_multiplier_schedule == 'constant':
            return args.kl_multiplier_initial
        elif args.kl_multiplier_schedule == 'linear':
            multiplier = min(
                step / (args.num_training_steps * args.kl_warmup_fraction), 1.)
            return args.kl_multiplier_initial * (1. + multiplier)
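    # Note on the 'linear' schedule above: the KL weight starts at
    # kl_multiplier_initial and ramps up to 2 * kl_multiplier_initial once
    # `step` reaches num_training_steps * kl_warmup_fraction, then stays there.
    # With hypothetical values kl_multiplier_initial=0.5,
    # num_training_steps=100000 and kl_warmup_fraction=0.1, step 5000 gives
    # 0.5 * (1 + 0.5) = 0.75.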

    # create summary writer and write to log directory
    timestamp = cutils.get_timestamp()
    if cutils.on_cluster():
        timestamp += '||{}'.format(os.environ['SLURM_JOB_ID'])
    log_dir = os.path.join(cutils.get_log_root(), args.dataset_name, timestamp)
    while True:
        try:
            writer = SummaryWriter(log_dir=log_dir, max_queue=20)
            break
        except FileExistsError:
            sleep(5)
    filename = os.path.join(log_dir, 'config.json')
    with open(filename, 'w') as file:
        json.dump(vars(args), file)

    best_val_elbo = -np.inf
    tbar = tqdm(range(args.num_training_steps))
    for step in tbar:
        model.train()
        optimizer.zero_grad()
        scheduler.step(step)

        batch = next(train_generator)[0].to(device)
        elbo = model.stochastic_elbo(batch,
                                     kl_multiplier=get_kl_multiplier(step))
        loss = -torch.mean(elbo)
        loss.backward()
        optimizer.step()

        if (step + 1) % args.monitor_interval == 0:
            model.eval()
            with torch.no_grad():
                elbo = model.stochastic_elbo(val_batch.to(device))
                mean_val_elbo = elbo.mean()

            if mean_val_elbo > best_val_elbo:
                best_val_elbo = mean_val_elbo
                path = os.path.join(
                    cutils.get_checkpoint_root(),
                    '{}-best-val-{}.t'.format(args.dataset_name, timestamp))
                torch.save(model.state_dict(), path)

            writer.add_scalar(tag='val-elbo',
                              scalar_value=mean_val_elbo,
                              global_step=step)

            writer.add_scalar(tag='best-val-elbo',
                              scalar_value=best_val_elbo,
                              global_step=step)

            with torch.no_grad():
                samples = model.sample(64)
            fig, ax = plt.subplots(figsize=(10, 10))
            cutils.gridimshow(make_grid(samples.view(64, 1, 28, 28), nrow=8),
                              ax)
            writer.add_figure(tag='vae-samples', figure=fig, global_step=step)
            plt.close()

    # load best val model
    path = os.path.join(
        cutils.get_checkpoint_root(),
        '{}-best-val-{}.t'.format(args.dataset_name, timestamp))
    model.load_state_dict(torch.load(path))
    model.eval()

    np.random.seed(5)
    torch.manual_seed(5)

    # compute elbo on test set
    with torch.no_grad():
        elbo = torch.Tensor([])
        log_prob_lower_bound = torch.Tensor([])
        for batch in tqdm(test_loader):
            elbo_ = model.stochastic_elbo(batch[0].to(device))
            elbo = torch.cat([elbo, elbo_])
            log_prob_lower_bound_ = model.log_prob_lower_bound(
                batch[0].to(device), num_samples=1000)
            log_prob_lower_bound = torch.cat(
                [log_prob_lower_bound, log_prob_lower_bound_])
    path = os.path.join(
        log_dir, '{}-prior-{}-posterior-{}-elbo.npy'.format(
            args.dataset_name, args.prior_type,
            args.approximate_posterior_type))
    np.save(path, utils.tensor2numpy(elbo))
    path = os.path.join(
        log_dir, '{}-prior-{}-posterior-{}-log-prob-lower-bound.npy'.format(
            args.dataset_name, args.prior_type,
            args.approximate_posterior_type))
    np.save(path, utils.tensor2numpy(log_prob_lower_bound))

    # save elbo and log prob lower bound
    mean_elbo = elbo.mean()
    std_elbo = elbo.std()
    mean_log_prob_lower_bound = log_prob_lower_bound.mean()
    std_log_prob_lower_bound = log_prob_lower_bound.std()
    s = 'ELBO: {:.2f} +- {:.2f}, LOG PROB LOWER BOUND: {:.2f} +- {:.2f}'.format(
        mean_elbo.item(), 2 * std_elbo.item() / np.sqrt(len(test_dataset)),
        mean_log_prob_lower_bound.item(),
        2 * std_log_prob_lower_bound.item() / np.sqrt(len(test_dataset)))
    filename = os.path.join(log_dir, 'test-results.txt')
    with open(filename, 'w') as file:
        file.write(s)
Example #19
def load_hepmass():
    def load_data(path):

        data_train = pd.read_csv(filepath_or_buffer=os.path.join(
            path, '1000_train.csv'),
                                 index_col=False)
        data_test = pd.read_csv(filepath_or_buffer=os.path.join(
            path, '1000_test.csv'),
                                index_col=False)

        return data_train, data_test

    def load_data_no_discrete(path):
        """Loads the positive class examples from the first 10% of the dataset."""
        data_train, data_test = load_data(path)

        # Gets rid of any background noise examples i.e. class label 0.
        data_train = data_train[data_train[data_train.columns[0]] == 1]
        data_train = data_train.drop(data_train.columns[0], axis=1)
        data_test = data_test[data_test[data_test.columns[0]] == 1]
        data_test = data_test.drop(data_test.columns[0], axis=1)
        # Because the data_ set is messed up!
        data_test = data_test.drop(data_test.columns[-1], axis=1)

        return data_train, data_test

    def load_data_no_discrete_normalised(path):

        data_train, data_test = load_data_no_discrete(path)
        mu = data_train.mean()
        s = data_train.std()
        data_train = (data_train - mu) / s
        data_test = (data_test - mu) / s

        return data_train, data_test

    def load_data_no_discrete_normalised_as_array(path):

        data_train, data_test = load_data_no_discrete_normalised(path)
        data_train, data_test = data_train.values, data_test.values

        i = 0
        # Remove any features that have too many re-occurring real values.
        features_to_remove = []
        for feature in data_train.T:
            c = Counter(feature)
            max_count = np.array([v for k, v in sorted(c.items())])[0]
            if max_count > 5:
                features_to_remove.append(i)
            i += 1
        data_train = data_train[:,
                                np.array([
                                    i for i in range(data_train.shape[1])
                                    if i not in features_to_remove
                                ])]
        data_test = data_test[:,
                              np.array([
                                  i for i in range(data_test.shape[1])
                                  if i not in features_to_remove
                              ])]

        N = data_train.shape[0]
        N_validate = int(N * 0.1)
        data_validate = data_train[-N_validate:]
        data_train = data_train[0:-N_validate]

        return data_train, data_validate, data_test

    return load_data_no_discrete_normalised_as_array(
        path=os.path.join(utils.get_data_root(), 'hepmass'))
Example #20
def load_bsds300():
    path = os.path.join(utils.get_data_root(), 'bsds300', 'bsds300.hdf5')
    file = h5py.File(path, 'r')
    return file['train'], file['validation'], file['test']
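
Note that the returned train/validation/test objects are h5py dataset handles backed by the still-open file, not in-memory arrays. A caller that wants plain numpy arrays could convert them up front (a sketch, assuming numpy is imported as np and the splits fit in memory):

train, val, test = load_bsds300()
# Materialise each HDF5 dataset as an in-memory array.
train, val, test = (np.asarray(split) for split in (train, val, test))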