Example #1
    def _create_data(self):
        root = utils.get_data_root()
        path = os.path.join(root, 'faces', self.name + '.jpg')
        try:
            image = io.imread(path)
        except FileNotFoundError:
            raise RuntimeError('Unknown face name: {}'.format(self.name))
        image = color.rgb2gray(image)
        self.image = transform.resize(image, [512, 512])

        # Enumerate every (row, col) pixel coordinate of the resized image.
        grid = np.array([(x, y) for x in range(self.image.shape[0])
                         for y in range(self.image.shape[1])])

        rotation_matrix = np.array([[0, -1], [1, 0]])
        # Sample pixel indices with probability proportional to intensity.
        p = self.image.reshape(-1) / self.image.sum()
        ix = np.random.choice(len(grid),
                              size=self.num_points,
                              replace=True,
                              p=p)
        points = grid[ix].astype(np.float32)
        points += np.random.rand(self.num_points, 2)  # dequantize
        points /= self.image.shape[0]  # scale to [0, 1]
        # assert 0 <= min(points) <= max(points) <= 1

        # Rotate from (row, col) image coordinates to upright plot
        # coordinates, then shift back into the unit square.
        self.data = torch.tensor(points @ rotation_matrix).float()
        self.data[:, 1] += 1
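
For intuition, the sampling step above can be reproduced in isolation. A minimal standalone sketch, assuming a made-up 4x4 image and point count (none of these values come from the original class):

import numpy as np

rng = np.random.default_rng(0)
image = rng.random((4, 4))                # stand-in for the face image
num_points = 1000                         # hypothetical
grid = np.array([(x, y) for x in range(image.shape[0])
                 for y in range(image.shape[1])])
p = image.reshape(-1) / image.sum()       # intensity-proportional weights
ix = rng.choice(len(grid), size=num_points, replace=True, p=p)
points = grid[ix].astype(np.float32)
points += rng.random((num_points, 2))     # dequantize within each pixel
points /= image.shape[0]                  # scale into [0, 1]
assert 0 <= points.min() and points.max() <= 1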
Example #2
    def __init__(self, split='train', frac=None):
        path = os.path.join(utils.get_data_root(), 'power',
                            '{}.npy'.format(split))
        self.data = np.load(path).astype(np.float32)
        self.n, self.dim = self.data.shape
        if frac is not None:
            # Keep only the first `frac` fraction of the rows.
            self.n = int(frac * self.n)
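
The frac argument only shrinks self.n, which suggests the companion methods use it to truncate how much of the array is visible. A hypothetical sketch of those methods, assuming this class is a torch.utils.data.Dataset subclass (the rest of the class is not shown above):

    def __len__(self):
        return self.n

    def __getitem__(self, index):
        return self.data[index]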
Example #3
def save_splits():
    train, val, test = load_power()
    splits = (('train', train), ('val', val), ('test', test))
    for name, data in splits:
        file = os.path.join(utils.get_data_root(), 'power',
                            '{}.npy'.format(name))
        np.save(file, data)
Example #4
    def __init__(self, split='train', transform=None):
        self.transform = transform
        path = os.path.join(utils.get_data_root(), 'omniglot', 'omniglot.mat')
        rawdata = loadmat(path)

        if split == 'train':
            self.data = rawdata['data'].T.reshape(-1, 28, 28)
            self.targets = rawdata['target'].T
        elif split == 'test':
            self.data = rawdata['testdata'].T.reshape(-1, 28, 28)
            self.targets = rawdata['testtarget'].T
        else:
            raise ValueError('Unknown split: {}'.format(split))
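
self.transform is stored but never applied in this snippet. In the usual torchvision-style pattern it would be applied per item in __getitem__; a hypothetical companion method (not part of the original excerpt):

    def __getitem__(self, index):
        image, target = self.data[index], self.targets[index]
        if self.transform is not None:
            image = self.transform(image)
        return image, target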
Example #5
def load_miniboone():
    def load_data(path):
        # NOTE: To remember how the pre-processing was done.
        # data_ = pd.read_csv(root_path, names=[str(x) for x in range(50)], delim_whitespace=True)
        # print data_.head()
        # data_ = data_.as_matrix()
        # # Remove some random outliers
        # indices = (data_[:, 0] < -100)
        # data_ = data_[~indices]
        #
        # i = 0
        # # Remove any features that have too many re-occurring real values.
        # features_to_remove = []
        # for feature in data_.T:
        #     c = Counter(feature)
        #     max_count = np.array([v for k, v in sorted(c.iteritems())])[0]
        #     if max_count > 5:
        #         features_to_remove.append(i)
        #     i += 1
        # data_ = data_[:, np.array([i for i in range(data_.shape[1]) if i not in features_to_remove])]
        # np.save("~/data_/miniboone/data_.npy", data_)

        data = np.load(path)
        # Hold out the last 10% as test, then the last 10% of the
        # remainder as validation.
        N_test = int(0.1 * data.shape[0])
        data_test = data[-N_test:]
        data = data[0:-N_test]
        N_validate = int(0.1 * data.shape[0])
        data_validate = data[-N_validate:]
        data_train = data[0:-N_validate]

        return data_train, data_validate, data_test

    def load_data_normalised(path):
        data_train, data_validate, data_test = load_data(path)
        # Statistics come from train + validation and are applied to all splits.
        data = np.vstack((data_train, data_validate))
        mu = data.mean(axis=0)
        s = data.std(axis=0)
        data_train = (data_train - mu) / s
        data_validate = (data_validate - mu) / s
        data_test = (data_test - mu) / s

        return data_train, data_validate, data_test

    return load_data_normalised(
        path=os.path.join(utils.get_data_root(), 'miniboone', 'data.npy'))
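
Because the two 10% cuts are taken sequentially, the overall proportions work out to 81% train, 9% validation, 10% test. A quick check with a hypothetical shape (the row and column counts below are made up):

import numpy as np

data = np.zeros((100000, 43))            # hypothetical shape
N_test = int(0.1 * data.shape[0])        # 10000 test rows
data = data[:-N_test]                    # 90000 rows remain
N_validate = int(0.1 * data.shape[0])    # 9000 validation rows
data_train = data[:-N_validate]
assert data_train.shape[0] == 81000      # 81% of the original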
Example #6
def load_gas():
    def load_data(file):
        data = pd.read_pickle(file)
        data.drop("Meth", axis=1, inplace=True)
        data.drop("Eth", axis=1, inplace=True)
        data.drop("Time", axis=1, inplace=True)
        return data

    def get_correlation_numbers(data):
        # For each column, count how many columns (itself included) it is
        # correlated with above 0.98; a count above 1 flags redundancy.
        C = data.corr()
        A = C > 0.98
        B = A.sum(axis=1)
        return B

    def load_data_and_clean(file):
        data = load_data(file)
        B = get_correlation_numbers(data)

        # Drop one redundant column at a time until no pair remains
        # correlated above 0.98.
        while np.any(B > 1):
            col_to_remove = np.where(B > 1)[0][0]
            col_name = data.columns[col_to_remove]
            data.drop(col_name, axis=1, inplace=True)
            B = get_correlation_numbers(data)
        # Standardise to zero mean and unit variance.
        data = (data - data.mean()) / data.std()

        return data.values

    def load_data_and_clean_and_split(file):
        data = load_data_and_clean(file)
        N_test = int(0.1 * data.shape[0])
        data_test = data[-N_test:]
        data_train = data[0:-N_test]
        N_validate = int(0.1 * data_train.shape[0])
        data_validate = data_train[-N_validate:]
        data_train = data_train[0:-N_validate]

        return data_train, data_validate, data_test

    return load_data_and_clean_and_split(
        file=os.path.join(utils.get_data_root(), 'gas', 'ethylene_CO.pickle'))
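
The pruning loop relies on the fact that data.corr() has 1.0 on the diagonal, so a count above 1 means a column is correlated above 0.98 with at least one other column. This can be sanity-checked on a toy frame with a duplicated column (illustrative data only):

import numpy as np
import pandas as pd

rng = np.random.default_rng(0)
a = rng.random(100)
df = pd.DataFrame({'a': a, 'b': a, 'c': rng.random(100)})
B = (df.corr() > 0.98).sum(axis=1)
print(B)  # 'a' and 'b' each count 2 (self plus the duplicate); 'c' counts 1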
Example #7
def load_data():
    file = os.path.join(utils.get_data_root(), 'power', 'data.npy')
    return np.load(file)
Example #8
def load_hepmass():
    def load_data(path):

        data_train = pd.read_csv(filepath_or_buffer=os.path.join(path, '1000_train.csv'),
                                 index_col=False)
        data_test = pd.read_csv(filepath_or_buffer=os.path.join(path, '1000_test.csv'),
                                index_col=False)

        return data_train, data_test

    def load_data_no_discrete(path):
        """Loads the positive class examples from the first 10% of the dataset."""
        data_train, data_test = load_data(path)

        # Gets rid of any background noise examples i.e. class label 0.
        data_train = data_train[data_train[data_train.columns[0]] == 1]
        data_train = data_train.drop(data_train.columns[0], axis=1)
        data_test = data_test[data_test[data_test.columns[0]] == 1]
        data_test = data_test.drop(data_test.columns[0], axis=1)
        # Because the dataset is messed up!
        data_test = data_test.drop(data_test.columns[-1], axis=1)

        return data_train, data_test

    def load_data_no_discrete_normalised(path):

        data_train, data_test = load_data_no_discrete(path)
        mu = data_train.mean()
        s = data_train.std()
        data_train = (data_train - mu) / s
        data_test = (data_test - mu) / s

        return data_train, data_test

    def load_data_no_discrete_normalised_as_array(path):

        data_train, data_test = load_data_no_discrete_normalised(path)
        data_train, data_test = data_train.values, data_test.values

        # Remove any features that have too many re-occurring real values.
        # NOTE: sorted(c.items()) sorts by feature value, so this takes the
        # count of the smallest value, not the largest count, despite the
        # variable name; kept as-is to match the original preprocessing.
        features_to_remove = []
        for i, feature in enumerate(data_train.T):
            c = Counter(feature)
            max_count = np.array([v for k, v in sorted(c.items())])[0]
            if max_count > 5:
                features_to_remove.append(i)
        data_train = data_train[:, np.array(
            [i for i in range(data_train.shape[1]) if i not in features_to_remove])]
        data_test = data_test[:, np.array(
            [i for i in range(data_test.shape[1]) if i not in features_to_remove])]

        N = data_train.shape[0]
        N_validate = int(N * 0.1)
        data_validate = data_train[-N_validate:]
        data_train = data_train[0:-N_validate]

        return data_train, data_validate, data_test

    return load_data_no_discrete_normalised_as_array(
        path=os.path.join(utils.get_data_root(), 'hepmass')
    )
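
As the NOTE in the loop above points out, sorted(c.items())[0] picks the count of the smallest feature value rather than the largest count. If the intent really was the count of the most frequent value, Counter.most_common expresses that directly; a hypothetical variant (switching to it would change which features get removed):

        for i, feature in enumerate(data_train.T):
            c = Counter(feature)
            max_count = c.most_common(1)[0][1]  # count of the most frequent value
            if max_count > 5:
                features_to_remove.append(i)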
Example #9
def load_bsds300():
    path = os.path.join(utils.get_data_root(), 'bsds300', 'bsds300.hdf5')
    file = h5py.File(path, 'r')
    # NOTE: the returned objects are h5py datasets; the file handle is
    # deliberately left open so they stay readable.
    return file['train'], file['validation'], file['test']
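
The returned objects are lazy h5py datasets rather than in-memory arrays; slicing materialises them when needed. A hypothetical usage:

train, val, test = load_bsds300()
train_np = train[:]  # load the whole training split into a numpy array
print(train_np.shape, val.shape, test.shape)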