def create_data_loader(X, y=None, batch_size=1):
    import numpy as np
    import pandas as pd
    from torch.utils.data import DataLoader
    from lale.util.hdf5_to_torch_dataset import HDF5TorchDataset
    from lale.util.numpy_to_torch_dataset import NumpyTorchDataset

    if isinstance(X, pd.DataFrame):
        X = X.to_numpy()
        if isinstance(y, pd.Series):
            y = y.to_numpy()
    if isinstance(X, np.ndarray):
        dataset = NumpyTorchDataset(X, y)
    elif isinstance(X, str):  # Assume that this is a path to an HDF5 file
        dataset = HDF5TorchDataset(X)
    else:
        raise TypeError(
            "Can not create a data loader for a dataset with type {}".format(type(X))
        )
    return DataLoader(dataset, batch_size=batch_size)
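A minimal usage sketch for this variant, assuming the lale utility classes it imports (NumpyTorchDataset, HDF5TorchDataset) are available and that NumpyTorchDataset yields (features, label) pairs; the toy DataFrame and labels below are purely illustrative:

import numpy as np
import pandas as pd

X = pd.DataFrame(np.random.rand(8, 3), columns=["f1", "f2", "f3"])  # 8 samples, 3 features
y = pd.Series(np.random.randint(0, 2, size=8))                      # binary labels

loader = create_data_loader(X, y, batch_size=4)
for batch_X, batch_y in loader:
    # The default collate function turns the NumPy samples into torch tensors,
    # so each iteration should yield a (4, 3) feature batch and a (4,) label batch.
    print(batch_X.shape, batch_y.shape)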
Example #2
def create_data_loader(X, y=None, batch_size=1):
    import numpy as np
    import pandas as pd
    import scipy.sparse
    import torch
    from torch.utils.data import DataLoader, TensorDataset

    import lale.datasets.data_schemas
    from lale.util.batch_data_dictionary_dataset import BatchDataDict
    from lale.util.hdf5_to_torch_dataset import HDF5TorchDataset
    from lale.util.numpy_to_torch_dataset import NumpyTorchDataset

    if isinstance(X, pd.DataFrame):
        X = X.to_numpy()
        if isinstance(y, pd.Series):
            y = y.to_numpy()
        dataset = NumpyTorchDataset(X, y)
    elif isinstance(X, scipy.sparse.csr_matrix):
        # unfortunately, NumpyTorchDataset won't accept a subclass of np.ndarray
        X = X.toarray()
        if isinstance(y, lale.datasets.data_schemas.NDArrayWithSchema):
            y = y.view(np.ndarray)
        dataset = NumpyTorchDataset(X, y)
    elif isinstance(X, np.ndarray):
        # unfortunately, NumpyTorchDataset won't accept a subclass of np.ndarray
        if isinstance(X, lale.datasets.data_schemas.NDArrayWithSchema):
            X = X.view(np.ndarray)
        if isinstance(y, lale.datasets.data_schemas.NDArrayWithSchema):
            y = y.view(np.ndarray)
        dataset = NumpyTorchDataset(X, y)
    elif isinstance(X, str):  # Assume that this is a path to an HDF5 file
        dataset = HDF5TorchDataset(X)
    elif isinstance(X, BatchDataDict):
        dataset = X

        def my_collate_fn(batch):
            # BatchDataDict's get_item already returns a whole batch, so no collation is required.
            return batch[0]

        return DataLoader(dataset, batch_size=1, collate_fn=my_collate_fn)
    elif isinstance(X, dict):  # Assume this is data indexed by batch number
        return [X]
    elif isinstance(X, torch.Tensor) and y is not None:
        if isinstance(y, np.ndarray):
            y = torch.from_numpy(y)
        dataset = TensorDataset(X, y)
    elif isinstance(X, torch.Tensor):
        dataset = TensorDataset(X)
    else:
        raise TypeError(
            "Can not create a data loader for a dataset with type {}".format(type(X))
        )
    return DataLoader(dataset, batch_size=batch_size)
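The torch.Tensor branch of this variant uses only standard PyTorch pieces (TensorDataset plus the default collate), so it can be exercised without the lale utilities; the tensors below are illustrative:

import numpy as np
import torch

X = torch.randn(10, 5)   # 10 samples, 5 features
y = np.arange(10)        # NumPy labels are converted via torch.from_numpy inside the function

loader = create_data_loader(X, y, batch_size=5)
for batch_X, batch_y in loader:
    print(batch_X.shape, batch_y.shape)  # torch.Size([5, 5]) torch.Size([5])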
Example #3
def create_data_loader(X, y=None, batch_size=1):
    import numpy as np
    import pandas as pd
    import torch
    from torch.utils.data import DataLoader, TensorDataset

    from lale.util.hdf5_to_torch_dataset import HDF5TorchDataset
    from lale.util.numpy_to_torch_dataset import NumpyTorchDataset

    if isinstance(X, pd.DataFrame):
        X = X.to_numpy()
        if isinstance(y, pd.Series):
            y = y.to_numpy()
        dataset = NumpyTorchDataset(X, y)
    elif isinstance(X, np.ndarray):
        dataset = NumpyTorchDataset(X, y)
    elif isinstance(X, str):  # Assume that this is a path to an HDF5 file
        dataset = HDF5TorchDataset(X)
    elif isinstance(X, dict):  # Assume this is data indexed by batch number
        return X.values()
    elif isinstance(X, torch.Tensor) and y is not None:
        dataset = TensorDataset(X, y)
    elif isinstance(X, torch.Tensor):
        dataset = TensorDataset(X)
    else:
        raise TypeError(
            "Can not create a data loader for a dataset with type {}".format(type(X))
        )
    return DataLoader(dataset, batch_size=batch_size)
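Note that the dict branch of this variant does not build a DataLoader at all; it simply returns X.values() on the assumption that the dictionary is already keyed by batch number. A sketch of that calling convention, with illustrative pre-batched tensors:

import torch

batched = {
    0: (torch.randn(4, 3), torch.zeros(4)),  # batch 0: features and labels
    1: (torch.randn(4, 3), torch.ones(4)),   # batch 1
}

for batch_X, batch_y in create_data_loader(batched, batch_size=4):
    print(batch_X.shape, batch_y.shape)  # each iteration yields one pre-built batch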
Example #4
def create_data_loader(X, y=None, batch_size=1, num_workers=0):
    """A function that takes a dataset as input and outputs a Pytorch dataloader.

    Parameters
    ----------
    X : Input data.
        The supported formats are a Pandas DataFrame, a NumPy array,
        a sparse matrix, a torch.Tensor, a torch.utils.data.Dataset, a path to an HDF5 file,
        a lale.util.batch_data_dictionary_dataset.BatchDataDict, or
        a Python dictionary of the format `{"dataset": torch.utils.data.Dataset,
        "collate_fn": collate_fn for torch.utils.data.DataLoader}`.
    y : Labels, optional
        Supported formats are a NumPy array or a Pandas Series, by default None.
    batch_size : int, optional
        Number of samples in each batch, by default 1
    num_workers : int, optional
        Number of workers used by the data loader, by default 0

    Returns
    -------
    torch.utils.data.DataLoader

    Raises
    ------
    TypeError
        Raises a TypeError if the input format is not supported.
    """
    import numpy as np
    import pandas as pd
    import scipy.sparse
    import torch
    from torch.utils.data import DataLoader, Dataset, TensorDataset

    import lale.datasets.data_schemas
    from lale.util.batch_data_dictionary_dataset import BatchDataDict
    from lale.util.hdf5_to_torch_dataset import HDF5TorchDataset
    from lale.util.numpy_to_torch_dataset import NumpyTorchDataset

    collate_fn = None
    worker_init_fn = None

    def numpy_collate_fn(batch):
        # Stack individual samples (bare X arrays or (X, y) tuples) back into contiguous batch arrays.
        return_X = None
        return_y = None
        for item in batch:
            if isinstance(item, tuple):
                if return_X is None:
                    return_X = item[0]
                else:
                    return_X = np.vstack((return_X, item[0]))
                if return_y is None:
                    return_y = item[1]
                else:
                    return_y = np.vstack((return_y, item[1]))
            else:
                if return_X is None:
                    return_X = item
                else:
                    return_X = np.vstack((return_X, item))
        if return_y is not None:
            if len(return_y.shape) > 1 and return_y.shape[1] == 1:
                return_y = np.reshape(return_y, (len(return_y), ))
            return return_X, return_y
        else:
            return return_X

    if isinstance(X, Dataset):
        dataset = X
    elif isinstance(X, pd.DataFrame):
        X = X.to_numpy()
        if isinstance(y, pd.Series):
            y = y.to_numpy()
        dataset = NumpyTorchDataset(X, y)
        collate_fn = numpy_collate_fn
    elif isinstance(X, scipy.sparse.csr_matrix):
        # unfortunately, NumpyTorchDataset won't accept a subclass of np.ndarray
        X = X.toarray()
        if isinstance(y, lale.datasets.data_schemas.NDArrayWithSchema):
            y = y.view(np.ndarray)
        dataset = NumpyTorchDataset(X, y)
        collate_fn = numpy_collate_fn
    elif isinstance(X, np.ndarray):
        # unfortunately, NumpyTorchDataset won't accept a subclass of np.ndarray
        if isinstance(X, lale.datasets.data_schemas.NDArrayWithSchema):
            X = X.view(np.ndarray)
        if isinstance(y, lale.datasets.data_schemas.NDArrayWithSchema):
            y = y.view(np.ndarray)
        dataset = NumpyTorchDataset(X, y)
        collate_fn = numpy_collate_fn
    elif isinstance(X, str):  # Assume that this is a path to an HDF5 file
        dataset = HDF5TorchDataset(X)
    elif isinstance(X, BatchDataDict):
        dataset = X

        def my_collate_fn(batch):
            # BatchDataDict's get_item already returns a whole batch, so no collation is required.
            return batch[0]

        return DataLoader(dataset, batch_size=1, collate_fn=my_collate_fn)
    elif isinstance(X, dict):  # Assume this is data indexed by batch number
        if "dataset" in X:
            dataset = X["dataset"]
            collate_fn = X.get("collate_fn", None)
            worker_init_fn = getattr(dataset, "worker_init_fn", None)
        else:
            return [X]
    elif isinstance(X, torch.Tensor) and y is not None:
        if isinstance(y, np.ndarray):
            y = torch.from_numpy(y)
        dataset = TensorDataset(X, y)
    elif isinstance(X, torch.Tensor):
        dataset = TensorDataset(X)
    else:
        raise TypeError(
            "Can not create a data loader for a dataset with type {}".format(type(X))
        )
    return DataLoader(
        dataset,
        batch_size=batch_size,
        collate_fn=collate_fn,
        num_workers=num_workers,
        worker_init_fn=worker_init_fn,
    )
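The {"dataset": ..., "collate_fn": ...} form described in the docstring can be exercised with a plain TensorDataset; the dataset and collate function below are illustrative, and worker_init_fn is only picked up if the wrapped dataset defines that attribute:

import torch
from torch.utils.data import TensorDataset

X = torch.randn(6, 2)
y = torch.arange(6, dtype=torch.float32)

def stacking_collate(batch):
    # Stack the (X, y) tuples produced by TensorDataset into batch tensors.
    xs, ys = zip(*batch)
    return torch.stack(xs), torch.stack(ys)

loader = create_data_loader(
    {"dataset": TensorDataset(X, y), "collate_fn": stacking_collate},
    batch_size=3,
)
for batch_X, batch_y in loader:
    print(batch_X.shape, batch_y.shape)  # torch.Size([3, 2]) torch.Size([3])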