Example #1
def test_does_not_add_buffer_when_enough_obs_long_data():
    bg = BufferGenerator(n_observations=100, data_shape='long',
                         buffer_size=10)

    d = np.ones((100, 10))

    index = (slice(20, 30, None), slice(None))

    (index_new, (buff_start, buff_end)) = bg.update_key_with_buffer(index)

    subset = bg.add_buffer(d[index_new], buff_start, buff_end)

    assert subset.shape == (30, 10)
    assert subset.sum() == 300
Example #2
def test_updates_slice_when_not_enough_obs_start_end():
    bg = BufferGenerator(n_observations=100, data_shape='long',
                         buffer_size=10)

    obs_slice = slice(5, 98, None)
    ch_slice = slice(None, None, None)
    key = (obs_slice, ch_slice)

    ((obs_slice_new, ch_slice_new),
     (buff_start, buff_end)) = bg.update_key_with_buffer(key)

    assert obs_slice_new == slice(0, 100, None)
    assert ch_slice_new == ch_slice
    assert buff_start == 5 and buff_end == 8
Example #3
def test_adds_start_buffer_long():
    bg = BufferGenerator(n_observations=100, data_shape='long',
                         buffer_size=10)

    d = np.ones((100, 10))

    index = (slice(0, 10, None), slice(None))

    (index_new, (buff_start, buff_end)) = bg.update_key_with_buffer(index)

    subset = bg.add_buffer(d[index_new], buff_start, buff_end)

    assert subset.shape == (30, 10)
    assert subset.sum() == 200
    assert subset[:10, :].sum() == 0
    assert subset[10:, :].sum() == 200
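Taken together, the three tests above outline the intended round trip: update_key_with_buffer widens the requested observation slice by buffer_size on each side, clipping at the edges of the data and reporting how many observations are missing, and add_buffer zero-pads the clipped sides back to the full width. A minimal runnable sketch of that flow for 'long' data, this time requesting the last observations so the end is the side that gets padded (the import path is an assumption, adjust it to wherever BufferGenerator lives in your copy of yass):

import numpy as np

from yass.batch import BufferGenerator  # assumed import path

bg = BufferGenerator(n_observations=100, data_shape='long', buffer_size=10)
d = np.ones((100, 10))

# request the last 5 observations; only the start buffer can come from the data
key = (slice(95, 100, None), slice(None))
key_buffered, (missing_start, missing_end) = bg.update_key_with_buffer(key)

subset = bg.add_buffer(d[key_buffered], missing_start, missing_end)

print(subset.shape)           # (25, 10): 5 requested + 10 buffer on each side
print(subset[-10:, :].sum())  # 0.0, zero padding where the data ran out at the end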
Example #4
def test_does_not_add_buffer_when_enough_obs_wide_data():
    bg = BufferGenerator(n_observations=100, data_shape='wide',
                         buffer_size=10)

    d = np.ones((10, 100))

    # index is in observations, channels format
    index = (slice(20, 30, None), slice(None))

    (index_new, (buff_start, buff_end)) = bg.update_key_with_buffer(index)

    # reverse to match 'wide' data
    index_new = index_new[::-1]

    subset = bg.add_buffer(d[index_new], buff_start, buff_end)

    assert subset.shape == (10, 30)
    assert subset.sum() == 300
Example #5
def test_adds_end_buffer_wide():
    bg = BufferGenerator(n_observations=100, data_shape='wide',
                         buffer_size=10)

    d = np.ones((10, 100))

    index = (slice(90, 100, None), slice(None))

    (index_new, (buff_start, buff_end)) = bg.update_key_with_buffer(index)

    # reverse to match 'wide' data
    index_new = index_new[::-1]

    subset = bg.add_buffer(d[index_new], buff_start, buff_end)

    assert subset.shape == (10, 30)
    assert subset.sum() == 200
    assert subset[:, :20].sum() == 200
    assert subset[:, 20:].sum() == 0
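The 'wide' tests mirror the 'long' ones; the only difference is that the buffered key, which update_key_with_buffer always produces in (observations, channels) order, has to be reversed before indexing a 'wide' array. A small hypothetical helper (not part of yass) that makes the shared pattern explicit:

def read_with_buffer(data, bg, key, data_shape):
    """Index `data` with `key` and zero-pad the buffer where the data runs out.

    `key` is given in (observations, channels) order; for 'wide' arrays it is
    reversed before indexing. Hypothetical convenience wrapper, written only
    to illustrate how the tests above use BufferGenerator.
    """
    key_buffered, (missing_start, missing_end) = bg.update_key_with_buffer(key)

    if data_shape == 'wide':
        key_buffered = key_buffered[::-1]

    return bg.add_buffer(data[key_buffered], missing_start, missing_end)

With the objects from the test above, read_with_buffer(d, bg, index, 'wide') returns the same (10, 30) array the assertions check.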
Example #6
class BatchProcessor(object):
    """
    Batch processing for large numpy matrices

    Parameters
    ----------
    path_to_recordings: str
        Path to recordings file
    dtype: str
        Numpy dtype
    n_channels: int
        Number of channels
    data_format: str
        Data format, it can be either 'long' (observations, channels) or
        'wide' (channels, observations)
    max_memory: int or str
        Maximum memory to use in each batch; interpreted as bytes if int.
        If str, it can be any of {N}KB, {N}MB or {N}GB

    buffer_size: int, optional
        Buffer size, defaults to 0. Only relevant when performing multi-channel
        operations

    Raises
    ------
    ValueError
        If dimensions do not match according to the file size, dtype and
        number of channels
    """

    def __init__(self, path_to_recordings, dtype, n_channels,
                 data_format, max_memory, buffer_size=0):
        self.data_format = data_format
        self.buffer_size = buffer_size
        self.reader = RecordingsReader(path_to_recordings, dtype, n_channels,
                                       data_format,
                                       output_shape='long')
        self.indexer = IndexGenerator(self.reader.observations,
                                      self.reader.channels,
                                      dtype,
                                      max_memory)

        # data format is long since reader will return data in that format
        self.buffer_generator = BufferGenerator(self.reader.observations,
                                                data_format='long',
                                                buffer_size=buffer_size)

        self.logger = logging.getLogger(__name__)

    def single_channel(self, force_complete_channel_batch=True, from_time=None,
                       to_time=None, channels='all'):
        """
        Generate batches, each containing observations from a single channel

        Returns
        -------
        A generator that yields the data for each batch; when
        force_complete_channel_batch is False, it yields (data, channel index)
        tuples instead

        Examples
        --------

        .. literalinclude:: ../../examples/batch/single_channel.py
        """
        indexes = self.indexer.single_channel(force_complete_channel_batch,
                                              from_time, to_time,
                                              channels)
        if force_complete_channel_batch:
            for idx in indexes:
                yield self.reader[idx]
        else:
            for idx in indexes:
                channel_idx = idx[1]
                yield self.reader[idx], channel_idx

    def multi_channel(self, from_time=None, to_time=None, channels='all'):
        """
        Generate batches, each containing observations from more than one
        channel

        Returns
        -------
        generator:
            A generator yielding tuples of size three: the first element is
            the subset of the data for the i-th batch, the second is the slice
            object with the limits of the data in [observations, channels]
            format (excluding the buffer), and the last is the absolute index
            of the data, again in [observations, channels] format

        Examples
        --------
        .. literalinclude:: ../../examples/batch/multi_channel.py
        """
        indexes = self.indexer.multi_channel(from_time, to_time, channels)

        for idx in indexes:
            obs_idx = idx[0]
            data_idx = (slice(self.buffer_size,
                              obs_idx.stop - obs_idx.start + self.buffer_size,
                              obs_idx.step), slice(None, None, None))

            if self.buffer_size:
                (idx_new,
                 (buff_start, buff_end)) = (self.buffer_generator
                                            .update_key_with_buffer(idx))
                subset = self.reader[idx_new]
                subset_buff = self.buffer_generator.add_buffer(subset,
                                                               buff_start,
                                                               buff_end)
                yield subset_buff, data_idx, idx
            else:
                yield self.reader[idx], data_idx, idx

    def single_channel_apply(self, function, mode, output_path=None,
                             force_complete_channel_batch=True,
                             from_time=None, to_time=None, channels='all',
                             if_file_exists='overwrite', cast_dtype=None,
                             **kwargs):
        """
        Apply a transformation where each batch has observations from a
        single channel

        Parameters
        ----------
        function: callable
            Function to be applied, must accept a 1D numpy array as its first
            parameter
        mode: str
            'disk' or 'memory'. If 'disk', a binary file is created at the
            beginning of the operation and each partial result is saved
            (using the numpy.ndarray.tofile function); at the end of the
            operation two files are generated: the binary file and a yaml
            file with some file parameters (useful if you want to later use
            RecordingsReader to read the file). If 'memory', partial results
            are kept in memory and returned as a list
        output_path: str, optional
            Where to save the output, required if 'disk' mode
        force_complete_channel_batch: bool, optional
            If True, every index generated will correspond to all the
            observations in a single channel, hence
            n_batches = n_selected_channels; defaults to True. If True,
            from_time and to_time must be None
        from_time: int, optional
            Starting time, defaults to None
        to_time: int, optional
            Ending time, defaults to None
        channels: int, tuple or str, optional
            A tuple with the channel indexes or 'all' to traverse all channels,
            defaults to 'all'
        if_file_exists: str, optional
            One of 'overwrite', 'abort', 'skip'. If 'overwrite', it replaces
            the file if it exists; if 'abort', it raises a ValueError exception
            if the file exists; if 'skip', it skips the operation if the file
            exists. Only valid when mode = 'disk'
        cast_dtype: str, optional
            Output dtype, defaults to None which means no cast is done
        **kwargs
            kwargs to pass to function

        Examples
        --------

        .. literalinclude:: ../../examples/batch/single_channel_apply.py

        Notes
        -----
        Applying functions in 'disk' mode incurs memory overhead, which
        depends on the function implementation. This is important to consider
        if the transformation changes the data's dtype: converting int16 to
        float64, for example, quadruples the size, so a 1MB chunk in int16
        becomes 4MB in float64. Take that into account when setting
        max_memory.

        For performance reasons, in 'disk' mode output data is in 'wide'
        format
        """
        if mode not in ['disk', 'memory']:
            raise ValueError('Mode should be disk or memory, received: {}'
                             .format(mode))

        if mode == 'disk' and output_path is None:
            raise ValueError('output_path is required in "disk" mode')

        if (mode == 'disk' and if_file_exists == 'abort' and
           os.path.exists(output_path)):
            raise ValueError('{} already exists'.format(output_path))

        if (mode == 'disk' and if_file_exists == 'skip' and
           os.path.exists(output_path)):
            # load params...
            path_to_yaml = output_path.replace('.bin', '.yaml')

            if not os.path.exists(path_to_yaml):
                raise ValueError("if_file_exists = 'skip', but {}"
                                 " is missing, aborting..."
                                 .format(path_to_yaml))

            with open(path_to_yaml) as f:
                params = yaml.load(f)

            self.logger.info('{} exists, skipping...'.format(output_path))

            return output_path, params

        self.logger.info('Applying function {}...'
                         .format(function_path(function)))

        if mode == 'disk':
            fn = self._single_channel_apply_disk

            start = time.time()
            res = fn(function, output_path,
                     force_complete_channel_batch, from_time,
                     to_time, channels, cast_dtype, **kwargs)
            elapsed = time.time() - start
            self.logger.info('{} took {}'
                             .format(function_path(function),
                                     human_readable_time(elapsed)))
            return res
        else:
            fn = self._single_channel_apply_memory

            start = time.time()
            res = fn(function, force_complete_channel_batch, from_time,
                     to_time, channels, cast_dtype, **kwargs)
            elapsed = time.time() - start
            self.logger.info('{} took {}'
                             .format(function_path(function),
                                     human_readable_time(elapsed)))
            return res

    def multi_channel_apply(self, function, mode, cleanup_function=None,
                            output_path=None, from_time=None, to_time=None,
                            channels='all', if_file_exists='overwrite',
                            cast_dtype=None, **kwargs):
        """
        Apply a function where each batch has observations from more than
        one channel

        Parameters
        ----------
        function: callable
            Function to be applied, must accept a 2D numpy array in 'long'
            format as its first parameter (number of observations, number of
            channels)
        mode: str
            'disk' or 'memory'. If 'disk', a binary file is created at the
            beginning of the operation and each partial result is saved
            (using the numpy.ndarray.tofile function); at the end of the
            operation two files are generated: the binary file and a yaml
            file with some file parameters (useful if you want to later use
            RecordingsReader to read the file). If 'memory', partial results
            are kept in memory and returned as a list
        cleanup_function: callable, optional
            A function to be executed after `function` and before adding the
            partial result to the list of results (if in 'memory' mode) or to
            the binary file (if in 'disk' mode)
        output_path: str, optional
            Where to save the output, required if 'disk' mode
        from_time: int, optional
            Starting time, defaults to None
        to_time: int, optional
            Ending time, defaults to None
        channels: int, tuple or str, optional
            A tuple with the channel indexes or 'all' to traverse all channels,
            defaults to 'all'
        if_file_exists: str, optional
            One of 'overwrite', 'abort', 'skip'. If 'overwrite', it replaces
            the file if it exists; if 'abort', it raises a ValueError exception
            if the file exists; if 'skip', it skips the operation if the file
            exists. Only valid when mode = 'disk'
        cast_dtype: str, optional
            Output dtype, defaults to None which means no cast is done
        **kwargs
            kwargs to pass to function

        Returns
        -------
        output_path, params
            Path to the output binary file and the binary file params (only
            when mode = 'disk'); if mode = 'memory', a list with the partial
            results is returned instead

        Examples
        --------

        .. literalinclude:: ../../examples/batch/multi_channel_apply.py

        Notes
        -----
        Applying functions will incur memory overhead, which depends on the
        function implementation. This is important to consider if the
        transformation changes the data's dtype: converting int16 to float64,
        for example, quadruples the size, so a 1MB chunk in int16 becomes 4MB
        in float64. Take that into account when setting max_memory.

        For performance reasons, output data is in 'long' format.
        """
        if mode not in ['disk', 'memory']:
            raise ValueError('Mode should be disk or memory, received: {}'
                             .format(mode))

        if mode == 'disk' and output_path is None:
            raise ValueError('output_path is required in "disk" mode')

        if (mode == 'disk' and if_file_exists == 'abort' and
           os.path.exists(output_path)):
            raise ValueError('{} already exists'.format(output_path))

        self.logger.info('Applying function {}...'
                         .format(function_path(function)))

        if (mode == 'disk' and if_file_exists == 'skip' and
           os.path.exists(output_path)):
            # load params...
            path_to_yaml = output_path.replace('.bin', '.yaml')

            if not os.path.exists(path_to_yaml):
                raise ValueError("if_file_exists = 'skip', but {}"
                                 " is missing, aborting..."
                                 .format(path_to_yaml))

            with open(path_to_yaml) as f:
                params = yaml.load(f)

            self.logger.info('{} exists, skipping...'.format(output_path))

            return output_path, params

        if mode == 'disk':
            fn = self._multi_channel_apply_disk

            start = time.time()
            res = fn(function, cleanup_function, output_path, from_time,
                     to_time, channels, cast_dtype, **kwargs)
            elapsed = time.time() - start
            self.logger.info('{} took {}'
                             .format(function_path(function),
                                     human_readable_time(elapsed)))
            return res
        else:
            fn = self._multi_channel_apply_memory

            start = time.time()
            res = fn(function, cleanup_function, from_time, to_time, channels,
                     cast_dtype, **kwargs)
            elapsed = time.time() - start
            self.logger.info('{} took {}'
                             .format(function_path(function),
                                     human_readable_time(elapsed)))
            return res

    def _single_channel_apply_disk(self, function, output_path,
                                   force_complete_channel_batch, from_time,
                                   to_time, channels, cast_dtype, **kwargs):
        f = open(output_path, 'wb')

        self.reader.output_shape = 'wide'
        indexes = self.indexer.single_channel(force_complete_channel_batch,
                                              from_time, to_time,
                                              channels)
        for i, idx in enumerate(indexes):
            self.logger.debug('Processing channel {}...'.format(i))
            self.logger.debug('Reading batch...')
            subset = self.reader[idx]

            if cast_dtype is None:
                res = function(subset, **kwargs)
            else:
                res = function(subset, **kwargs).astype(cast_dtype)

            self.logger.debug('Writing to disk...')
            res.tofile(f)

        dtype = str(res.dtype)

        if channels == 'all':
            n_channels = self.reader.channels
        elif isinstance(channels, numbers.Integral):
            n_channels = 1
        else:
            n_channels = len(channels)

        f.close()

        # save yaml file with params
        path_to_yaml = output_path.replace('.bin', '.yaml')

        params = dict(dtype=dtype, n_channels=n_channels, data_format='wide')

        with open(path_to_yaml, 'w') as f:
            self.logger.debug('Saving params...')
            yaml.dump(params, f)

        return output_path, params

    def _multi_channel_apply_disk(self, function, cleanup_function,
                                  output_path, from_time, to_time, channels,
                                  cast_dtype, **kwargs):
        f = open(output_path, 'wb')

        self.reader.output_shape = 'long'
        data = self.multi_channel(from_time, to_time, channels)

        for subset, idx_local, idx in data:

            if cast_dtype is None:
                res = function(subset, **kwargs)
            else:
                res = function(subset, **kwargs).astype(cast_dtype)

            if cleanup_function:
                res = cleanup_function(res, idx_local, idx, self.buffer_size)

            res.tofile(f)

        dtype = str(res.dtype)

        f.close()

        if channels == 'all':
            n_channels = self.reader.channels
        elif isinstance(channels, numbers.Integral):
            n_channels = 1
        else:
            n_channels = len(channels)

        # save yaml file with params
        path_to_yaml = output_path.replace('.bin', '.yaml')

        params = dict(dtype=dtype, n_channels=n_channels, data_format='long')

        with open(path_to_yaml, 'w') as f:
            yaml.dump(params, f)

        return output_path, params

    def _single_channel_apply_memory(self, function,
                                     force_complete_channel_batch, from_time,
                                     to_time, channels, cast_dtype, **kwargs):

        indexes = self.indexer.single_channel(force_complete_channel_batch,
                                              from_time, to_time,
                                              channels)
        results = []

        for i, idx in enumerate(indexes):
            self.logger.debug('Processing channel {}...'.format(i))
            self.logger.debug('Reading batch...')
            subset = self.reader[idx]

            if cast_dtype is None:
                res = function(subset, **kwargs)
            else:
                res = function(subset, **kwargs).astype(cast_dtype)

            self.logger.debug('Appending partial result...')
            results.append(res)

        return results

    def _multi_channel_apply_memory(self, function, cleanup_function,
                                    from_time, to_time, channels, cast_dtype,
                                    **kwargs):

        data = self.multi_channel(from_time, to_time, channels)
        results = []

        for subset, idx_local, idx in data:

            if cast_dtype is None:
                res = function(subset, **kwargs)
            else:
                res = function(subset, **kwargs).astype(cast_dtype)

            if cleanup_function:
                res = cleanup_function(res, idx_local, idx, self.buffer_size)

            results.append(res)

        return results
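A minimal usage sketch for the class above, in 'disk' mode with a buffer. The file names, recording parameters and the centering function are placeholders (and the import path is an assumption), not values taken from yass:

from yass.batch import BatchProcessor  # assumed import path

# hypothetical recording: int16 samples, 10 channels, stored in 'long' order
bp = BatchProcessor('recordings.bin', dtype='int16', n_channels=10,
                    data_format='long', max_memory='100MB', buffer_size=128)

# iterate over batches directly: each item is (buffered data, local index of
# the un-buffered data, absolute index), as described in multi_channel above
for subset, idx_local, idx in bp.multi_channel():
    print(subset.shape, idx_local, idx)

def center(data):
    """Subtract the per-channel mean (placeholder transformation)."""
    return data - data.mean(axis=0)

def strip_buffer(res, idx_local, idx, buffer_size):
    """Drop the buffer rows before each partial result is written."""
    return res[idx_local]

# apply the function batch by batch and stream the result to disk; returns
# the output path and the params saved to the companion yaml file
path, params = bp.multi_channel_apply(center, mode='disk',
                                      output_path='centered.bin',
                                      cleanup_function=strip_buffer,
                                      cast_dtype='float32')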
Example #7
File: reader.py    Project: kathefter/yass
class RecordingsReader(object):
    """
    Neural recordings reader. If a file with the same name but a yaml
    extension exists in the same directory, dtype, n_channels and data_order
    are read from it; otherwise you need to pass these parameters to the
    constructor

    Parameters
    ----------
    path_to_recordings: str
        Path to recordings file

    dtype: str
        Numpy dtype

    n_channels: int
        Number of channels

    data_order: str
        Recordings order, one of ('channels', 'samples'). In a dataset with k
        observations per channel and j channels: 'channels' means the first k
        contiguous values come from channel 0, then channel 1, and so on;
        'samples' means the first j contiguous values are the first
        observation from every channel, then the second observation from
        every channel, and so on

    loader: str ('memmap', 'array' or 'python'), optional
        How to load the data. 'memmap' loads the data using a wrapper around
        np.memmap (see :class:`~yass.batch.MemoryMap` for details), 'array'
        loads it using numpy.fromfile and 'python' loads it using a wrapper
        around the Python file API. Defaults to 'memmap'. Beware that the
        Python loader has limited indexing capabilities, see
        :class:`~yass.batch.BinaryReader` for details

    buffer_size: int, optional
        Number of observations to add as a buffer at the start and end of the
        requested data when indexing, defaults to 0

    return_data_index: bool, optional
        If True, a tuple is returned when indexing: the first element is the
        data and the second the index corresponding to the actual data
        (excluding buffer); when buffer_size is zero, this just returns the
        original index since there is no buffer

    Raises
    ------
    ValueError
        If dimensions do not match according to the file size, dtype and
        number of channels

    Notes
    -----
    This is just a utility class to index binary files in a consistent way:
    regardless of the order of the file ('channels' or 'samples'), indexing is
    performed in [observations, channels] format. This class is mainly used by
    other internal YASS classes to maintain a consistent indexing order.

    Examples
    --------

    .. literalinclude:: ../../examples/batch/reader.py
    """
    def __init__(self,
                 path_to_recordings,
                 dtype=None,
                 n_channels=None,
                 data_order=None,
                 loader='memmap',
                 buffer_size=0,
                 return_data_index=False):

        path_to_recordings = str(path_to_recordings)
        path_to_yaml = str(path_to_recordings).replace('.bin', '.yaml')

        if (not os.path.isfile(path_to_yaml) and
            (dtype is None or n_channels is None or data_order is None)):
            raise ValueError(
                'At least one of dtype, n_channels or data_order '
                'is None; this is only allowed when a yaml '
                'file is present in the same location as '
                'the bin file, but no {} file exists'.format(path_to_yaml))
        elif (os.path.isfile(path_to_yaml) and dtype is None
              and n_channels is None and data_order is None):
            with open(path_to_yaml) as f:
                params = yaml.load(f)

            dtype = params['dtype']
            n_channels = params['n_channels']
            data_order = params['data_order']

        self._data_order = data_order
        self._n_channels = n_channels
        self._dtype = dtype if not isinstance(dtype, str) else np.dtype(dtype)
        self.buffer_size = buffer_size
        self.return_data_index = return_data_index

        filesize = os.path.getsize(path_to_recordings)

        if not (filesize / self._dtype.itemsize).is_integer():
            raise ValueError('Wrong filesize and/or dtype, filesize {:,} '
                             'bytes is not divisible by the item size {} '
                             'bytes'.format(filesize, self._dtype.itemsize))

        if int(filesize / self._dtype.itemsize) % n_channels:
            raise ValueError('Wrong n_channels, the length of the data is '
                             'not a multiple of n_channels (observations % '
                             'n_channels != 0); verify that n_channels '
                             'and/or the dtype are correct')

        self._n_observations = int(filesize / self._dtype.itemsize /
                                   n_channels)

        if self.buffer_size:
            # data format is long since reader will return data in that format
            self.buffer_generator = BufferGenerator(self._n_observations,
                                                    data_shape='long',
                                                    buffer_size=buffer_size)

        if loader not in ['memmap', 'array', 'python']:
            raise ValueError("loader must be one of 'memmap', 'array' or "
                             "'python'")

        # if data is in 'channels' order, we will read as "columns first",
        # if data is in 'samples' order, we will read as "rows first",
        # this ensures we have a consistent index: array[observations, channels]
        order = dict(channels='F', samples='C')

        shape = self._n_observations, n_channels

        def fromfile(path, dtype, data_order, shape):
            if data_order == 'samples':
                return np.fromfile(path, dtype=dtype).reshape(shape)
            else:
                return np.fromfile(path, dtype=dtype).reshape(shape[::-1]).T

        if loader in ['memmap', 'array']:
            fn = (partial(
                MemoryMap, mode='r', shape=shape, order=order[data_order])
                  if loader == 'memmap' else partial(
                      fromfile, data_order=data_order, shape=shape))
            self._data = fn(path_to_recordings, dtype=self._dtype)

            if loader == 'array':
                self._data = self._data.reshape(shape)
        else:
            self._data = BinaryReader(path_to_recordings,
                                      dtype,
                                      shape,
                                      order=order[data_order])

    def __getitem__(self, key):

        # this happens when doing something like
        # x[[1, 2, 3]] or x[np.array([1, 2, 3])]
        if not isinstance(key, tuple):
            key = (key, slice(None))

        obs_idx, _ = key

        # index where the data is located (excluding buffer)
        start = obs_idx.start or 0
        stop = obs_idx.stop or self.observations

        # build indexes for observations
        idx = slice(self.buffer_size, stop - start + self.buffer_size,
                    obs_idx.step)
        # buffer is added to all channels
        ch_idx = slice(None, None, None)
        data_idx = (idx, ch_idx)

        if self.buffer_size:
            # modify indexes to include buffered data
            (idx_new,
             (buff_start,
              buff_end)) = (self.buffer_generator.update_key_with_buffer(key))
            subset = self._data[idx_new]

            # add zeros if needed (start or end of the data)
            subset_buff = self.buffer_generator.add_buffer(
                subset, buff_start, buff_end)

            return ((subset_buff,
                     data_idx) if self.return_data_index else subset_buff)
        else:
            subset = self._data[key]
            return (subset, data_idx) if self.return_data_index else subset

    def __repr__(self):
        return ('Reader for recordings with {:,} observations and {:,} '
                'channels in "{}" format'.format(self.observations,
                                                 self.channels,
                                                 self._data_order))

    @property
    def shape(self):
        """Data shape in (observations, channels) format
        """
        return self._data.shape

    @property
    def observations(self):
        """Number of observations
        """
        return self._n_observations

    @property
    def channels(self):
        """Number of channels
        """
        return self._n_channels

    @property
    def data_order(self):
        """Data order
        """
        return self._data_order

    @property
    def dtype(self):
        """Numpy's dtype
        """
        return self._dtype

    @property
    def data(self):
        """Underlying numpy data
        """
        return self._data
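A short usage sketch for the reader. The file name and recording parameters are placeholders (and the import path is an assumption); whatever the on-disk order, indexing is always done in [observations, channels] form:

from yass.batch import RecordingsReader  # assumed import path

# hypothetical recording: int16 samples, 64 channels, stored in 'samples' order
reader = RecordingsReader('recordings.bin', dtype='int16', n_channels=64,
                          data_order='samples', loader='memmap')

print(reader.shape)       # (n_observations, 64)
print(reader.data_order)  # 'samples'

# observations 0-19,999 from channels 0-9, regardless of the on-disk order
chunk = reader[0:20000, 0:10]

# with a buffer, indexing can also return the index of the un-buffered data
buffered = RecordingsReader('recordings.bin', dtype='int16', n_channels=64,
                            data_order='samples', buffer_size=100,
                            return_data_index=True)
chunk, data_idx = buffered[0:20000, 0:10]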