Пример #1
0
def defaultHeader( ):
    """
    Build a fresh MRC header dictionary populated with default values.

    A new dictionary (and a new ``dimensions`` array) is created on every
    call, so callers may modify the result freely.

    Returns
    -------
    dict
        Header fields keyed by name. The ``'n_threads'`` entry is only
        present when ``bloscPresent`` is true, in which case it holds the
        core count reported by blosc.
    """
    header = {
        # File layout
        'fileConvention': "imod",
        'endian': 'le',
        'MRCtype': 0,
        'dimensions': np.array( [0,0,0], dtype=int ),
        'dtype': 'u1',
        # Compression settings
        'compressor': None,
        'packedBytes': 0,
        'clevel': 1,
        # Image statistics
        'maxImage': 1.0,
        'minImage': 0.0,
        'meanImage': 0.0,
        # Acquisition metadata
        'pixelsize': 0.1,
        'pixelunits': u"nm",  # Can be "\AA" for Angstroms
        'voltage': 300.0,     # kV
        'C3': 2.7,            # mm
        'gain': 1.0,          # counts/electron
    }

    if bloscPresent:
        header['n_threads'] = blosc.detect_number_of_cores()

    return header
Пример #2
0
def defaultHeader():
    '''
    Create a new MRC header dictionary with every field set to its default.

    Returns
    -------
    dict
        Freshly allocated header. The ``'n_threads'`` key is added only when
        ``bloscPresent`` is true, using the core count detected by blosc.
    '''
    defaults = dict(
        # File layout
        fileConvention='ccpem',
        endian='le',
        MRCtype=0,
        dimensions=np.array([0, 0, 0], dtype=int),
        dtype='u1',
        # Compression settings
        compressor=None,
        packedBytes=0,
        clevel=1,
        # Image statistics
        maxImage=1.0,
        minImage=0.0,
        meanImage=0.0,
        # Acquisition metadata
        pixelsize=0.1,
        pixelunits=u'nm',  # Can be '\\AA' for Angstroms
        voltage=300.0,  # kV
        C3=2.7,  # mm
        gain=1.0,  # counts/electron
    )

    if bloscPresent:
        defaults['n_threads'] = blosc.detect_number_of_cores()

    return defaults
Пример #3
0
def load_kernel(filepath, n_threads=None):
    """
    Restore a kernel previously written by save_kernel().

    The file is expected to hold a dill-pickled, blosc-compressed payload
    which itself unpickles (via dill) to the kernel object.

    NOTE(security): ``dill.load`` / ``dill.loads`` execute arbitrary code
    embedded in the payload. Only load files from a trusted source.

    Parameters
    ----------
    filepath: str
        Location of the saved kernel on disk.

    n_threads: int
        Number of threads blosc should use for decompression. When left as
        None, the core count detected by blosc is used. The value is applied
        through ``blosc.set_nthreads``.

    Returns
    -------
    ImputationKernel
    """
    if n_threads is None:
        n_threads = blosc.detect_number_of_cores()
    blosc.set_nthreads(n_threads)

    with open(filepath, "rb") as handle:
        compressed_payload = dill.load(handle)
    kernel = dill.loads(blosc.decompress(compressed_payload))

    if kernel.original_data_class != "pd_DataFrame":
        return kernel

    # The working data is handed to pd_read_parquet and then every column is
    # cast back to the dtype recorded on the kernel.
    kernel.working_data = pd_read_parquet(kernel.working_data)
    for column in kernel.working_data.columns:
        target_dtype = kernel.working_dtypes[column]
        kernel.working_data[column] = kernel.working_data[column].astype(target_dtype)

    return kernel
Пример #4
0
def __MRCZImport(f,
                 header,
                 endian='le',
                 fileConvention='ccpem',
                 returnHeader=False,
                 n_threads=None):
    '''
    Equivalent to MRCImport, but for compressed data using the blosc library.

    One blosc chunk is read and decompressed per slice along the first axis
    of ``header['dimensions']``, starting at byte offset
    ``DEFAULT_HEADER_LEN + header['extendedBytes']``.

    The following compressors are supported:
        ``'zlib'``
        ``'zstd'``
        ``'lz4'``

    Memory mapping is not possible in this case at present.

    Parameters
    ----------
    f : file object
        Open, seekable binary file positioned anywhere; this function seeks
        explicitly before each read.
    header : dict
        Parsed MRC header. The keys ``'dimensions'``, ``'dtype'``,
        ``'extendedBytes'`` and ``'MRCtype'`` are read here.
    endian, fileConvention, returnHeader
        Accepted for signature compatibility; not used by this function.
    n_threads : int, optional
        Number of blosc threads. Defaults to the core count detected by blosc.

    Returns
    -------
    (image, header) : tuple
        The decompressed array (with singleton axes squeezed out) and the
        header that was passed in. Returns ``None`` when blosc is missing.
    '''
    if not bloscPresent:
        logger.error('blosc not present, cannot compress files.')
        return

    # Assigning to ``blosc.nthreads`` merely rebinds a module attribute; the
    # thread pool is only reconfigured through ``blosc.set_nthreads``.
    if n_threads is None:
        n_threads = blosc.detect_number_of_cores()
    blosc.set_nthreads(n_threads)

    image = np.empty(header['dimensions'], dtype=header['dtype'])
    slice_shape = image.shape[1:]

    blosc_chunk_pos = DEFAULT_HEADER_LEN + header['extendedBytes']
    for J in range(image.shape[0]):
        # Peek at the chunk header to learn the compressed size, then rewind
        # so the full chunk (header included) is handed to blosc.
        f.seek(blosc_chunk_pos)
        ((nbytes, blockSize, ctbytes), (ver_info)) = readBloscHeader(f)
        f.seek(blosc_chunk_pos)
        # blosc includes the 16 header bytes in ctbytes
        image[J, :, :] = np.reshape(
            np.frombuffer(blosc.decompress(f.read(ctbytes)),
                          dtype=image.dtype), slice_shape)

        blosc_chunk_pos += ctbytes

    if header['MRCtype'] == 101:
        # Seems the 4-bit is interlaced
        # NOTE(review): this branch assigns the full 3-D array into strided
        # slices of a flat array and derives the first nibble by a left shift
        # and division instead of a bit-and; confirm against the writer's
        # packing before relying on it.
        interlaced_image = image

        # np.product was removed in NumPy 2.0; np.prod is the supported name.
        image = np.empty(np.prod(header['dimensions']),
                         dtype=header['dtype'])
        # Bit-shift and Bit-and to separate decimated pixels
        image[0::2] = np.left_shift(interlaced_image, 4) / 15
        image[1::2] = np.right_shift(interlaced_image, 4)

    # We don't need to reshape packed data.
    image = np.squeeze(image)

    return image, header
Пример #5
0
def __MRCZImport( f, header, endian='le', fileConvention = "imod", returnHeader = False, n_threads=None ):
    """
    Equivalent to MRCImport, but for compressed data using the blosc library.

    One blosc chunk is read and decompressed per slice along the first axis
    of ``header['dimensions']``, starting at byte offset
    ``1024 + header['extendedBytes']``.

    The following compressors are supported:
        'zlib'
        'zstd'
        'lz4'

    Memory mapping is not possible in this case at present.

    Parameters
    ----------
    f : file object
        Open, seekable binary file; this function seeks explicitly before
        each read.
    header : dict
        Parsed MRC header. The keys 'dimensions', 'dtype', 'extendedBytes'
        and 'MRCtype' are read here.
    endian, fileConvention, returnHeader
        Accepted for signature compatibility; not used by this function.
    n_threads : int, optional
        Number of blosc threads. Defaults to the core count detected by blosc.

    Returns
    -------
    (image, header) : tuple
        The decompressed array (with singleton axes squeezed out) and the
        header that was passed in. Returns None when blosc is missing.
    """
    if not bloscPresent:
        print( "ioMRC: blosc not present, cannot compress files." )
        return

    # Assigning to `blosc.nthreads` merely rebinds a module attribute; the
    # thread pool is only reconfigured through `blosc.set_nthreads`.
    if n_threads is None:
        n_threads = blosc.detect_number_of_cores()
    blosc.set_nthreads( n_threads )

    image = np.empty( header['dimensions'], dtype=header['dtype'] )
    slice_shape = image.shape[1:]

    # We can read MRC2014 files that don't start at 1024 bytes, but not write them
    # (as they are non-standard and we don't like breaking stuff)
    blosc_chunk_pos = 1024 + header['extendedBytes']
    for J in range( image.shape[0] ):
        # Peek at the chunk header to learn the compressed size, then rewind
        # so the full chunk (header included) is handed to blosc.
        f.seek( blosc_chunk_pos )
        ( (nbytes, blockSize, ctbytes ), (ver_info) ) = readBloscHeader(f)
        f.seek( blosc_chunk_pos )
        # blosc includes the 16 header bytes in ctbytes
        image[J,:,:] = np.reshape(
            np.frombuffer( blosc.decompress( f.read( ctbytes ) ), dtype=image.dtype ),
            slice_shape )

        blosc_chunk_pos += ctbytes

    if header['MRCtype'] == 101:
        # Seems the 4-bit is interlaced
        # NOTE(review): this branch assigns the full 3-D array into strided
        # slices of a flat array and derives the first nibble by a left shift
        # and division instead of a bit-and; confirm against the writer's
        # packing before relying on it.
        interlaced_image = image

        # np.product was removed in NumPy 2.0; np.prod is the supported name.
        image = np.empty( np.prod(header['dimensions']), dtype=header['dtype'] )
        # Bit-shift and Bit-and to separate decimated pixels
        image[0::2] = np.left_shift(interlaced_image,4) / 15
        image[1::2] = np.right_shift(interlaced_image,4)

    # We don't need to reshape packed data.
    image = np.squeeze( image )

    return image, header
Пример #6
0
def print_versions():
    """Print all the versions of software that python-blosc relies on.

    Writes a short report to stdout: python-blosc and Blosc versions, the
    available compressors, the Python version, platform details, byte order
    and the number of cores detected by blosc.

    On Linux a distribution line is printed when it can be determined.
    ``platform.linux_distribution`` was removed in Python 3.8, so it is used
    only when it exists; otherwise ``platform.freedesktop_os_release``
    (Python 3.10+) is tried, and the line is omitted if neither is usable.
    """
    import platform
    print("-=" * 38)
    print("python-blosc version: %s" % blosc.__version__)
    print("Blosc version: %s" % blosc.blosclib_version)
    print("Blosc compressors in this build: %s" % blosc.compressor_list())
    print("Python version: %s" % sys.version)
    (sysname, nodename, release, version, machine, processor) = platform.uname()
    print("Platform: %s-%s-%s (%s)" % (sysname, release, machine, version))
    if sysname == "Linux":
        dist_parts = None
        legacy_query = getattr(platform, "linux_distribution", None)
        if legacy_query is not None:
            # Same output as before on interpreters that still provide it.
            dist_parts = legacy_query()[:-1]
        else:
            os_release_query = getattr(platform, "freedesktop_os_release", None)
            if os_release_query is not None:
                try:
                    os_release = os_release_query()
                except OSError:
                    # No readable os-release file on this system.
                    os_release = {}
                dist_parts = [os_release[key]
                              for key in ("NAME", "VERSION_ID")
                              if os_release.get(key)]
        if dist_parts:
            print("Linux dist: %s" % " ".join(dist_parts))
    if not processor:
        processor = "not recognized"
    print("Processor: %s" % processor)
    print("Byte-ordering: %s" % sys.byteorder)
    print("Detected cores: %s" % blosc.detect_number_of_cores())
    print("-=" * 38)
Пример #7
0
def set_blosc_nthreads() -> int:
    """Configure blosc to leave some of the detected cores unused.

    The thread count is derived from the core count blosc detects:

    * more than 4 cores: two cores are left free
    * 3 or 4 cores: one core is left free
    * 2 cores or fewer: a single thread is used

    Returns
    -------
    int
        number of threads blosc was told to use on the system
    """
    detected = blosc.detect_number_of_cores()

    if detected > 4:
        threads = detected - 2
    elif detected > 2:
        threads = detected - 1
    else:
        threads = 1

    blosc.set_nthreads(threads)
    return threads
Пример #8
0
def print_versions():
    """Print all the versions of software that python-blosc relies on.

    The report goes to stdout and covers python-blosc, the Blosc library and
    its compressors, the Python interpreter, the platform, the byte order and
    the core count detected by blosc.

    The Linux distribution line is best-effort:
    ``platform.linux_distribution`` no longer exists as of Python 3.8, so it
    is called only if present, with ``platform.freedesktop_os_release``
    (Python 3.10+) as the fallback. If neither yields a name, the line is
    skipped instead of raising.
    """
    import platform
    print("-=" * 38)
    print("python-blosc version: %s" % blosc.__version__)
    print("Blosc version: %s" % blosc.blosclib_version)
    print("Blosc compressors in this build: %s" % blosc.compressor_list())
    print("Python version: %s" % sys.version)
    (sysname, nodename, release, version, machine,
     processor) = platform.uname()
    print("Platform: %s-%s-%s (%s)" % (sysname, release, machine, version))
    if sysname == "Linux":
        dist_fields = None
        if hasattr(platform, "linux_distribution"):
            # Unchanged output on interpreters that still ship this function.
            dist_fields = platform.linux_distribution()[:-1]
        elif hasattr(platform, "freedesktop_os_release"):
            try:
                release_info = platform.freedesktop_os_release()
            except OSError:
                # The os-release file could not be read.
                release_info = {}
            dist_fields = [
                release_info[field] for field in ("NAME", "VERSION_ID")
                if release_info.get(field)
            ]
        if dist_fields:
            print("Linux dist: %s" % " ".join(dist_fields))
    if not processor:
        processor = "not recognized"
    print("Processor: %s" % processor)
    print("Byte-ordering: %s" % sys.byteorder)
    print("Detected cores: %s" % blosc.detect_number_of_cores())
    print("-=" * 38)
Пример #9
0
from . import chunks
from . import hangar_service_pb2
from . import hangar_service_pb2_grpc
from .. import config
from ..context import Environments
from ..context import TxnRegister
from ..hdf5_store import FileHandles
from ..records import commiting
from ..records import hashs
from ..records import heads
from ..records import parsing
from ..records import queries
from ..records import summarize

# Leave two cores free for the rest of the process, but never request fewer
# than one thread: with one or two detected cores the plain subtraction would
# yield zero or a negative thread count.
blosc.set_nthreads(max(1, blosc.detect_number_of_cores() - 2))


class HangarClient(object):
    '''Client which connects and handles data transfer to the hangar server.

    Parameters
    ----------
    envs : Environments
        environment handles to manage all required calls to the local
        repository state.
    address : str
        IP:PORT where the hangar server can be reached.
    '''
    def __init__(self, envs: Environments, address: str):
        # Keep the environment handles on the instance for later calls.
        self.env = envs
Пример #10
0
except ImportError as e:
    if sys.version_info > (3,0):
        raise ImportError('Get the backport for `concurrent.futures` for Py2.7 as `pip install futures`')
    raise e
from mrcz.__version__ import __version__
from distutils.version import StrictVersion

import logging
# Module-level logger shared by the optional-dependency checks below.
logger = logging.getLogger('MRCZ')
# Optional dependency: blosc. BLOSC_PRESENT records whether it could be
# imported, and DEFAULT_N_THREADS is the detected core count (or 1 without it).
try: 
    import blosc
    BLOSC_PRESENT = True
    # For async operations we want to release the GIL in blosc operations and 
    # file IO operations.
    blosc.set_releasegil(True)
    DEFAULT_N_THREADS = blosc.detect_number_of_cores()
except ImportError: 
    # Can be ImportError or ModuleNotFoundError depending on the Python version,
    # but ModuleNotFoundError is a child of ImportError and is still caught.
    BLOSC_PRESENT = False
    logger.info('`blosc` meta-compression library not found, file compression disabled.')
    DEFAULT_N_THREADS = 1
# Optional dependency: rapidjson is bound to the name `json` when available;
# otherwise the standard-library module is used under the same name.
try: 
    import rapidjson as json
except ImportError:
    import json
    logger.info('`python-rapidjson` not found, using builtin `json` instead.')

def _defaultMetaSerialize(value):
    """
    Is called by `json.dumps()` whenever it encounters an object it does 
Пример #11
0
def writeMRC( input_image, MRCfilename, endian='le', dtype=None, 
               pixelsize=[0.1,0.1,0.1], pixelunits=u"\\AA", shape=None, 
               voltage = 0.0, C3 = 0.0, gain = 1.0,
               compressor=None, clevel = 1, n_threads=None, quickStats=True ):
    """
    writeMRC( input_image, MRCfilename, endian='le', dtype=None,
              pixelsize=[0.1,0.1,0.1], pixelunits=u"\\AA", shape=None,
              voltage=0.0, C3=0.0, gain=1.0,
              compressor=None, clevel=1, n_threads=None, quickStats=True )
    Created on Thu Apr 02 15:56:34 2015
    @author: Robert A. McLeod

    Given a numpy 2-D or 3-D array `input_image` write it as an MRC file `MRCfilename`.
    A header dictionary is assembled from the arguments and handed, together
    with the image, to `__MRCExport`.

        endian is 'le' for little-endian; any other value selects big-endian.

        dtype will cast the data before writing it.  When omitted, the dtype
        of `input_image` is used with the requested byte order.

        pixelsize is [z,y,x] pixel size (singleton values are ok for square/cubic pixels)

        pixelunits is "\\AA" for Angstroms, "pm" for picometers, "\\mum" for micrometers,
        or "nm" for nanometers.  MRC standard is always Angstroms, so pixelsize
        is converted internally from nm to Angstroms if necessary

        shape is only used if you want to later append to the file, such as merging together Relion particles
        for Frealign.  Not recommended and only present for legacy reasons.

        voltage is accelerating potential in keV, defaults to 0.0

        C3 is spherical aberration in mm, defaults to 0.0

        gain is detector gain (counts/primary electron), defaults to 1.0 (for counting camera);
        a falsy value is replaced by 1.0

        compressor is a choice of 'lz4', 'zlib', or 'zstd', plus 'blosclz', 'lz4hc'
        'zstd' generally gives the best compression performance, and is still almost
           as fast as 'lz4' with clevel = 1
        'zlib' is easiest to decompress with other utilities.

        clevel is the compression level, 1 is fastest, 11 is very-slow.  The compression
        ratio will rise slowly with clevel.

        n_threads is number of threads to use for blosc compression; when omitted
        and blosc is available, the core count detected by blosc is used

        quickStats = True estimates the image mean, min, max from the first frame only,
        which saves a lot of computational time for stacks.

    Raises TypeError if uint4 packing is combined with compression, or if the
    resulting dtype is not listed in REVERSE_IMOD_ENUM.

    Note that MRC definitions are not consistent.  Generally we support the IMOD schema.
    """

    if dtype == 'uint4' and compressor is not None:
        raise TypeError( "uint4 packing is not compatible with compression, use int8 datatype." )

    endchar = '<' if endian == 'le' else '>'

    header = {}
    if dtype is None:
        # TODO: endian support
        header['dtype'] = endchar + input_image.dtype.descr[0][1].strip( "<>|" )
    else:
        header['dtype'] = dtype

    # Now we need to filter dtype to make sure it's actually acceptable to MRC
    if header['dtype'].strip( "<>|" ) not in REVERSE_IMOD_ENUM:
        raise TypeError( "ioMRC.MRCExport: Unsupported dtype cast for MRC %s" % header['dtype'] )

    header['dimensions'] = input_image.shape

    # Copy list input so the shared default list can never be modified
    # through the header.
    header['pixelsize'] = list( pixelsize ) if isinstance( pixelsize, list ) else pixelsize
    header['pixelunits'] = pixelunits
    header['compressor'] = compressor
    header['clevel'] = clevel
    header['shape'] = shape

    # This overhead calculation is annoying but many 3rd party tools that use
    # MRC require these statistical parameters.
    if bool(quickStats) and input_image.ndim == 3:
        stats_source = np.real( input_image[0,:,:] )
    else:
        stats_source = np.real( input_image )
    header['maxImage'] = np.max( stats_source )
    header['minImage'] = np.min( stats_source )
    # The mean was previously stored under 'maxImage', which overwrote the
    # maximum and left 'meanImage' unset.
    header['meanImage'] = np.mean( stats_source )

    # Falsy values (None, 0) fall back to the defaults.
    header['voltage'] = voltage if bool( voltage ) else 0.0
    header['C3'] = C3 if bool( C3 ) else 0.0
    header['gain'] = gain if bool( gain ) else 1.0

    if n_threads is None and bloscPresent:
        n_threads = blosc.detect_number_of_cores()
    header['n_threads'] = n_threads

    # TODO: can we detect the number of cores without adding a heavy dependancy?

    if dtype == 'uint4':
        # Decimate to packed 4-bit: even columns go in the low nibble, odd
        # columns in the high nibble.
        input_image = input_image.astype('uint8')
        input_image = input_image[:,:,::2] + np.left_shift(input_image[:,:,1::2],4)

    __MRCExport( input_image, header, MRCfilename, endchar )
Пример #12
0
def writeMRC(input_image,
             MRCfilename,
             meta=None,
             endian='le',
             dtype=None,
             pixelsize=[0.1, 0.1, 0.1],
             pixelunits=u'\\AA',
             shape=None,
             voltage=0.0,
             C3=0.0,
             gain=1.0,
             compressor=None,
             clevel=1,
             n_threads=None,
             quickStats=True,
             idx=None):
    '''
    writeMRC( input_image, MRCfilename, meta=None, idx=None, 
               endian='le', dtype=None, 
               pixelsize=[0.1,0.1,0.1], pixelunits=u'\\AA', shape=None, 
               voltage=0.0, C3=0.0, gain=1.0,
               compressor=None, clevel=1, n_threads=None, quickStats=True, 
               )
    
    Given a ``numpy`` 2-D or 3-D array ``input_image`` write it as an MRC file ``MRCfilename``.
    2-D input is reshaped to a single-slice 3-D stack, and dask arrays are
    materialized before writing.

    * ``meta`` is a Python dict{} which will be serialized by JSON and written 
        into the extended header.
    * ``endian`` is ``'le'`` for little-endian; any other value selects big-endian.
    * ``dtype`` will cast the data before writing it.
    * ``pixelsize`` is [z,y,x] pixel size (singleton values are ok for square/cubic pixels)
    * ``pixelunits`` is ``'\\AA'`` for Angstroms, ``'pm'`` for picometers, ``'\\mum'`` for micrometers, 
      or ``'nm'`` for nanometers.  MRC standard is always Angstroms, so pixelsize 
      is converted internally from nm to Angstroms if necessary
    * ``shape`` is only used if you want to later append to the file, such as 
      merging together Relion particles for Frealign.  Not recommended and 
      only present for legacy reasons.
    * ``voltage`` is accelerating potential in keV, defaults to 0.0
    * ``C3`` is spherical aberration in mm, defaults to 0.0
    * ``gain`` is detector gain (counts/primary electron), defaults to 1.0 (for counting camera);
      a falsy value is replaced by 1.0
    * ``compressor`` is a choice of ``'lz4'``, ``'zlib'``, or ``'zstd'``, plus ``'blosclz'``, ``'lz4hc'`` 
        - ``'lz4'`` is  generally the fastest.
        - ``'zstd'`` generally gives the best compression performance, and is still almost 
        as fast as 'lz4' with clevel = 1
    * ``clevel`` is the compression level, 1 is fastest, 9 is slowest.  The compression
      ratio will rise slowly with clevel.
    * ``n_threads`` is number of threads to use for blosc compression; when omitted
      and blosc is available, the core count detected by blosc is used
    * ``quickStats=True`` estimates the image mean, min, max from the first frame only,
      which saves a lot of computational time for stacks.
    * ``idx`` can be used to write an image or set of images starting at a 
      specific position in the MRC file (which may already exist). Index of 
      first image is 0. A negative index can be used to count backwards. If 
      omitted, will write whole stack to file. If writing to an existing 
      file, compression or extended MRC2014 headers are currently not 
      supported with this option.

    Raises ``TypeError`` for uint4 packing combined with compression or for a
    dtype not listed in ``REVERSE_CCPEM_ENUM``; ``ValueError`` when an existing
    file cannot be parsed, its x,y dimensions do not match the image, or
    ``idx`` is out of range; ``RuntimeError`` when ``idx`` is used with a
    compressed file.
    
    *Note: MRC definitions are not consistent. Generally we support the CCPEM schema.*
    '''

    if len(input_image.shape) == 2:
        # If it's a 2D image we force it to 3D - this makes life easier later:
        input_image = input_image.reshape(
            (1, input_image.shape[0], input_image.shape[1]))

    # For dask, we don't want to import dask, but we can still work-around how to
    # check its type without isinstance()
    image_type = type(input_image)
    if image_type.__module__ == 'dask.array.core' and image_type.__name__ == 'Array':
        # Ideally it would be faster to iterate over the chunks and pass each one
        # to blosc but that likely requires c-blosc2
        input_image = input_image.__array__()

    # We will need this regardless if writing to an existing file or not:
    endchar = '<' if endian == 'le' else '>'

    # We now check if we have to create a new header (i.e. new file) or not. If
    # the file exists, but idx is 'None', it will be replaced by a new file
    # with new header anyway:
    idxnewfile = (not os.path.isfile(MRCfilename)) or idx is None

    if idxnewfile:
        if dtype == 'uint4' and compressor is not None:
            raise TypeError(
                'uint4 packing is not compatible with compression, use int8 datatype.'
            )

        header = {'meta': meta}
        if dtype is None:
            # TODO: endian support
            header['dtype'] = endchar + input_image.dtype.descr[0][1].strip(
                '<>|')
        else:
            header['dtype'] = dtype

        # Now we need to filter dtype to make sure it's actually acceptable to MRC
        if header['dtype'].strip('<>|') not in REVERSE_CCPEM_ENUM:
            raise TypeError(
                'ioMRC.MRCExport: Unsupported dtype cast for MRC %s' %
                header['dtype'])

        header['dimensions'] = input_image.shape

        # Copy list input so the shared default list can never be modified
        # through the header.
        header['pixelsize'] = list(pixelsize) if isinstance(
            pixelsize, list) else pixelsize
        header['pixelunits'] = pixelunits
        header['compressor'] = compressor
        header['clevel'] = clevel
        header['shape'] = shape

        # This overhead calculation is annoying but many 3rd party tools that use
        # MRC require these statistical parameters.
        if bool(quickStats) and input_image.ndim == 3:
            stats_source = np.real(input_image[0, :, :])
        else:
            stats_source = np.real(input_image)
        header['maxImage'] = np.max(stats_source)
        header['minImage'] = np.min(stats_source)
        header['meanImage'] = np.mean(stats_source)

        # Falsy values (None, 0) fall back to the defaults.
        header['voltage'] = voltage if bool(voltage) else 0.0
        header['C3'] = C3 if bool(C3) else 0.0
        header['gain'] = gain if bool(gain) else 1.0

        if n_threads is None and bloscPresent:
            n_threads = blosc.detect_number_of_cores()
        header['n_threads'] = n_threads

        # TODO: can we detect the number of cores without adding a heavy dependancy?

        if dtype == 'uint4':
            # Decimate to packed 4-bit: even columns go in the low nibble,
            # odd columns in the high nibble.
            input_image = input_image.astype('uint8')
            input_image = input_image[:, :, ::2] + np.left_shift(
                input_image[:, :, 1::2], 4)

    else:
        # We are going to append to an already existing file:

        # So we try to figure out its header with 'CCPEM' or 'eman2' file conventions:
        try:
            header = readMRCHeader(MRCfilename,
                                   endian,
                                   fileConvention='CCPEM',
                                   pixelunits=pixelunits)

        except ValueError:
            try:
                header = readMRCHeader(MRCfilename,
                                       endian,
                                       fileConvention='eman2',
                                       pixelunits=pixelunits)
            except ValueError:
                # If neither 'CCPEM' nor 'eman2' formats satisfy:
                raise ValueError('Error: unrecognized MRC type for file: %s ' %
                                 MRCfilename)

        # No support for extended headers in arbitrary appending mode:
        # RAM: should work now
        # if header['extendedBytes'] > 0:
        #     raise ValueError( 'Error: MRC2014 files with extended headers not supported for writing: %s = %d' % ('extendedBytes', header['extendedBytes'] ) )

        # If the file already exists, its X,Y dimensions must be consistent with the current image to be written:
        if np.any(header['dimensions'][1:] != input_image.shape[1:]):
            raise ValueError(
                'Error: x,y dimensions of image do not match that of MRC file: %s '
                % MRCfilename)
            # TO DO: check also consistency of dtype?

        if 'meta' not in header:
            header['meta'] = meta

    # Now that we have a proper header, we go into the details of writing to a specific position:
    if idx is not None:
        if header['compressor'] is not None:
            raise RuntimeError(
                'Writing at arbitrary positions not supported for compressed files. Compressor = %s'
                % header['compressor'])

        idx = int(idx)
        # Force 2D to 3D dimensions:
        if len(header['dimensions']) == 2:
            header['dimensions'] = np.array(
                [1, header['dimensions'][0], header['dimensions'][1]])

        # Convert negative index to equivalent positive index:
        if idx < 0:
            idx = header['dimensions'][0] + idx

        # Just check if the desired image is within the stack range:
        # In principle we could write to a position beyond the limits of the file (missing slots would be filled with zeros), but let's avoid that the user writes a big file with zeros by mistake. So only positions within or immediately consecutive to the stack are allowed:
        if idx < 0 or idx > header['dimensions'][0]:
            raise ValueError(
                'Error: image or slice index out of range. idx = %d, z_dimension = %d'
                % (idx, header['dimensions'][0]))

        # The new Z dimension may be larger than that of the existing file, or even of the new file, if an index larger than the current stack is specified:
        newZ = idx + input_image.shape[0]
        if newZ > header['dimensions'][0]:
            header['dimensions'] = np.array(
                [newZ, header['dimensions'][1], header['dimensions'][2]])

        # This offset will be applied to f.seek():
        # (np.product was removed in NumPy 2.0; np.prod is the supported name.)
        offset = idx * np.prod(header['dimensions'][1:]) * np.dtype(
            header['dtype']).itemsize

    else:
        offset = 0

    __MRCExport(input_image, header, MRCfilename, endchar, offset, idxnewfile)
Пример #13
0
import pickle
import warnings

import blosc
import numpy

from dpsutil.dataframe.convert import cvt_dec2hex, cvt_hex2dec, cvt_hex2str, cvt_str2hex

# Selectors for the ``compress_type`` argument of ``compress``.
COMPRESS_FASTEST = 0
COMPRESS_BEST = 1

# Import-time side effect: set blosc's thread count to half the detected
# cores, clamped to the range 4..8.
blosc.set_nthreads(min(8, max(4, blosc.detect_number_of_cores() // 2)))


def compress(data: bytes, compress_type=COMPRESS_FASTEST) -> bytes:
    """
    compress(data[, compress_type=COMPRESS_FASTEST])

    Compress ``data`` with ``blosc.compress`` using one of two presets:

        - COMPRESS_FASTEST: the "lz4" codec at compression level 1
        - any other value (e.g. COMPRESS_BEST): the "zstd" codec at level 5

    ``data`` must be exactly of type ``bytes``; this is checked with an
    ``assert``. The thread count is whatever blosc is currently configured
    with; this function does not change it.

    According to the original notes, blosc raises ValueError if the buffer is
    larger than 2147483631 bytes.
    """
    assert type(data) is bytes

    if compress_type == COMPRESS_FASTEST:
        codec, strength = "lz4", 1
    else:
        codec, strength = "zstd", 5

    return blosc.compress(data, cname=codec, clevel=strength)