Example #1
File: jsons.py Project: terliuk/pisa
# Module-level imports assumed from pisa/utils/jsons.py; simplejson is
# assumed because the `ignore_nan` keyword used below is a simplejson
# extension. NumpyEncoder (a JSON encoder that converts numpy arrays to
# lists) and the extension lists ZIP_EXTS and XOR_EXTS are also defined at
# module level there.
import bz2
import os

import simplejson as json


def to_json(content,
            filename,
            indent=2,
            overwrite=True,
            warn=True,
            sort_keys=False):
    """Write `content` to a JSON file at `filename`.

    Uses a custom encoder (NumpyEncoder) that automatically converts numpy
    arrays to lists.

    If `filename` has a ".bz2" extension, the contents will be compressed
    (using bz2 and highest-level of compression, i.e., -9).

    If `filename` has a ".xor" extension, the contents will be xor-scrambled to
    make them human-unreadable (this is useful for, e.g., blind fits).

    Parameters
    ----------
    content : obj
        Object to be written to file. If the object has its own `to_json`
        method, that method is used instead.

    filename : str
        Name of the file to be written. Extension has to be 'json', 'bz2', or
        'xor'.

    indent : int
        Number of spaces per indentation level for pretty-printing; cf.
        documentation of json.dump() or json.dumps()

    overwrite : bool
        Set to `True` (default) to allow overwriting an existing file;
        otherwise an exception is raised if the file exists.

    warn : bool
        Issue a warning message if a file is being overwritten (`True`,
        default). Suppress warning by setting to `False` (e.g. when overwriting
        is the desired behaviour).

    sort_keys : bool
        Output of dictionaries will be sorted by key if set to `True`.
        Default is `False`. Cf. json.dump() or json.dumps().

    """
    if hasattr(content, 'to_json'):
        return content.to_json(filename,
                               indent=indent,
                               overwrite=overwrite,
                               warn=warn,
                               sort_keys=sort_keys)
    # Import here to avoid circular imports
    from pisa.utils.fileio import check_file_exists
    from pisa.utils.log import logging

    check_file_exists(fname=filename, overwrite=overwrite, warn=warn)

    _, ext = os.path.splitext(filename)
    ext = ext.replace('.', '').lower()
    assert ext == 'json' or ext in ZIP_EXTS + XOR_EXTS

    # Serialize once; every branch below writes out this same JSON text.
    # `ignore_nan` is a simplejson extension (not in the stdlib json module).
    json_str = json.dumps(content,
                          indent=indent,
                          cls=NumpyEncoder,
                          sort_keys=sort_keys,
                          allow_nan=True,
                          ignore_nan=False)

    if ext == 'bz2':
        # bz2.compress operates on bytes, so encode and write in binary mode
        with open(filename, 'wb') as outfile:
            outfile.write(bz2.compress(json_str.encode('utf-8')))
    elif ext == 'xor':
        # Scramble with key 42; XOR with a fixed key is its own inverse, so
        # applying the same operation again recovers the original text
        with open(filename, 'wb') as outfile:
            outfile.write(bytes(b ^ 42 for b in json_str.encode('utf-8')))
    else:
        with open(filename, 'w') as outfile:
            outfile.write(json_str)

    logging.debug('Wrote %.2f kB to %s',
                  os.path.getsize(filename) / 1024., filename)
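
A minimal usage sketch (hypothetical file names; it assumes the companion
`from_json` reader in pisa.utils.jsons, and that numpy is installed). It
round-trips a dict containing a numpy array, and it undoes the ".xor"
scrambling by XOR-ing each byte with the same key, since XOR with a fixed
key is its own inverse:

import numpy as np
from pisa.utils.jsons import from_json, to_json

content = {'weights': np.arange(5), 'norm': 1.0}

# Plain JSON: NumpyEncoder stores the numpy array as a list
to_json(content, 'example.json', warn=False)
assert from_json('example.json')['weights'] == [0, 1, 2, 3, 4]

# bz2-compressed variant: same JSON text, compressed on disk
to_json(content, 'example.json.bz2', warn=False)

# XOR-scrambled variant: applying key 42 again recovers the JSON text
to_json(content, 'example.xor', warn=False)
with open('example.xor', 'rb') as f:
    print(bytes(b ^ 42 for b in f.read()).decode('utf-8'))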
Example #2
File: hdf.py Project: thehrh/pisa-1
# Module-level imports assumed from pisa/utils/hdf.py (hash_obj and logging
# are pisa utilities; string_types is assumed to come from six)
from collections import OrderedDict
from collections.abc import Mapping

import h5py
import numpy as np
from six import string_types

from pisa.utils.hash import hash_obj
from pisa.utils.log import logging


def to_hdf(data_dict, tgt, attrs=None, overwrite=True, warn=True):
    """Store a (possibly nested) dictionary to an HDF5 file or branch node
    within an HDF5 file (an h5py Group).

    This creates hardlinks for duplicate non-trivial leaf nodes (h5py Datasets)
    to minimize storage space required for redundant datasets. Duplication is
    detected via object hashing.

    NOTE: Branch nodes are sorted by name before storing so that the
    generated file is consistent, since Python dictionaries guarantee no
    particular ordering among keys.

    Parameters
    ----------
    data_dict : Mapping
        Dictionary, OrderedDict, or other Mapping to be stored

    tgt : str or h5py.Group
        Target for storing data. If `tgt` is a str, it is interpreted as a
        filename; a file is created with that name (overwriting an existing
        file, if present). After writing, the file is closed. If `tgt` is an
        h5py.Group, the data is simply written to that Group and it is left
        open at function return.

    attrs : Mapping
        Attributes to apply to the top-level entity being written. See
        http://docs.h5py.org/en/latest/high/attr.html

    overwrite : bool
        Set to `True` (default) to allow overwriting an existing file;
        otherwise an exception is raised if the file exists.

    warn : bool
        Issue a warning message if a file is being overwritten. Suppress
        warning by setting to `False` (e.g. when overwriting is the desired
        behaviour).

    """
    if not isinstance(data_dict, Mapping):
        raise TypeError('`data_dict` must be a Mapping (e.g. dict or'
                        ' OrderedDict); got %s' % type(data_dict))

    def store_recursively(fhandle, node, path=None, attrs=None,
                          node_hashes=None):
        """Function for iteratively doing the work"""
        path = [] if path is None else path
        full_path = '/' + '/'.join(path)
        node_hashes = OrderedDict() if node_hashes is None else node_hashes

        if attrs is None:
            sorted_attr_keys = []
        else:
            if isinstance(attrs, OrderedDict):
                sorted_attr_keys = attrs.keys()
            else:
                sorted_attr_keys = sorted(attrs.keys())

        if isinstance(node, Mapping):
            logging.trace('  creating Group "%s"', full_path)
            try:
                dset = fhandle.create_group(full_path)
                for key in sorted_attr_keys:
                    dset.attrs[key] = attrs[key]
            except ValueError:
                # Group already exists at this path
                pass

            for key in sorted(node.keys()):
                if isinstance(key, str):
                    key_str = key
                else:
                    key_str = str(key)
                    logging.warning(
                        'Making string from key "%s", %s for use as'
                        ' name in HDF5 file', key_str, type(key)
                    )
                val = node[key]
                new_path = path + [key_str]
                store_recursively(fhandle=fhandle, node=val, path=new_path,
                                  node_hashes=node_hashes)
        else:
            # Check for existing node
            node_hash = hash_obj(node)
            if node_hash in node_hashes:
                logging.trace('  creating hardlink for Dataset: "%s" -> "%s"',
                              full_path, node_hashes[node_hash])
                # Hardlink the matching existing dataset
                fhandle[full_path] = fhandle[node_hashes[node_hash]]
                return

            # For now, convert None to np.nan since h5py appears to not handle
            # None
            if node is None:
                node = np.nan
                logging.warning(
                    '  encountered `None` at node "%s"; converting to'
                    ' np.nan', full_path
                )

            # "Scalar datasets don't support chunk/filter options". Shuffling
            # is a good idea otherwise since subsequent compression will
            # generally benefit; shuffling requires chunking. Compression is
            # not done here since it is slow, but can be done by
            # post-processing the generated file(s).
            if np.isscalar(node):
                shuffle = False
                chunks = None
            else:
                shuffle = True
                chunks = True
                # Store the node_hash for hardlinking later, since this is
                # more than a scalar datatype
                node_hashes[node_hash] = full_path

            # -- Handle special types -- #

            # See h5py docs at
            #
            #   https://docs.h5py.org/en/stable/strings.html#how-to-store-text-strings
            #
            # where using `bytes` objects (i.e., in numpy, np.string_) is
            # deemed the most compatible way to encode objects, but apparently
            # we don't have pytables compatibility right now.
            #
            # For boolean support, see
            #
            #   https://docs.h5py.org/en/stable/faq.html#faq

            # TODO: make written hdf5 files compatible with pytables
            # see docs at https://www.pytables.org/usersguide/datatypes.html

            if isinstance(node, string_types):
                node = np.string_(node)
            elif isinstance(node, bool):  # np.bool was an alias of builtin bool
                node = np.bool_(node)  # np.bool_ is the bool type h5py supports
            elif isinstance(node, np.ndarray):
                if issubclass(node.dtype.type, string_types):
                    node = node.astype(np.string_)
                elif node.dtype == bool:
                    node = node.astype(np.bool_)

            logging.trace('  creating dataset at path "%s", hash %s',
                          full_path, node_hash)
            try:
                dset = fhandle.create_dataset(
                    name=full_path, data=node, chunks=chunks, compression=None,
                    shuffle=shuffle, fletcher32=False
                )
            except TypeError:
                # Retry without chunking/shuffling, which some data (e.g.
                # zero-dimensional arrays) do not support
                try:
                    shuffle = False
                    chunks = None
                    dset = fhandle.create_dataset(
                        name=full_path, data=node, chunks=chunks,
                        compression=None, shuffle=shuffle, fletcher32=False
                    )
                except Exception:
                    logging.error('  full_path: "%s"', full_path)
                    logging.error('  chunks   : %s', str(chunks))
                    logging.error('  shuffle  : %s', str(shuffle))
                    logging.error('  node     : "%s"', str(node))
                    raise

            for key in sorted_attr_keys:
                dset.attrs[key] = attrs[key]

    # Perform the actual operation using the dict passed in by user
    if isinstance(tgt, str):
        from pisa.utils.fileio import check_file_exists
        fpath = check_file_exists(fname=tgt, overwrite=overwrite, warn=warn)
        h5file = h5py.File(fpath, 'w')
        try:
            if attrs is not None:
                h5file.attrs.update(attrs)
            store_recursively(fhandle=h5file, node=data_dict)
        finally:
            h5file.close()

    elif isinstance(tgt, h5py.Group):
        store_recursively(fhandle=tgt, node=data_dict, attrs=attrs)

    else:
        raise TypeError('to_hdf: Invalid `tgt` type: %s' % type(tgt))
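
A short usage sketch (hypothetical file names; assumes h5py and numpy are
installed) showing both accepted forms of `tgt`, scalar and array leaves,
and the hardlinking of duplicate leaf nodes described in the docstring:

import h5py
import numpy as np
from pisa.utils.hdf import to_hdf

arr = np.arange(10, dtype=np.float64)
data = {
    'reco': {'energy': arr, 'coszen': np.linspace(-1, 1, 10)},
    'truth': {'energy': arr},  # duplicate leaf -> stored as a hardlink
    'livetime': 12345.6,       # scalar leaf: no chunking/shuffling
}

# Form 1: `tgt` is a filename; the file is created, written, and closed
to_hdf(data, 'events.hdf5', attrs={'detector': 'toy'}, warn=False)

# Form 2: `tgt` is an h5py.Group (an open h5py.File is one); it is left
# open when the function returns
with h5py.File('events2.hdf5', 'w') as f:
    to_hdf(data, f)

with h5py.File('events.hdf5', 'r') as f:
    print(f.attrs['detector'])   # -> toy
    print(f['reco/energy'][:3])  # -> [0. 1. 2.]
    print(f['livetime'][()])     # -> 12345.6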