def _decode_mixins(tbl): """Decode a Table ``tbl`` that has astropy Columns + appropriate meta-data into the corresponding table with mixin columns (as appropriate). """ # If available read in __serialized_columns__ meta info which is stored # in FITS COMMENTS between two sentinels. try: i0 = tbl.meta['comments'].index('--BEGIN-ASTROPY-SERIALIZED-COLUMNS--') i1 = tbl.meta['comments'].index('--END-ASTROPY-SERIALIZED-COLUMNS--') except (ValueError, KeyError): return tbl # The YAML data are split into COMMENT cards, with lines longer than 70 # characters being split with a continuation character \ (backslash). # Strip the backslashes and join together. continuation_line = False lines = [] for line in tbl.meta['comments'][i0 + 1:i1]: if continuation_line: lines[-1] = lines[-1] + line[:70] else: lines.append(line[:70]) continuation_line = len(line) == 71 del tbl.meta['comments'][i0:i1 + 1] if not tbl.meta['comments']: del tbl.meta['comments'] try: info = meta.get_header_from_yaml(lines) except ImportError as exc: if 'PyYAML package is required' in str(exc): warnings.warn( "the file contains information about Astropy native objects " "(mixin columns) that have been serialized when writing it, " "but the PyYAML package is required to read those. Without " "this package some information will be missing in the table", AstropyUserWarning ) return tbl else: raise # Add serialized column information to table meta for use in constructing mixins tbl.meta['__serialized_columns__'] = info['meta']['__serialized_columns__'] # Use the `datatype` attribute info to update column attributes that are # NOT already handled via standard FITS column keys (name, dtype, unit). for col in info['datatype']: for attr in ['description', 'meta']: if attr in col: setattr(tbl[col['name']].info, attr, col[attr]) # Construct new table with mixins, using tbl.meta['__serialized_columns__'] # as guidance. tbl = serialize._construct_mixins_from_columns(tbl) return tbl
def table_contains_column(root, column):
    from .samples import JokerSamples

    path = meta_path(JokerSamples._hdf5_path)
    header = get_header_from_yaml(h.decode('utf-8') for h in root[path])

    columns = []
    for row in header['datatype']:
        columns.append(row['name'])

    return column in columns
def table_header_to_units(header_dataset):
    """
    Convert a YAML-ized astropy.table header into a dictionary that maps
    from column name to unit.
    """
    header = get_header_from_yaml(h.decode('utf-8') for h in header_dataset)

    units = dict()
    for row in header['datatype']:
        units[row['name']] = u.Unit(row.get('unit', u.one))

    return units
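# Usage sketch (assumptions: the illustrative file 'samples.hdf5' contains a
# dataset 'samples' written by astropy's HDF5 writer with serialize_meta=True,
# so a sibling 'samples.__table_column_meta__' dataset holds the YAML header).
import h5py

with h5py.File('samples.hdf5', 'r') as f:
    units = table_header_to_units(f['samples.__table_column_meta__'])
# e.g. {'P': Unit("d"), 'K': Unit("m / s"), 'phase': Unit(dimensionless)}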
def recursively_read_dict_contents(input):
    """
    Recursively read a dictionary, initializing quantities and tables
    from a dictionary read from an HDF5 file.

    Parameters
    ----------
    input : dict
        Dictionary read from HDF5.

    Returns
    -------
    dict
        Dictionary we want to use.
    """
    new_keys = [k for k in input.keys()]
    # if all(elem in new_keys for elem in ['wl_grid', 'data', 'time_grid']):
    #     wl = input['wl_grid']['value'] * u.Unit(input['wl_grid']['unit'])
    #     data = input['data']['value'] * u.Unit(input['data']['unit'])
    #     time = input['time_grid']['value'] * u.Unit(input['time_grid']['unit'])
    #     input = signal[str(input['data']['unit'])](wl, data, time)
    if all(elem in new_keys for elem in ['value', 'unit']):
        input['value'] = input['value'] * u.Unit(input['unit'])
    if any('.__table_column_meta__' in elem for elem in new_keys):
        table_keys = [
            elem for elem in new_keys if '.__table_column_meta__' in elem
        ]
        table_keys = (elem.split('.')[0] for elem in table_keys)
        for k in table_keys:
            table = Table(np.array(input[k]))
            header = meta.get_header_from_yaml(
                h.decode('utf-8')
                for h in input['{}.__table_column_meta__'.format(k)])
            header_cols = dict((x['name'], x) for x in header['datatype'])
            for col in table.columns.values():
                for attr in ('description', 'format', 'unit', 'meta'):
                    if attr in header_cols[col.name]:
                        setattr(col, attr, header_cols[col.name][attr])
            table = serialize._construct_mixins_from_columns(table)
            try:
                header['meta'].pop('__serialized_columns__')
                table.meta = header['meta']
            except KeyError:
                pass
            input[k] = table
    for key in new_keys:
        if isinstance(input[key], dict):
            input[key] = recursively_read_dict_contents(input[key])
    return input
def load_table(input_table, k):
    table = Table(np.array(input_table[k]))
    header = meta.get_header_from_yaml(
        h.decode('utf-8')
        for h in input_table['{}.__table_column_meta__'.format(k)])
    header_cols = dict((x['name'], x) for x in header['datatype'])
    for col in table.columns.values():
        for attr in ('description', 'format', 'unit', 'meta'):
            if attr in header_cols[col.name]:
                setattr(col, attr, header_cols[col.name][attr])
    table = serialize._construct_mixins_from_columns(table)
    try:
        header['meta'].pop('__serialized_columns__')
        table.meta = header['meta']
    except KeyError:
        pass
    return table
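# Usage sketch (assumptions: the illustrative file 'obs.hdf5' contains a
# structured dataset 'photometry' plus the 'photometry.__table_column_meta__'
# YAML dataset written by astropy's HDF5 writer with serialize_meta=True).
import h5py

with h5py.File('obs.hdf5', 'r') as f:
    tbl = load_table(f, 'photometry')
print(tbl.colnames, tbl.meta)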
def _read_tables(cls, group, path=None):
    if path is None:
        path = cls._hdf5_path

    samples = group[f'{path}']
    metadata = group[f'{path}.__table_column_meta__']

    header = meta.get_header_from_yaml(
        h.decode('utf-8') for h in metadata.read())

    table = Table(np.array(samples.read()))
    if 'meta' in list(header.keys()):
        table.meta = header['meta']
    table = serialize._construct_mixins_from_columns(table)

    return cls(table)
def _decode_mixins(tbl): """Decode a Table ``tbl`` that has astropy Columns + appropriate meta-data into the corresponding table with mixin columns (as appropriate). """ # If available read in __serialized_columns__ meta info which is stored # in FITS COMMENTS between two sentinels. try: i0 = tbl.meta['comments'].index('--BEGIN-ASTROPY-SERIALIZED-COLUMNS--') i1 = tbl.meta['comments'].index('--END-ASTROPY-SERIALIZED-COLUMNS--') except (ValueError, KeyError): return tbl # The YAML data are split into COMMENT cards, with lines longer than 70 # characters being split with a continuation character \ (backslash). # Strip the backslashes and join together. continuation_line = False lines = [] for line in tbl.meta['comments'][i0 + 1:i1]: if continuation_line: lines[-1] = lines[-1] + line[:70] else: lines.append(line[:70]) continuation_line = len(line) == 71 del tbl.meta['comments'][i0:i1 + 1] if not tbl.meta['comments']: del tbl.meta['comments'] info = meta.get_header_from_yaml(lines) # Add serialized column information to table meta for use in constructing mixins tbl.meta['__serialized_columns__'] = info['meta']['__serialized_columns__'] # Use the `datatype` attribute info to update column attributes that are # NOT already handled via standard FITS column keys (name, dtype, unit). for col in info['datatype']: for attr in ['description', 'meta']: if attr in col: setattr(tbl[col['name']].info, attr, col[attr]) # Construct new table with mixins, using tbl.meta['__serialized_columns__'] # as guidance. tbl = serialize._construct_mixins_from_columns(tbl) return tbl
def get_cols(self, lines):
    """
    Initialize the header Column objects from the table ``lines``.

    Parameters
    ----------
    lines : list
        List of table lines
    """
    # Cache a copy of the original input lines before processing below
    raw_lines = lines

    # Extract non-blank comment (header) lines with comment character stripped
    lines = list(self.process_lines(lines))

    # Validate that this is a ECSV file
    ecsv_header_re = r"""%ECSV [ ]
                         (?P<major> \d+) \. (?P<minor> \d+) \.? (?P<bugfix> \d+)? $"""

    no_header_msg = ('ECSV header line like "# %ECSV <version>" not found as first line.'
                     ' This is required for a ECSV file.')

    if not lines:
        raise core.InconsistentTableError(no_header_msg)

    match = re.match(ecsv_header_re, lines[0].strip(), re.VERBOSE)
    if not match:
        raise core.InconsistentTableError(no_header_msg)
    # ecsv_version could be constructed here, but it is not currently used.

    try:
        header = meta.get_header_from_yaml(lines)
    except ImportError as exc:
        if 'PyYAML package is required' in str(exc):
            warnings.warn("file looks like ECSV format but PyYAML is not installed "
                          "so it cannot be parsed as ECSV", AstropyWarning)
        raise core.InconsistentTableError('unable to parse yaml in meta header'
                                          ' (PyYAML package is required)')
    except meta.YamlParseError:
        raise core.InconsistentTableError('unable to parse yaml in meta header')

    if 'meta' in header:
        self.table_meta = header['meta']

    if 'delimiter' in header:
        delimiter = header['delimiter']
        if delimiter not in DELIMITERS:
            raise ValueError('only space and comma are allowed for delimiter in ECSV format')
        self.splitter.delimiter = delimiter
        self.data.splitter.delimiter = delimiter

    # Create the list of io.ascii column objects from `header`
    header_cols = OrderedDict((x['name'], x) for x in header['datatype'])
    self.names = [x['name'] for x in header['datatype']]

    # Read the first non-commented line of table and split to get the CSV
    # header column names. This is essentially what the Basic reader does.
    header_line = next(super().process_lines(raw_lines))
    header_names = next(self.splitter([header_line]))

    # Check for consistency of the ECSV vs. CSV header column names
    if header_names != self.names:
        raise core.InconsistentTableError('column names from ECSV header {} do not '
                                          'match names from header line of CSV data {}'
                                          .format(self.names, header_names))

    # BaseHeader method to create self.cols, which is a list of
    # io.ascii.core.Column objects (*not* Table Column objects).
    self._set_cols_from_names()

    # Transfer attributes from the column descriptor stored in the input
    # header YAML metadata to the new columns to create this table.
    for col in self.cols:
        for attr in ('description', 'format', 'unit', 'meta'):
            if attr in header_cols[col.name]:
                setattr(col, attr, header_cols[col.name][attr])
        col.dtype = header_cols[col.name]['datatype']
        # ECSV "string" means numpy dtype.kind == 'U' AKA str in Python 3
        if col.dtype == 'string':
            col.dtype = 'str'
        if col.dtype.startswith('complex'):
            raise TypeError('ecsv reader does not support complex number types')
def get_cols(self, lines):
    """
    READ: Initialize the header Column objects from the table ``lines``.

    Parameters
    ----------
    lines : list
        List of table lines
    """
    # Cache a copy of the original input lines before processing below
    raw_lines = lines

    # Extract non-blank comment (header) lines with comment character stripped
    lines = list(self.process_lines(lines))

    # Validate that this is a ECSV file
    ecsv_header_re = r"""%ECSV [ ]
                         (?P<major> \d+) \. (?P<minor> \d+) \.? (?P<bugfix> \d+)? $"""

    no_header_msg = (
        'ECSV header line like "# %ECSV <version>" not found as first line.'
        ' This is required for a ECSV file.')

    if not lines:
        raise core.InconsistentTableError(no_header_msg)

    match = re.match(ecsv_header_re, lines[0].strip(), re.VERBOSE)
    if not match:
        raise core.InconsistentTableError(no_header_msg)
    # ecsv_version could be constructed here, but it is not currently used.

    try:
        header = meta.get_header_from_yaml(lines)
    except ImportError as exc:
        if 'PyYAML package is required' in str(exc):
            warnings.warn(
                "file looks like ECSV format but PyYAML is not installed "
                "so it cannot be parsed as ECSV", AstropyWarning)
        raise core.InconsistentTableError(
            'unable to parse yaml in meta header'
            ' (PyYAML package is required)')
    except meta.YamlParseError:
        raise core.InconsistentTableError(
            'unable to parse yaml in meta header')

    if 'meta' in header:
        self.table_meta = header['meta']

    if 'delimiter' in header:
        delimiter = header['delimiter']
        if delimiter not in DELIMITERS:
            raise ValueError(
                'only space and comma are allowed for delimiter in ECSV format'
            )
        self.splitter.delimiter = delimiter
        self.data.splitter.delimiter = delimiter

    # Create the list of io.ascii column objects from `header`
    header_cols = OrderedDict((x['name'], x) for x in header['datatype'])
    self.names = [x['name'] for x in header['datatype']]

    # Read the first non-commented line of table and split to get the CSV
    # header column names. This is essentially what the Basic reader does.
    header_line = next(super().process_lines(raw_lines))
    header_names = next(self.splitter([header_line]))

    # Check for consistency of the ECSV vs. CSV header column names
    if header_names != self.names:
        raise core.InconsistentTableError(
            'column names from ECSV header {} do not '
            'match names from header line of CSV data {}'.format(
                self.names, header_names))

    # BaseHeader method to create self.cols, which is a list of
    # io.ascii.core.Column objects (*not* Table Column objects).
    self._set_cols_from_names()

    # Transfer attributes from the column descriptor stored in the input
    # header YAML metadata to the new columns to create this table.
    for col in self.cols:
        for attr in ('description', 'format', 'unit', 'meta', 'subtype'):
            if attr in header_cols[col.name]:
                setattr(col, attr, header_cols[col.name][attr])

        col.dtype = header_cols[col.name]['datatype']
        if col.dtype not in ECSV_DATATYPES:
            raise ValueError(
                f'datatype {col.dtype!r} of column {col.name!r} '
                f'is not in allowed values {ECSV_DATATYPES}')

        # Subtype is written like "int64[2,null]" and we want to split this
        # out to "int64" and [2, None].
        subtype = col.subtype
        if subtype and '[' in subtype:
            idx = subtype.index('[')
            col.subtype = subtype[:idx]
            col.shape = json.loads(subtype[idx:])

        # Convert ECSV "string" to numpy "str"
        for attr in ('dtype', 'subtype'):
            if getattr(col, attr) == 'string':
                setattr(col, attr, 'str')

        # ECSV subtype of 'json' maps to numpy 'object' dtype
        if col.subtype == 'json':
            col.subtype = 'object'
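# A small ECSV input of the kind get_cols() parses, read here through the
# public astropy.io.ascii interface (a sketch; column names and values are
# illustrative). The commented YAML header carries datatype/unit/description
# per column, followed by a plain header line that must match the YAML names.
from astropy.io import ascii

ecsv_text = """\
# %ECSV 0.9
# ---
# datatype:
# - {name: a, datatype: int64, description: index}
# - {name: b, datatype: float64, unit: m}
a b
1 1.5
2 2.5
"""
t = ascii.read(ecsv_text, format='ecsv')
print(t['b'].unit, t['a'].description)   # m index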
def read_table_hdf5(input, path=None, character_as_bytes=True):
    """
    Read a Table object from an HDF5 file

    This requires `h5py <http://www.h5py.org/>`_ to be installed. If more
    than one table is present in the HDF5 file or group, the first table
    is read in and a warning is displayed.

    Parameters
    ----------
    input : str or :class:`h5py.File` or :class:`h5py.Group` or :class:`h5py.Dataset`
        If a string, the filename to read the table from. If an h5py object,
        either the file or the group object to read the table from.
    path : str
        The path from which to read the table inside the HDF5 file.
        This should be relative to the input file or group.
    character_as_bytes : bool
        If `True` then Table columns are left as bytes.
        If `False` then Table columns are converted to unicode.
    """
    try:
        import h5py
    except ImportError:
        raise Exception("h5py is required to read and write HDF5 files")

    # This function is recursive, and only gets to actually reading the data
    # once ``input`` is an HDF5 Dataset. Moreover, the input variable is
    # changed in place.
    # Here, we save its value to be used at the end when the conditions are
    # right.
    input_save = input
    if isinstance(input, (h5py.File, h5py.Group)):

        # If a path was specified, follow the path
        if path is not None:
            try:
                input = input[path]
            except (KeyError, ValueError):
                raise OSError(f"Path {path} does not exist")

        # `input` is now either a group or a dataset. If it is a group, we
        # will search for all structured arrays inside the group, and if there
        # is one we can proceed otherwise an error is raised. If it is a
        # dataset, we just proceed with the reading.
        if isinstance(input, h5py.Group):

            # Find all structured arrays in group
            arrays = _find_all_structured_arrays(input)

            if len(arrays) == 0:
                raise ValueError(f"no table found in HDF5 group {path}")
            elif len(arrays) > 0:
                path = arrays[0] if path is None else path + '/' + arrays[0]
                if len(arrays) > 1:
                    warnings.warn("path= was not specified but multiple tables"
                                  " are present, reading in first available"
                                  " table (path={})".format(path),
                                  AstropyUserWarning)
                return read_table_hdf5(input, path=path)

    elif not isinstance(input, h5py.Dataset):

        # If a file object was passed, then we need to extract the filename
        # because h5py cannot properly read in file objects.
        if hasattr(input, 'read'):
            try:
                input = input.name
            except AttributeError:
                raise TypeError("h5py can only open regular files")

        # Open the file for reading, and recursively call read_table_hdf5 with
        # the file object and the path.
        f = h5py.File(input, 'r')

        try:
            return read_table_hdf5(f, path=path,
                                   character_as_bytes=character_as_bytes)
        finally:
            f.close()

    # If we are here, `input` should be a Dataset object, which we can now
    # convert to a Table.

    # Create a Table object
    from astropy.table import Table, meta, serialize

    table = Table(np.array(input))

    # Read the meta-data from the file. For back-compatibility, we can read
    # the old file format where the serialized metadata were saved in the
    # attributes of the HDF5 dataset.
    # In the new format, instead, metadata are stored in a new dataset in the
    # same file. This was introduced in Astropy 3.0.
    old_version_meta = META_KEY in input.attrs
    new_version_meta = path is not None and meta_path(path) in input_save
    if old_version_meta or new_version_meta:
        if new_version_meta:
            header = meta.get_header_from_yaml(
                h.decode('utf-8') for h in input_save[meta_path(path)])
        else:
            # Must be old_version_meta is True. if (A or B) and not A then B is True
            header = meta.get_header_from_yaml(
                h.decode('utf-8') for h in input.attrs[META_KEY])
        if 'meta' in list(header.keys()):
            table.meta = header['meta']

        header_cols = dict((x['name'], x) for x in header['datatype'])
        for col in table.columns.values():
            for attr in ('description', 'format', 'unit', 'meta'):
                if attr in header_cols[col.name]:
                    setattr(col, attr, header_cols[col.name][attr])

        # Construct new table with mixins, using tbl.meta['__serialized_columns__']
        # as guidance.
        table = serialize._construct_mixins_from_columns(table)

    else:
        # Read the meta-data from the file
        table.meta.update(input.attrs)

    if not character_as_bytes:
        table.convert_bytestring_to_unicode()

    return table
def read_table_hdf5(input, path=None, character_as_bytes=True):
    """
    Read a Table object from an HDF5 file

    This requires `h5py <http://www.h5py.org/>`_ to be installed. If more
    than one table is present in the HDF5 file or group, the first table
    is read in and a warning is displayed.

    Parameters
    ----------
    input : str or :class:`h5py:File` or :class:`h5py:Group` or :class:`h5py:Dataset`
        If a string, the filename to read the table from. If an h5py object,
        either the file or the group object to read the table from.
    path : str
        The path from which to read the table inside the HDF5 file.
        This should be relative to the input file or group.
    character_as_bytes : bool
        If `True` then Table columns are left as bytes.
        If `False` then Table columns are converted to unicode.
    """
    try:
        import h5py
    except ImportError:
        raise Exception("h5py is required to read and write HDF5 files")

    # This function is recursive, and only gets to actually reading the data
    # once ``input`` is an HDF5 Dataset. Moreover, the input variable is
    # changed in place.
    # Here, we save its value to be used at the end when the conditions are
    # right.
    input_save = input
    if isinstance(input, (h5py.File, h5py.Group)):

        # If a path was specified, follow the path
        if path is not None:
            try:
                input = input[path]
            except (KeyError, ValueError):
                raise OSError("Path {0} does not exist".format(path))

        # `input` is now either a group or a dataset. If it is a group, we
        # will search for all structured arrays inside the group, and if there
        # is one we can proceed otherwise an error is raised. If it is a
        # dataset, we just proceed with the reading.
        if isinstance(input, h5py.Group):

            # Find all structured arrays in group
            arrays = _find_all_structured_arrays(input)

            if len(arrays) == 0:
                raise ValueError("no table found in HDF5 group {0}".
                                 format(path))
            elif len(arrays) > 0:
                path = arrays[0] if path is None else path + '/' + arrays[0]
                if len(arrays) > 1:
                    warnings.warn("path= was not specified but multiple tables"
                                  " are present, reading in first available"
                                  " table (path={0})".format(path),
                                  AstropyUserWarning)
                return read_table_hdf5(input, path=path)

    elif not isinstance(input, h5py.Dataset):

        # If a file object was passed, then we need to extract the filename
        # because h5py cannot properly read in file objects.
        if hasattr(input, 'read'):
            try:
                input = input.name
            except AttributeError:
                raise TypeError("h5py can only open regular files")

        # Open the file for reading, and recursively call read_table_hdf5 with
        # the file object and the path.
        f = h5py.File(input, 'r')

        try:
            return read_table_hdf5(f, path=path,
                                   character_as_bytes=character_as_bytes)
        finally:
            f.close()

    # If we are here, `input` should be a Dataset object, which we can now
    # convert to a Table.

    # Create a Table object
    from astropy.table import Table, meta, serialize

    table = Table(np.array(input))

    # Read the meta-data from the file. For back-compatibility, we can read
    # the old file format where the serialized metadata were saved in the
    # attributes of the HDF5 dataset.
    # In the new format, instead, metadata are stored in a new dataset in the
    # same file. This was introduced in Astropy 3.0.
    old_version_meta = META_KEY in input.attrs
    new_version_meta = path is not None and meta_path(path) in input_save
    if old_version_meta or new_version_meta:
        if new_version_meta:
            header = meta.get_header_from_yaml(
                h.decode('utf-8') for h in input_save[meta_path(path)])
        elif old_version_meta:
            header = meta.get_header_from_yaml(
                h.decode('utf-8') for h in input.attrs[META_KEY])
        if 'meta' in list(header.keys()):
            table.meta = header['meta']

        header_cols = dict((x['name'], x) for x in header['datatype'])
        for col in table.columns.values():
            for attr in ('description', 'format', 'unit', 'meta'):
                if attr in header_cols[col.name]:
                    setattr(col, attr, header_cols[col.name][attr])

        # Construct new table with mixins, using tbl.meta['__serialized_columns__']
        # as guidance.
        table = serialize._construct_mixins_from_columns(table)

    else:
        # Read the meta-data from the file
        table.meta.update(input.attrs)

    if not character_as_bytes:
        table.convert_bytestring_to_unicode()

    return table
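# Usage sketch (assumptions: h5py is installed and the illustrative file
# 'events.hdf5' was written with serialize_meta=True at path 'data/events').
# The public Table.read interface dispatches to read_table_hdf5.
from astropy.table import Table

t = Table.read('events.hdf5', path='data/events')

# Or call the function above directly on an open h5py handle:
# import h5py
# with h5py.File('events.hdf5', 'r') as f:
#     t = read_table_hdf5(f, path='data/events', character_as_bytes=False)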
def read_table_parquet(input, include_names=None, exclude_names=None,
                       schema_only=False, filters=None):
    """
    Read a Table object from a Parquet file.

    This requires `pyarrow <https://arrow.apache.org/docs/python/>`_
    to be installed.

    The ``filters`` parameter consists of predicates that are expressed
    in disjunctive normal form (DNF), like ``[[('x', '=', 0), ...], ...]``.
    DNF allows arbitrary boolean logical combinations of single column
    predicates. The innermost tuples each describe a single column predicate.
    The list of inner predicates is interpreted as a conjunction (AND),
    forming a more selective and multiple column predicate. Finally, the most
    outer list combines these filters as a disjunction (OR).

    Predicates may also be passed as List[Tuple]. This form is interpreted
    as a single conjunction. To express OR in predicates, one must
    use the (preferred) List[List[Tuple]] notation.

    Each tuple has format: (``key``, ``op``, ``value``) and compares the
    ``key`` with the ``value``.
    The supported ``op`` are: ``=`` or ``==``, ``!=``, ``<``, ``>``, ``<=``,
    ``>=``, ``in`` and ``not in``. If the ``op`` is ``in`` or ``not in``, the
    ``value`` must be a collection such as a ``list``, a ``set`` or a
    ``tuple``.

    Examples:

    .. code-block:: python

        ('x', '=', 0)
        ('y', 'in', ['a', 'b', 'c'])
        ('z', 'not in', {'a','b'})

    Parameters
    ----------
    input : str or path-like or file-like object
        If a string or path-like object, the filename to read the table from.
        If a file-like object, the stream to read data.
    include_names : list [str], optional
        List of names to include in output. If not supplied, then
        include all columns.
    exclude_names : list [str], optional
        List of names to exclude from output (applied after ``include_names``).
        If not supplied then no columns are excluded.
    schema_only : bool, optional
        Only read the schema/metadata with table information.
    filters : list [tuple] or list [list [tuple]] or None, optional
        Rows which do not match the filter predicate will be removed from
        scanned data. See `pyarrow.parquet.read_table()` for details.

    Returns
    -------
    table : `~astropy.table.Table`
        Table will have zero rows and only metadata information
        if ``schema_only`` is True.
    """
    pa, parquet, _ = get_pyarrow()

    if not isinstance(input, (str, os.PathLike)):
        # The 'read' attribute is the key component of a generic
        # file-like object.
        if not hasattr(input, 'read'):
            raise TypeError(
                "pyarrow can only open path-like or file-like objects.")

    schema = parquet.read_schema(input)

    # Pyarrow stores all metadata as byte-strings, so we convert
    # to UTF-8 strings here.
    if schema.metadata is not None:
        md = {k.decode('UTF-8'): v.decode('UTF-8')
              for k, v in schema.metadata.items()}
    else:
        md = {}

    from astropy.table import Table, meta, serialize

    # parse metadata from table yaml
    meta_dict = {}
    if 'table_meta_yaml' in md:
        meta_yaml = md.pop('table_meta_yaml').split('\n')
        meta_hdr = meta.get_header_from_yaml(meta_yaml)
        if 'meta' in meta_hdr:
            meta_dict = meta_hdr['meta']
    else:
        meta_hdr = None

    # parse and set serialized columns
    full_table_columns = {name: name for name in schema.names}
    has_serialized_columns = False
    if '__serialized_columns__' in meta_dict:
        has_serialized_columns = True
        serialized_columns = meta_dict['__serialized_columns__']
        for scol in serialized_columns:
            for name in _get_names(serialized_columns[scol]):
                full_table_columns[name] = scol

    use_names = set(full_table_columns.values())
    # Apply include_names before exclude_names
    if include_names is not None:
        use_names.intersection_update(include_names)
    if exclude_names is not None:
        use_names.difference_update(exclude_names)
    # Preserve column ordering via list, and use this dict trick
    # to remove duplicates and preserve ordering (for mixin columns)
    use_names = list(dict.fromkeys([x for x in full_table_columns.values()
                                    if x in use_names]))

    # names_to_read is a list of actual serialized column names, where
    # e.g. the requested name 'time' becomes ['time.jd1', 'time.jd2']
    names_to_read = []
    for name in use_names:
        names = [n for n, col in full_table_columns.items() if name == col]
        names_to_read.extend(names)

    if not names_to_read:
        raise ValueError("No include_names specified were found in the table.")

    # We need to pop any unread serialized columns out of the meta_dict.
    if has_serialized_columns:
        for scol in list(meta_dict['__serialized_columns__'].keys()):
            if scol not in use_names:
                meta_dict['__serialized_columns__'].pop(scol)

    # whether to return the whole table or a formatted empty table.
    if not schema_only:
        # Read the pyarrow table, specifying columns and filters.
        pa_table = parquet.read_table(input, columns=names_to_read,
                                      filters=filters)
        num_rows = pa_table.num_rows
    else:
        num_rows = 0

    # Now need to convert parquet table to Astropy
    dtype = []
    for name in names_to_read:
        # Pyarrow string and byte columns do not have native length information
        # so we must determine those here.
        if schema.field(name).type not in (pa.string(), pa.binary()):
            # Convert the pyarrow type into a numpy dtype (which is returned
            # by the to_pandas_dtype() method).
            dtype.append(schema.field(name).type.to_pandas_dtype())
            continue

        # Special-case for string and binary columns
        md_name = f'table::len::{name}'
        if md_name in md:
            # String/bytes length from header.
            strlen = int(md[md_name])
        elif schema_only:  # Find the maximum string length.
            # Choose an arbitrary string length since we
            # are not reading in the table.
            strlen = 10
            warnings.warn(f"No {md_name} found in metadata. "
                          f"Guessing {strlen} for schema.",
                          AstropyUserWarning)
        else:
            strlen = max([len(row.as_py()) for row in pa_table[name]])
            warnings.warn(f"No {md_name} found in metadata. "
                          f"Using longest string ({strlen} characters).",
                          AstropyUserWarning)
        dtype.append(f'U{strlen}' if schema.field(name).type == pa.string()
                     else f'|S{strlen}')

    # Create the empty numpy record array to store the pyarrow data.
    data = np.zeros(num_rows, dtype=list(zip(names_to_read, dtype)))

    if not schema_only:
        # Convert each column in the pyarrow table to a numpy array
        for name in names_to_read:
            data[name][:] = pa_table[name].to_numpy()

    table = Table(data=data, meta=meta_dict)

    if meta_hdr is not None:
        # Set description, format, unit, meta from the column
        # metadata that was serialized with the table.
        header_cols = dict((x['name'], x) for x in meta_hdr['datatype'])
        for col in table.columns.values():
            for attr in ('description', 'format', 'unit', 'meta'):
                if attr in header_cols[col.name]:
                    setattr(col, attr, header_cols[col.name][attr])

    # Convert all compound columns to astropy objects
    # (e.g. time.jd1, time.jd2 into a single time column)
    table = serialize._construct_mixins_from_columns(table)

    return table
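# Usage sketch (assumptions: pyarrow is installed, 'catalog.parquet' was
# written by astropy's Parquet writer, and the column names 'ra', 'dec',
# 'mag' are illustrative).
tbl = read_table_parquet('catalog.parquet',
                         include_names=['ra', 'dec', 'mag'],
                         filters=[[('mag', '<', 20.0)]])

# Schema-only read: zero rows, but dtypes/units/column meta are populated.
schema_tbl = read_table_parquet('catalog.parquet', schema_only=True)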
def write_table_hdf5(table, output, path=None, compression=False,
                     append=False, overwrite=False, serialize_meta=False,
                     metadata_conflicts='error',
                     **create_dataset_kwargs):
    """
    Write a Table object to an HDF5 file

    This requires `h5py <http://www.h5py.org/>`_ to be installed.

    Parameters
    ----------
    table : `~astropy.table.Table`
        Data table that is to be written to file.
    output : str or :class:`h5py:File` or :class:`h5py:Group`
        If a string, the filename to write the table to. If an h5py object,
        either the file or the group object to write the table to.
    path : str
        The path to which to write the table inside the HDF5 file.
        This should be relative to the input file or group.
        If not specified, defaults to ``__astropy_table__``.
    compression : bool or str or int
        Whether to compress the table inside the HDF5 file. If set to `True`,
        ``'gzip'`` compression is used. If a string is specified, it should be
        one of ``'gzip'``, ``'szip'``, or ``'lzf'``. If an integer is
        specified (in the range 0-9), ``'gzip'`` compression is used, and the
        integer denotes the compression level.
    append : bool
        Whether to append the table to an existing HDF5 file.
    overwrite : bool
        Whether to overwrite any existing file without warning.
        If ``append=True`` and ``overwrite=True`` then only the dataset will
        be replaced; the file/group will not be overwritten.
    serialize_meta : bool
        Whether to serialize rich table meta-data to a companion YAML dataset;
        this is required to round-trip mixin columns and column attributes
        such as unit, format, and description.
    metadata_conflicts : str
        How to proceed with metadata conflicts. This should be one of:
            * ``'silent'``: silently pick the last conflicting meta-data value
            * ``'warn'``: pick the last conflicting meta-data value,
              but emit a warning
            * ``'error'``: raise an exception (default).
    **create_dataset_kwargs
        Additional keyword arguments are passed to `h5py.File.create_dataset`.
    """
    from astropy.table import meta

    try:
        import h5py
    except ImportError:
        raise Exception("h5py is required to read and write HDF5 files")

    if path is None:
        # table is just an arbitrary, hardcoded string here.
        path = '__astropy_table__'
    elif path.endswith('/'):
        raise ValueError("table path should end with table name, not /")

    if '/' in path:
        group, name = path.rsplit('/', 1)
    else:
        group, name = None, path

    if isinstance(output, (h5py.File, h5py.Group)):
        if len(list(output.keys())) > 0 and name == '__astropy_table__':
            raise ValueError("table path should always be set via the "
                             "path= argument when writing to existing "
                             "files")
        elif name == '__astropy_table__':
            warnings.warn("table path was not set via the path= argument; "
                          "using default path {}".format(path))

        if group:
            try:
                output_group = output[group]
            except (KeyError, ValueError):
                output_group = output.create_group(group)
        else:
            output_group = output

    elif isinstance(output, str):

        if os.path.exists(output) and not append:
            if overwrite and not append:
                os.remove(output)
            else:
                raise OSError(f"File exists: {output}")

        # Open the file for appending or writing
        f = h5py.File(output, 'a' if append else 'w')

        # Recursively call the write function
        try:
            return write_table_hdf5(table, f, path=path,
                                    compression=compression, append=append,
                                    overwrite=overwrite,
                                    serialize_meta=serialize_meta,
                                    **create_dataset_kwargs)
        finally:
            f.close()

    else:
        raise TypeError('output should be a string or an h5py File or '
                        'Group object')

    # Check whether table already exists
    existing_header = None
    if name in output_group:
        if append and overwrite:
            # Delete only the dataset itself
            del output_group[name]
        elif append:
            # Data table exists, so we interpret "append" to mean "extend
            # existing table with the table passed in". However, this requires
            # the table to have been written by this function in the past, so
            # it should have a metadata header
            if meta_path(name) not in output_group:
                raise ValueError("No metadata exists for existing table. We "
                                 "can only append tables if metadata "
                                 "is consistent for all tables")

            # Load existing table header:
            existing_header = get_header_from_yaml(
                h.decode('utf-8') for h in output_group[meta_path(name)])
        else:
            raise OSError(f"Table {path} already exists")

    # Encode any mixin columns as plain columns + appropriate metadata
    table = _encode_mixins(table)

    # Table with numpy unicode strings can't be written in HDF5 so
    # to write such a table a copy of table is made containing columns as
    # bytestrings. Now this copy of the table can be written in HDF5.
    if any(col.info.dtype.kind == 'U' for col in table.itercols()):
        table = table.copy(copy_data=False)
        table.convert_unicode_to_bytestring()

    # Warn if information will be lost when serialize_meta=False. This is
    # hardcoded to the set difference between column info attributes and what
    # HDF5 can store natively (name, dtype) with no meta.
    if serialize_meta is False:
        for col in table.itercols():
            for attr in ('unit', 'format', 'description', 'meta'):
                if getattr(col.info, attr, None) not in (None, {}):
                    warnings.warn(
                        "table contains column(s) with defined 'unit', 'format',"
                        " 'description', or 'meta' info attributes. These will"
                        " be dropped since serialize_meta=False.",
                        AstropyUserWarning)

    if existing_header is None:  # Just write the table and metadata

        # Write the table to the file
        if compression:
            if compression is True:
                compression = 'gzip'
            dset = output_group.create_dataset(name, data=table.as_array(),
                                               compression=compression,
                                               **create_dataset_kwargs)
        else:
            dset = output_group.create_dataset(name, data=table.as_array(),
                                               **create_dataset_kwargs)

        if serialize_meta:
            header_yaml = meta.get_yaml_from_table(table)
            header_encoded = [h.encode('utf-8') for h in header_yaml]
            output_group.create_dataset(meta_path(name), data=header_encoded)

        else:
            # Write the Table meta dict key:value pairs to the file as HDF5
            # attributes. This works only for a limited set of scalar data types
            # like numbers, strings, etc., but not any complex types. This path
            # also ignores column meta like unit or format.
            for key in table.meta:
                val = table.meta[key]

                try:
                    dset.attrs[key] = val
                except TypeError:
                    warnings.warn("Attribute `{}` of type {} cannot be written to "
                                  "HDF5 files - skipping. (Consider specifying "
                                  "serialize_meta=True to write all meta data)"
                                  .format(key, type(val)), AstropyUserWarning)

    else:  # We need to append the tables!
        try:
            # FIXME: do something with the merged metadata!
            metadata.merge(existing_header['meta'], table.meta,
                           metadata_conflicts=metadata_conflicts)
        except metadata.MergeConflictError:
            raise metadata.MergeConflictError(
                "Cannot append table to existing file because "
                "the existing file table metadata and this "
                "table object's metadata do not match. If you "
                "want to ignore this issue, or change to a "
                "warning, set metadata_conflicts='silent' or 'warn'.")

        # Now compare datatype of this object and on disk
        this_header = get_header_from_yaml(get_yaml_from_table(table))

        if not _custom_tbl_dtype_compare(existing_header['datatype'],
                                         this_header['datatype']):
            raise ValueError(
                "Cannot append table to existing file because "
                "the existing file table datatype and this "
                "object's table datatype do not match. "
                f"{existing_header['datatype']} vs. {this_header['datatype']}")

        # If we got here, we can now try to append:
        current_size = len(output_group[name])
        output_group[name].resize((current_size + len(table), ))
        output_group[name][current_size:] = table.as_array()