def _decode_mixins(tbl):
    """Decode a Table ``tbl`` whose plain Columns plus serialized meta-data
    describe astropy mixin columns, returning the table with those mixins
    reconstructed.

    If the serialized-column sentinels are absent the input table is
    returned unchanged.
    """
    # Serialized-column info, when present, lives in the FITS COMMENT cards
    # between two sentinel lines.
    try:
        begin = tbl.meta['comments'].index('--BEGIN-ASTROPY-SERIALIZED-COLUMNS--')
        end = tbl.meta['comments'].index('--END-ASTROPY-SERIALIZED-COLUMNS--')
    except (ValueError, KeyError):
        # No sentinels (or no comments at all): nothing to decode.
        return tbl

    # The YAML payload is split across COMMENT cards; cards longer than 70
    # characters carry a trailing backslash continuation at position 71.
    # Strip the backslash and re-join the pieces.
    yaml_lines = []
    pending_continuation = False
    for raw in tbl.meta['comments'][begin + 1:end]:
        chunk = raw[:70]
        if pending_continuation:
            yaml_lines[-1] += chunk
        else:
            yaml_lines.append(chunk)
        pending_continuation = (len(raw) == 71)

    # Remove the sentinel block from the comments; drop 'comments' entirely
    # if nothing else was in it.
    del tbl.meta['comments'][begin:end + 1]
    if not tbl.meta['comments']:
        del tbl.meta['comments']

    try:
        info = meta.get_header_from_yaml(yaml_lines)
    except ImportError as exc:
        # get_header_from_yaml needs PyYAML; degrade gracefully (with a
        # warning) rather than fail hard when it is missing.
        if 'PyYAML package is required' in str(exc):
            warnings.warn(
                "the file contains information about Astropy native objects "
                "(mixin columns) that have been serialized when writing it, "
                "but the PyYAML package is required to read those. Without "
                "this package some information will be missing in the table",
                AstropyUserWarning
            )
            return tbl
        else:
            raise

    # Stash the serialized column info in the table meta; the mixin
    # constructor below uses it as guidance.
    tbl.meta['__serialized_columns__'] = info['meta']['__serialized_columns__']

    # Apply column attributes that are NOT carried by the standard FITS
    # column keys (name, dtype and unit come from the FITS columns).
    for coldef in info['datatype']:
        for attr in ('description', 'meta'):
            if attr in coldef:
                setattr(tbl[coldef['name']].info, attr, coldef[attr])

    # Build the new table with mixins reconstructed.
    return serialize._construct_mixins_from_columns(tbl)
def __call__(self, cols, meta):
    """Build a Table from ``cols``/``meta`` and reconstruct any mixins.

    The parent class produces a Table containing only plain Column
    subclasses.  If the table meta carries the special
    '__mixin_columns__' key, that information is used to rebuild the
    mixin columns and remove the original data columns; otherwise the
    table passes through unchanged.
    """
    plain_table = super().__call__(cols, meta)
    return serialize._construct_mixins_from_columns(plain_table)
def __call__(self, cols, meta):
    """Convert ``cols`` and ``meta`` into a Table, restoring mixin columns.

    After the superclass builds a Table of plain Column subclasses, a
    '__mixin_columns__' entry in the table meta (if any) is used to
    reconstruct the corresponding mixin columns and discard their raw
    data columns.  Without such an entry the table is returned as-is.
    """
    table = super().__call__(cols, meta)
    table = serialize._construct_mixins_from_columns(table)
    return table
def recursively_read_dict_contents(input):
    """Recursively walk a dictionary read from an HDF5 file, rebuilding
    quantities and Tables from their serialized form.

    Parameters
    ----------
    input : dict
        Dictionary read from hdf5.

    Returns
    -------
    dict
        The same dictionary with quantities/tables reconstructed.
    """
    keys = list(input.keys())

    # A {'value': ..., 'unit': ...} pair encodes a quantity: attach the unit.
    if all(required in keys for required in ('value', 'unit')):
        input['value'] = input['value'] * u.Unit(input['unit'])

    # A '<name>.__table_column_meta__' entry marks '<name>' as a
    # serialized Table: rebuild it with attributes and mixins restored.
    meta_keys = [key for key in keys if '.__table_column_meta__' in key]
    if meta_keys:
        for name in (key.split('.')[0] for key in meta_keys):
            table = Table(np.array(input[name]))
            header = meta.get_header_from_yaml(
                raw.decode('utf-8')
                for raw in input['{}.__table_column_meta__'.format(name)])
            header_cols = {entry['name']: entry for entry in header['datatype']}
            # Re-apply per-column attributes serialized in the header.
            for col in table.columns.values():
                for attr in ('description', 'format', 'unit', 'meta'):
                    if attr in header_cols[col.name]:
                        setattr(col, attr, header_cols[col.name][attr])
            table = serialize._construct_mixins_from_columns(table)
            # Attach the table-level meta, minus the serialized-column info
            # (KeyError means there was nothing to attach).
            try:
                header['meta'].pop('__serialized_columns__')
                table.meta = header['meta']
            except KeyError:
                pass
            input[name] = table

    # Recurse into nested dictionaries.
    for key in keys:
        if isinstance(input[key], dict):
            input[key] = recursively_read_dict_contents(input[key])
    return input
def load_table(input_table, k):
    """Reconstruct the Table stored under key ``k`` of ``input_table``.

    Column attributes and mixin columns are restored from the serialized
    YAML header stored under ``'<k>.__table_column_meta__'``.
    """
    table = Table(np.array(input_table[k]))
    header = meta.get_header_from_yaml(
        raw.decode('utf-8')
        for raw in input_table['{}.__table_column_meta__'.format(k)])

    # Re-apply per-column attributes that were serialized in the header.
    columns_by_name = {entry['name']: entry for entry in header['datatype']}
    for col in table.columns.values():
        serialized = columns_by_name[col.name]
        for attr in ('description', 'format', 'unit', 'meta'):
            if attr in serialized:
                setattr(col, attr, serialized[attr])

    # Rebuild mixin columns, then attach the remaining table-level meta
    # (KeyError means there was no meta to attach).
    table = serialize._construct_mixins_from_columns(table)
    try:
        header['meta'].pop('__serialized_columns__')
        table.meta = header['meta']
    except KeyError:
        pass
    return table
def _read_tables(cls, group, path=None):
    """Read the Table stored at ``path`` inside ``group`` and wrap it in ``cls``.

    When ``path`` is None the class default ``cls._hdf5_path`` is used.
    Table-level meta and mixin columns are restored from the companion
    '<path>.__table_column_meta__' dataset.
    """
    if path is None:
        path = cls._hdf5_path
    dataset = group[f'{path}']
    column_meta = group[f'{path}.__table_column_meta__']
    header = meta.get_header_from_yaml(
        raw.decode('utf-8') for raw in column_meta.read())
    table = Table(np.array(dataset.read()))
    if 'meta' in list(header.keys()):
        table.meta = header['meta']
    table = serialize._construct_mixins_from_columns(table)
    return cls(table)
def _decode_mixins(tbl):
    """Decode a Table ``tbl`` carrying astropy Columns plus serialized
    meta-data into the equivalent table with mixin columns restored.
    """
    # Locate the serialized-column YAML, stored in FITS COMMENT cards
    # between two sentinels; bail out unchanged if it is not there.
    try:
        start = tbl.meta['comments'].index('--BEGIN-ASTROPY-SERIALIZED-COLUMNS--')
        stop = tbl.meta['comments'].index('--END-ASTROPY-SERIALIZED-COLUMNS--')
    except (ValueError, KeyError):
        return tbl

    # COMMENT cards longer than 70 characters were split with a trailing
    # backslash continuation; strip the backslash and glue the pieces back.
    lines = []
    continued = False
    for card in tbl.meta['comments'][start + 1:stop]:
        if continued:
            lines[-1] = lines[-1] + card[:70]
        else:
            lines.append(card[:70])
        continued = len(card) == 71

    # Remove the sentinel block; drop 'comments' entirely when it is
    # empty afterwards.
    del tbl.meta['comments'][start:stop + 1]
    if not tbl.meta['comments']:
        del tbl.meta['comments']

    info = meta.get_header_from_yaml(lines)

    # Serialized column info drives the mixin construction below.
    tbl.meta['__serialized_columns__'] = info['meta']['__serialized_columns__']

    # Update column attributes that standard FITS column keys (name,
    # dtype, unit) do not already cover.
    for col in info['datatype']:
        for attr in ('description', 'meta'):
            if attr in col:
                setattr(tbl[col['name']].info, attr, col[attr])

    return serialize._construct_mixins_from_columns(tbl)
def read_table_hdf5(input, path=None, character_as_bytes=True):
    """
    Read a Table object from an HDF5 file

    This requires `h5py <http://www.h5py.org/>`_ to be installed. If more
    than one table is present in the HDF5 file or group, the first table
    is read in and a warning is displayed.

    Parameters
    ----------
    input : str or :class:`h5py.File` or :class:`h5py.Group` or :class:`h5py.Dataset`
        If a string, the filename to read the table from. If an h5py object,
        either the file or the group object to read the table from.
    path : str
        The path from which to read the table inside the HDF5 file.
        This should be relative to the input file or group.
    character_as_bytes : bool
        If `True` then Table columns are left as bytes.
        If `False` then Table columns are converted to unicode.
    """
    try:
        import h5py
    except ImportError:
        raise Exception("h5py is required to read and write HDF5 files")

    # This function is iterative, and only gets to writing the file when
    # the input is an hdf5 Group. Moreover, the input variable is changed in
    # place.
    # Here, we save its value to be used at the end when the conditions are
    # right.
    input_save = input
    if isinstance(input, (h5py.File, h5py.Group)):

        # If a path was specified, follow the path
        if path is not None:
            try:
                input = input[path]
            except (KeyError, ValueError):
                raise OSError(f"Path {path} does not exist")

        # `input` is now either a group or a dataset. If it is a group, we
        # will search for all structured arrays inside the group, and if there
        # is one we can proceed otherwise an error is raised. If it is a
        # dataset, we just proceed with the reading.

        if isinstance(input, h5py.Group):

            # Find all structured arrays in group
            arrays = _find_all_structured_arrays(input)

            if len(arrays) == 0:
                raise ValueError(f"no table found in HDF5 group {path}")
            elif len(arrays) > 0:
                path = arrays[0] if path is None else path + '/' + arrays[0]
                if len(arrays) > 1:
                    warnings.warn(
                        "path= was not specified but multiple tables"
                        " are present, reading in first available"
                        " table (path={})".format(path),
                        AstropyUserWarning)
                return read_table_hdf5(input, path=path)

    elif not isinstance(input, h5py.Dataset):

        # If a file object was passed, then we need to extract the filename
        # because h5py cannot properly read in file objects.

        if hasattr(input, 'read'):
            try:
                input = input.name
            except AttributeError:
                raise TypeError("h5py can only open regular files")

        # Open the file for reading, and recursively call read_table_hdf5 with
        # the file object and the path.

        f = h5py.File(input, 'r')

        try:
            return read_table_hdf5(f, path=path,
                                   character_as_bytes=character_as_bytes)
        finally:
            f.close()

    # If we are here, `input` should be a Dataset object, which we can now
    # convert to a Table.

    # Create a Table object
    from astropy.table import Table, meta, serialize

    table = Table(np.array(input))

    # Read the meta-data from the file. For back-compatibility, we can read
    # the old file format where the serialized metadata were saved in the
    # attributes of the HDF5 dataset.
    # In the new format, instead, metadata are stored in a new dataset in the
    # same file. This is introduced in Astropy 3.0
    old_version_meta = META_KEY in input.attrs
    new_version_meta = path is not None and meta_path(path) in input_save
    if old_version_meta or new_version_meta:
        if new_version_meta:
            header = meta.get_header_from_yaml(
                h.decode('utf-8') for h in input_save[meta_path(path)])
        else:
            # Must be old_version_meta is True:
            # if (A or B) and not A then B is True.
            # (This text was stray executable code in the original; it is a
            # comment explaining why no explicit old_version_meta check is
            # needed on this branch.)
            header = meta.get_header_from_yaml(
                h.decode('utf-8') for h in input.attrs[META_KEY])
        if 'meta' in list(header.keys()):
            table.meta = header['meta']

        header_cols = dict((x['name'], x) for x in header['datatype'])
        for col in table.columns.values():
            for attr in ('description', 'format', 'unit', 'meta'):
                if attr in header_cols[col.name]:
                    setattr(col, attr, header_cols[col.name][attr])

        # Construct new table with mixins, using tbl.meta['__serialized_columns__']
        # as guidance.
        table = serialize._construct_mixins_from_columns(table)

    else:
        # Read the meta-data from the file
        table.meta.update(input.attrs)

    if not character_as_bytes:
        table.convert_bytestring_to_unicode()

    return table
def read_table_hdf5(input, path=None, character_as_bytes=True):
    """
    Read a Table object from an HDF5 file

    This requires `h5py <http://www.h5py.org/>`_ to be installed. If more
    than one table is present in the HDF5 file or group, the first table
    is read in and a warning is displayed.

    Parameters
    ----------
    input : str or :class:`h5py:File` or :class:`h5py:Group` or :class:`h5py:Dataset`
        If a string, the filename to read the table from. If an h5py object,
        either the file or the group object to read the table from.
    path : str
        The path from which to read the table inside the HDF5 file.
        This should be relative to the input file or group.
    character_as_bytes: boolean
        If `True` then Table columns are left as bytes.
        If `False` then Table columns are converted to unicode.
    """
    try:
        import h5py
    except ImportError:
        raise Exception("h5py is required to read and write HDF5 files")

    # This function recurses: a Group/filename input re-enters with a more
    # specific target, and `input` is rebound along the way.  Keep the
    # original handle so the new-style metadata dataset (which lives next
    # to the table, not inside it) can still be reached at the end.
    input_save = input

    if isinstance(input, (h5py.File, h5py.Group)):

        # Follow an explicit path down into the file/group, if given.
        if path is not None:
            try:
                input = input[path]
            except (KeyError, ValueError):
                raise OSError("Path {0} does not exist".format(path))

        # `input` is now a group or a dataset.  For a group, locate every
        # structured array inside it and read the first one (warning when
        # several are present); a dataset falls through to the read below.
        if isinstance(input, h5py.Group):

            arrays = _find_all_structured_arrays(input)

            if len(arrays) == 0:
                raise ValueError("no table found in HDF5 group {0}".
                                 format(path))
            elif len(arrays) > 0:
                path = arrays[0] if path is None else path + '/' + arrays[0]
                if len(arrays) > 1:
                    warnings.warn("path= was not specified but multiple tables"
                                  " are present, reading in first available"
                                  " table (path={0})".format(path),
                                  AstropyUserWarning)
                return read_table_hdf5(input, path=path)

    elif not isinstance(input, h5py.Dataset):

        # h5py cannot consume arbitrary file objects: recover the filename
        # from a file-like input, then recurse with an opened h5py.File.
        if hasattr(input, 'read'):
            try:
                input = input.name
            except AttributeError:
                raise TypeError("h5py can only open regular files")

        f = h5py.File(input, 'r')
        try:
            return read_table_hdf5(f, path=path,
                                   character_as_bytes=character_as_bytes)
        finally:
            f.close()

    # At this point `input` is a Dataset: convert it to a Table.
    from astropy.table import Table, meta, serialize

    table = Table(np.array(input))

    # Serialized metadata may be stored in either of two layouts: the old
    # format kept it in the dataset's attributes; since Astropy 3.0 it
    # lives in a separate dataset alongside the table.
    old_version_meta = META_KEY in input.attrs
    new_version_meta = path is not None and meta_path(path) in input_save

    if old_version_meta or new_version_meta:
        if new_version_meta:
            yaml_source = input_save[meta_path(path)]
        elif old_version_meta:
            yaml_source = input.attrs[META_KEY]
        header = meta.get_header_from_yaml(
            h.decode('utf-8') for h in yaml_source)

        if 'meta' in list(header.keys()):
            table.meta = header['meta']

        # Restore per-column attributes from the serialized header.
        header_cols = {x['name']: x for x in header['datatype']}
        for col in table.columns.values():
            for attr in ('description', 'format', 'unit', 'meta'):
                if attr in header_cols[col.name]:
                    setattr(col, attr, header_cols[col.name][attr])

        # Rebuild mixin columns, guided by meta['__serialized_columns__'].
        table = serialize._construct_mixins_from_columns(table)
    else:
        # No serialized metadata: take the dataset attributes verbatim.
        table.meta.update(input.attrs)

    if not character_as_bytes:
        table.convert_bytestring_to_unicode()

    return table
def read_table_parquet(input, include_names=None, exclude_names=None,
                       schema_only=False, filters=None):
    """
    Read a Table object from a Parquet file.

    This requires `pyarrow <https://arrow.apache.org/docs/python/>`_
    to be installed.

    The ``filters`` parameter consists of predicates that are expressed
    in disjunctive normal form (DNF), like ``[[('x', '=', 0), ...], ...]``.
    DNF allows arbitrary boolean logical combinations of single column
    predicates. The innermost tuples each describe a single column predicate.
    The list of inner predicates is interpreted as a conjunction (AND),
    forming a more selective and multiple column predicate. Finally, the most
    outer list combines these filters as a disjunction (OR).

    Predicates may also be passed as List[Tuple]. This form is interpreted
    as a single conjunction. To express OR in predicates, one must
    use the (preferred) List[List[Tuple]] notation.

    Each tuple has format: (``key``, ``op``, ``value``) and compares the
    ``key`` with the ``value``.
    The supported ``op`` are:  ``=`` or ``==``, ``!=``, ``<``, ``>``, ``<=``,
    ``>=``, ``in`` and ``not in``. If the ``op`` is ``in`` or ``not in``, the
    ``value`` must be a collection such as a ``list``, a ``set`` or a
    ``tuple``.

    Examples:

    .. code-block:: python

        ('x', '=', 0)
        ('y', 'in', ['a', 'b', 'c'])
        ('z', 'not in', {'a','b'})

    Parameters
    ----------
    input : str or path-like or file-like object
        If a string or path-like object, the filename to read the table from.
        If a file-like object, the stream to read data.
    include_names : list [str], optional
        List of names to include in output. If not supplied, then
        include all columns.
    exclude_names : list [str], optional
        List of names to exclude from output (applied after ``include_names``).
        If not supplied then no columns are excluded.
    schema_only : bool, optional
        Only read the schema/metadata with table information.
    filters : list [tuple] or list [list [tuple] ] or None, optional
        Rows which do not match the filter predicate will be removed from
        scanned data.  See `pyarrow.parquet.read_table()` for details.

    Returns
    -------
    table : `~astropy.table.Table`
        Table will have zero rows and only metadata information
        if schema_only is True.
    """
    pa, parquet, _ = get_pyarrow()

    if not isinstance(input, (str, os.PathLike)):
        # The 'read' attribute is the key component of a generic
        # file-like object.
        if not hasattr(input, 'read'):
            raise TypeError(
                "pyarrow can only open path-like or file-like objects.")

    schema = parquet.read_schema(input)

    # Pyarrow stores all metadata as byte-strings, so we convert
    # to UTF-8 strings here.
    if schema.metadata is not None:
        md = {k.decode('UTF-8'): v.decode('UTF-8')
              for k, v in schema.metadata.items()}
    else:
        md = {}

    from astropy.table import Table, meta, serialize

    # parse metadata from table yaml
    meta_dict = {}
    if 'table_meta_yaml' in md:
        meta_yaml = md.pop('table_meta_yaml').split('\n')
        meta_hdr = meta.get_header_from_yaml(meta_yaml)
        if 'meta' in meta_hdr:
            meta_dict = meta_hdr['meta']
    else:
        meta_hdr = None

    # parse and set serialized columns
    full_table_columns = {name: name for name in schema.names}
    has_serialized_columns = False
    if '__serialized_columns__' in meta_dict:
        has_serialized_columns = True
        serialized_columns = meta_dict['__serialized_columns__']
        for scol in serialized_columns:
            for name in _get_names(serialized_columns[scol]):
                full_table_columns[name] = scol

    use_names = set(full_table_columns.values())
    # Apply include_names before exclude_names
    if include_names is not None:
        use_names.intersection_update(include_names)
    if exclude_names is not None:
        use_names.difference_update(exclude_names)
    # Preserve column ordering via list, and use this dict trick
    # to remove duplicates and preserve ordering (for mixin columns)
    use_names = list(dict.fromkeys([x for x in full_table_columns.values()
                                    if x in use_names]))

    # names_to_read is a list of actual serialized column names, where
    # e.g. the requested name 'time' becomes ['time.jd1', 'time.jd2']
    names_to_read = []
    for name in use_names:
        names = [n for n, col in full_table_columns.items() if name == col]
        names_to_read.extend(names)

    if not names_to_read:
        raise ValueError("No include_names specified were found in the table.")

    # We need to pop any unread serialized columns out of the meta_dict.
    if has_serialized_columns:
        for scol in list(meta_dict['__serialized_columns__'].keys()):
            if scol not in use_names:
                meta_dict['__serialized_columns__'].pop(scol)

    # whether to return the whole table or a formatted empty table.
    if not schema_only:
        # Read the pyarrow table, specifying columns and filters.
        pa_table = parquet.read_table(input, columns=names_to_read,
                                      filters=filters)
        num_rows = pa_table.num_rows
    else:
        num_rows = 0

    # Now need to convert parquet table to Astropy
    dtype = []
    for name in names_to_read:
        # Pyarrow string and byte columns do not have native length information
        # so we must determine those here.
        if schema.field(name).type not in (pa.string(), pa.binary()):
            # Convert the pyarrow type into a numpy dtype (which is returned
            # by the to_pandas_type() method).
            dtype.append(schema.field(name).type.to_pandas_dtype())
            continue

        # Special-case for string and binary columns
        md_name = f'table::len::{name}'
        if md_name in md:
            # String/bytes length from header.
            strlen = int(md[md_name])
        elif schema_only:
            # Choose an arbitrary string length since we are not reading
            # in the table.
            strlen = 10
            # BUG FIX: doubled braces '{{strlen}}' emitted the literal text
            # '{strlen}'; single braces interpolate the guessed length.
            warnings.warn(f"No {md_name} found in metadata. "
                          f"Guessing {strlen} for schema.",
                          AstropyUserWarning)
        else:
            # Find the maximum string length from the actual data.
            strlen = max([len(row.as_py()) for row in pa_table[name]])
            # BUG FIX: same doubled-brace problem as above.
            warnings.warn(f"No {md_name} found in metadata. "
                          f"Using longest string ({strlen} characters).",
                          AstropyUserWarning)
        dtype.append(f'U{strlen}' if schema.field(name).type == pa.string()
                     else f'|S{strlen}')

    # Create the empty numpy record array to store the pyarrow data.
    data = np.zeros(num_rows, dtype=list(zip(names_to_read, dtype)))

    if not schema_only:
        # Convert each column in the pyarrow table to a numpy array
        for name in names_to_read:
            data[name][:] = pa_table[name].to_numpy()

    table = Table(data=data, meta=meta_dict)

    if meta_hdr is not None:
        # Set description, format, unit, meta from the column
        # metadata that was serialized with the table.
        header_cols = dict((x['name'], x) for x in meta_hdr['datatype'])
        for col in table.columns.values():
            for attr in ('description', 'format', 'unit', 'meta'):
                if attr in header_cols[col.name]:
                    setattr(col, attr, header_cols[col.name][attr])

    # Convert all compound columns to astropy objects
    # (e.g. time.jd1, time.jd2 into a single time column)
    table = serialize._construct_mixins_from_columns(table)

    return table