def contents(self):
    """A table summarizing the HDUs.
    """
    if self._contents is None:
        self._contents = list()
        self._contents.append(self.contents_header)
        for k in range(self.nhdr):
            if 'EXTNAME' in self.headers[k]:
                extname = self.headers[k]['EXTNAME'].strip()
            else:
                extname = ''
                #
                # Don't complain about missing EXTNAME on primary, empty HDUs.
                # See https://github.com/desihub/desidatamodel/issues/69
                #
                if k > 0:
                    log.warning("HDU%d has no EXTNAME set!", k)
            if k > 0:
                if 'ZTENSION' in self.headers[k]:
                    exttype = self.headers[k]['ZTENSION'].strip()
                else:
                    exttype = self.headers[k]['XTENSION'].strip()
            else:
                exttype = 'IMAGE'
            self._contents.append((self.hduname.format(k) + '_',
                                   extname, exttype, '*Brief Description*'))
    return self._contents
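
#
# Illustrative usage (a sketch, not part of the original module): print the
# HDU summary table for a FITS file.  `Stub` is the class that owns
# contents(); as in validate_prototype() below, it is constructed from a
# filename.  The filename here is hypothetical.
#
#     stub = Stub('spectra-example.fits')
#     for row in stub.contents:
#         print(row)
#
# After the header row, each entry is a tuple like
# ('HDU1_', 'FIBERMAP', 'BINTABLE', '*Brief Description*').
#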
def update_truth(filepath, hdu=2, chunksize=50000, skip=('SLOPES', 'EMLINES')):
    """Add data from columns in other HDUs of the Truth table.

    Parameters
    ----------
    filepath : :class:`str`
        Full path to the data file.
    hdu : :class:`int` or :class:`str`, optional
        Read a data table from this HDU (default 2).
    chunksize : :class:`int`, optional
        If set, update database `chunksize` rows at a time (default 50000).
    skip : :class:`tuple`, optional
        Do not load columns with these names
        (default ``('SLOPES', 'EMLINES')``).
    """
    tcls = Truth
    tn = tcls.__tablename__
    t = tcls.__table__
    if filepath.endswith('.fits'):
        with fits.open(filepath) as hdulist:
            data = hdulist[hdu].data
    elif filepath.endswith('.ecsv'):
        data = Table.read(filepath, format='ascii.ecsv')
    else:
        log.error("Unrecognized data file, %s!", filepath)
        return
    log.info("Read data from %s HDU %s", filepath, hdu)
    try:
        colnames = data.names
    except AttributeError:
        colnames = data.colnames
    for col in colnames:
        if data[col].dtype.kind == 'f':
            bad = np.isnan(data[col])
            if np.any(bad):
                nbad = bad.sum()
                log.warning("%d rows of bad data detected in column " +
                            "%s of %s.", nbad, col, filepath)
    log.info("Integrity check complete on %s.", tn)
    # if rowfilter is None:
    #     good_rows = np.ones((maxrows,), dtype=np.bool)
    # else:
    #     good_rows = rowfilter(data[0:maxrows])
    # data_list = [data[col][0:maxrows][good_rows].tolist()
    #              for col in colnames]
    data_list = [data[col].tolist() for col in colnames if col not in skip]
    data_names = [col.lower() for col in colnames if col not in skip]
    data_names[0] = 'b_targetid'
    finalrows = len(data_list[0])
    log.info("Initial column conversion complete on %s.", tn)
    del data
    data_rows = list(zip(*data_list))
    del data_list
    log.info("Converted columns into rows on %s.", tn)
    for k in range(finalrows//chunksize + 1):
        data_chunk = [dict(zip(data_names, row))
                      for row in data_rows[k*chunksize:(k+1)*chunksize]]
        q = t.update().where(t.c.targetid == bindparam('b_targetid'))
        if len(data_chunk) > 0:
            engine.execute(q, data_chunk)
            log.info("Updated %d rows in %s.",
                     min((k+1)*chunksize, finalrows), tn)
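
#
# Example call (illustrative): update the truth table from the TRUTH HDU of
# a mock file, in chunks of 10000 rows.  The path is hypothetical, and the
# module-level database objects used above (`engine`, `Truth`, `log`) must
# already be configured by the caller.
#
#     update_truth('/data/mocks/truth.fits', hdu='TRUTH', chunksize=10000)
#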
def collect_files(root, files):
    """Scan a directory tree for files that correspond to data model files.

    Parameters
    ----------
    root : :class:`str`
        Path to real files on disk.
    files : :class:`list`
        A list of data model files.

    Notes
    -----
    Files are analyzed using this algorithm:

    * The first file that matches a regexp becomes the 'prototype' for that
      data model file.
    * If no files match a data model file, then files of that type are
      'missing'.
    * If a file does not match any regular expression, it is 'extraneous'.
    * If a file matches a regular expression that already has a prototype,
      it is 'ignored'.
    """
    ignore_directories = ('logs', 'scripts')
    include_extensions = ('.fits', '.fits.fz')
    for dirpath, dirnames, filenames in os.walk(root):
        for d in ignore_directories:
            try:
                dirnames.remove(d)
            except ValueError:
                pass
        include_filenames = list()
        for e in include_extensions:
            include_filenames += [f for f in filenames if f.endswith(e)]
        for f in include_filenames:
            extraneous_file = True
            fullname = os.path.join(dirpath, f)
            for r in files:
                if r.regexp is not None:
                    m = r.regexp.match(fullname)
                    if m is not None:
                        extraneous_file = False
                        if r.prototype is None:
                            r.prototype = fullname
            if extraneous_file:
                log.warning("Extraneous file detected: %s", fullname)
    #
    # Scan for missing files, but don't penalize (here) data models that
    # don't have a valid regular expression.  Files with bad regexps will
    # be flagged elsewhere.
    #
    for r in files:
        if r.regexp is not None and r.prototype is None:
            log.warning("No files found matching %s!", r.filename)
    return
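
#
# Example sequence (illustrative): collect_files() expects objects that
# carry `regexp`, `prototype`, and `filename` attributes, as produced by
# the data model scanner.  The constructor arguments and paths below are
# hypothetical.
#
#     files = [DataModel(f, section) for f in model_files]
#     for f in files:
#         f.get_regexp(root)
#     collect_files(root, files)
#     missing = [f.filename for f in files
#                if f.regexp is not None and f.prototype is None]
#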
def extract_keywords(hdr):
    """Extract interesting keywords from a FITS header.

    Parameters
    ----------
    hdr : :class:`~astropy.io.fits.Header`
        The header to parse.

    Returns
    -------
    :class:`list`
        A list of tuples containing the metadata of interesting keywords.
    """
    keywords = list()
    for key in hdr:
        if extrakey(key):
            # Escape &, <, > in strings, but don't choke on int/float.
            value = hdr[key]
            if isinstance(value, bool):
                ktype = 'bool'
                value = ('F', 'T')[int(value)]
            if isinstance(value, str):
                value = escape(value)
                if value == 'T' or value == 'F':
                    ktype = 'bool'
                else:
                    ktype = 'str'
            if isinstance(value, int):
                value = str(value)
                ktype = 'int'
            if isinstance(value, float):
                value = str(value)
                ktype = 'float'
            if key.endswith('_'):
                key = key[0:len(key)-1] + '\\_'
            try:
                if value.endswith('_'):
                    value = value[0:len(value)-1] + '\\_'
            except AttributeError:
                ktype = 'Unknown'
                log.warning("Raised AttributeError on %s = %s.", key, value)
            keywords.append((key, value, ktype, escape(hdr.comments[key])))
    return keywords
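
#
# Minimal runnable example (not part of the original module): build a small
# header and run extract_keywords() on it.  It assumes the module-level
# helpers used above (extrakey, escape, log) are available alongside this
# function.
#
def _example_extract_keywords():
    """Run extract_keywords() on a synthetic header (illustrative only)."""
    from astropy.io import fits
    hdr = fits.Header()
    hdr['BUNIT'] = ('electron', 'physical unit of the image')
    hdr['GAIN'] = (1.5, 'CCD gain')
    hdr['SIMPLE'] = (True, 'conforms to FITS standard')
    # SIMPLE is a structural keyword, so extrakey() is expected to filter
    # it out; the others come back as (key, value, type, comment) tuples.
    return extract_keywords(hdr)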
def hdumeta(self):
    """Metadata associated with each HDU.
    """
    if self._hdumeta is None:
        self._hdumeta = list()
        for k in range(self.nhdr):
            meta = dict()
            meta['title'] = self.hduname.format(k)
            meta['extname'] = self.contents[k+1][1]
            meta['keywords'] = extract_keywords(self.headers[k])
            if 'XTENSION' in self.headers[k]:
                meta['extension'] = self.headers[k]['XTENSION'].strip()
                if meta['extension'] == 'IMAGE':
                    meta['format'] = self.image_format(self.headers[k])
                elif meta['extension'] == 'BINTABLE':
                    try:
                        meta['format'] = self.columns(k, self.error)
                    except DataModelError:
                        meta['format'] = self.image_format(self.headers[k])
                        try:
                            meta['extension'] = self.headers[k]['ZTENSION'].strip()
                        except KeyError:
                            try:
                                i = self.headers[k]['ZIMAGE']
                                if i:
                                    meta['extension'] = 'IMAGE'
                            except KeyError:
                                log.warning("Possible malformed compressed data in HDU %d of %s.",
                                            k, self.filename)
                else:
                    w = ("Unknown extension type: " +
                         "{extension}.").format(**meta)
                    meta['format'] = w
                    log.warning(w)
            else:
                meta['extension'] = 'IMAGE'
                meta['format'] = self.image_format(self.headers[k])
            self._hdumeta.append(meta)
    return self._hdumeta
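
#
# Illustrative usage (sketch): inspect the per-HDU metadata collected by
# hdumeta.  As above, `Stub` is assumed to be constructed from a (here
# hypothetical) filename.
#
#     stub = Stub('spectra-example.fits')
#     for meta in stub.hdumeta:
#         print(meta['title'], meta['extension'], meta['extname'])
#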
def check_unit(self, unit, error=False):
    """Check units for consistency with the FITS standard, while allowing
    some special exceptions.

    Parameters
    ----------
    unit : :class:`str`
        The unit to parse.
    error : :class:`bool`, optional
        If ``True``, failure to interpret the unit raises an exception.

    Returns
    -------
    :class:`str`
        If a special exception is detected, the name of the unit is
        returned.  Otherwise, ``None``.

    Raises
    ------
    :exc:`ValueError`
        If `error` is set and the unit can't be parsed.
    """
    try:
        au = Unit(unit, format='fits')
    except ValueError as e:
        bad_unit = str(e).split()[0]
        if any([u in bad_unit for u in self._acceptable_units]):
            return bad_unit
        else:
            if error:
                log.critical(str(e))
                raise
            else:
                log.warning(str(e))
    return None
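
#
# Expected behavior, as a sketch ('maggie' stands in for whatever names
# appear in self._acceptable_units; `model` is an instance of the owning
# class):
#
#     model.check_unit('erg')                # parses as FITS -> returns None
#     model.check_unit('maggie')             # acceptable exception -> returns the offending name
#     model.check_unit('bogus', error=True)  # unparseable -> raises ValueError
#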
def get_regexp(self, root, error=False):
    """Obtain the regular expression used to match files on disk.

    Parameters
    ----------
    root : :class:`str`
        Path to real files on disk.
    error : :class:`bool`, optional
        If ``True``, failure to find a regular expression raises an
        exception instead of just a warning.

    Returns
    -------
    regular expression
        The regular expression found, or ``None`` if not found.
        The regular expression is also stored internally.

    Raises
    ------
    :exc:`~desidatamodel.DataModelError`
        If `error` is set and problems with the data model file are
        detected.
    """
    with open(self.filename) as dm:
        for line in dm.readlines():
            if line.startswith('See :doc:'):
                self.ref = self._cross_reference(line)
                log.debug("Cross reference detected %s -> %s.",
                          self.filename, self.ref)
                break
            if self._regexpline.match(line) is not None:
                d = os.path.dirname(self.filename).replace(self.section, root)
                for k in self._d2r:
                    d = d.replace(k, self._d2r[k])
                r = line.strip().split()[1].replace('``', '')
                self.regexp = re.compile(os.path.join(d, r))
                break
    if self.regexp is None and self.ref is not None:
        with open(self.ref) as dm:
            for line in dm.readlines():
                #
                # Hopefully cross-references are not nested.
                #
                # if line.startswith('See :doc:'):
                #     self.ref = self._cross_reference(line)
                #     break
                if self._regexpline.match(line) is not None:
                    d = os.path.dirname(self.filename).replace(self.section,
                                                               root)
                    for k in self._d2r:
                        d = d.replace(k, self._d2r[k])
                    r = line.strip().split()[1].replace('``', '')
                    self.regexp = re.compile(os.path.join(d, r))
                    break
    if self.regexp is None:
        m = "%s has no file regexp!"
        if error:
            log.critical(m, self.filename)
            raise DataModelError(m % self.filename)
        else:
            log.warning(m, self.filename)
    return self.regexp
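
#
# Example call (illustrative): resolve the regexp for one data model file,
# raising on failure.  The root path is hypothetical; `model` is an
# instance of the owning class.
#
#     regexp = model.get_regexp('/desi/spectro/redux', error=True)
#     if regexp is not None:
#         print(regexp.pattern)
#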
def validate_prototype(self, error=False):
    """Compare a model's prototype data file to the data model.

    Parameters
    ----------
    error : :class:`bool`, optional
        If ``True``, failure to extract certain required metadata raises an
        exception.

    Notes
    -----
    * Use set theory to compare the data headers to model headers.  This
      should automatically find missing headers, extraneous headers, etc.
    """
    if self.prototype is None:
        #
        # A warning should have been issued already, so just skip silently.
        #
        return
    log.info("Comparing %s to %s.", self.prototype, self.filename)
    if self._stub is None:
        self._stub = Stub(self.prototype, error=error)
    stub_meta = self._stub_meta = self._stub.hdumeta
    modelmeta = self.extract_metadata(error=error)
    #
    # Check number of headers.
    #
    if self._stub.nhdr != len(modelmeta):
        log.warning("Prototype file %s has the wrong number of " +
                    "sections (HDUs) according to %s.",
                    self.prototype, self.filename)
        return
    for i in range(self._stub.nhdr):
        dkw = stub_meta[i]['keywords']
        mkw = modelmeta[i]['keywords']
        #
        # Check number of keywords.
        #
        if len(dkw) != len(mkw):
            log.warning("Prototype file %s has the wrong number of " +
                        "HDU%d keywords according to %s.",
                        self.prototype, i, self.filename)
            continue
        #
        # If the number of keywords is correct, check them individually.
        #
        for j in range(len(dkw)):
            if dkw[j][0] != mkw[j][0]:
                log.warning("Prototype file %s has a keyword " +
                            "mismatch (%s != %s) in HDU%d according to " +
                            "%s.", self.prototype, dkw[j][0], mkw[j][0],
                            i, self.filename)
        #
        # Check the extension type.
        #
        dex = stub_meta[i]['extension']
        try:
            mex = modelmeta[i]['extension']
        except KeyError:
            mex = "Extension type not found"
        if dex != mex:
            log.warning("Prototype file %s has an extension type " +
                        "mismatch in HDU%d (%s != %s) " +
                        "according to %s.",
                        self.prototype, i, dex, mex, self.filename)
            continue
        #
        # Check for EXTNAME.
        #
        dexex = stub_meta[i]['extname']
        mexex = modelmeta[i]['extname']
        if dexex == '' and i > 0:
            log.warning("Prototype file %s has no EXTNAME in HDU%d.",
                        self.prototype, i)
        if (dexex != '' and mexex != '' and dexex != mexex):
            log.warning("Prototype file %s has an EXTNAME mismatch " +
                        "in HDU%d (%s != %s) " +
                        "according to %s.",
                        self.prototype, i, dexex, mexex, self.filename)
        #
        # If the extension type is correct, check the contents of the
        # extension.
        #
        dexf = stub_meta[i]['format']
        try:
            mexf = modelmeta[i]['format']
        except KeyError:
            mexf = "Extension format not found"
        if dex == 'IMAGE':
            try:
                icomma = dexf.index(',')
            except ValueError:
                icomma = len(dexf)
            if dexf[:icomma] != mexf[:icomma]:
                log.warning("Prototype file %s has an extension " +
                            "format mismatch in HDU%d " +
                            "according to %s.",
                            self.prototype, i, self.filename)
        else:
            dexf = dexf[1:]  # Get rid of header line.
            if len(dexf) != len(mexf):
                log.warning("Prototype file %s has the wrong " +
                            "number of HDU%d columns according to %s.",
                            self.prototype, i, self.filename)
            else:
                for j in range(len(dexf)):
                    if dexf[j][0] != mexf[j][0]:
                        log.warning("Prototype file %s has a " +
                                    "column name mismatch (%s != %s) " +
                                    "in HDU%d according to %s.",
                                    self.prototype, dexf[j][0], mexf[j][0],
                                    i, self.filename)
    return
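
#
# Example sequence (illustrative): get_regexp(), collect_files(), and
# validate_prototype() are designed to be chained -- find each file's
# regexp, find a prototype on disk, then compare the prototype to the
# document.  `files` and `root` are as in the collect_files() example above.
#
#     for model in files:
#         model.get_regexp(root)
#     collect_files(root, files)
#     for model in files:
#         model.validate_prototype()
#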
def extract_metadata(self, error=False):
    """Extract metadata from a data model file.

    Parameters
    ----------
    error : :class:`bool`, optional
        If ``True``, failure to extract certain required metadata raises an
        exception.

    Returns
    -------
    :class:`list`
        Metadata in a form similar to :class:`~desidatamodel.stub.Stub`
        metadata.

    Raises
    ------
    :exc:`~desidatamodel.DataModelError`
        If `error` is set and the HDU has no `EXTNAME` keyword.
    """
    metafile = self.filename
    if self.ref is not None:
        metafile = self.ref
    if self._metafile_data is None:
        with open(metafile) as f:
            self._metafile_data = f.read()
    lines = self._metafile_data.split('\n')
    hdu_sections = [i for i, l in enumerate(lines)
                    if (self._hduline.match(l) is not None or
                        self._hduspan.match(l) is not None)]
    self.hdumeta = list()
    for k in range(len(hdu_sections)):
        try:
            section = lines[hdu_sections[k]:hdu_sections[k+1]]
        except IndexError:
            section = lines[hdu_sections[k]:]
        m = self._hduspan.match(section[0])
        if m is not None:
            #
            # Detected HDU span.
            #
            g = m.groups()
            spanstart = int(g[0])
            spanend = int(g[1])
            log.debug('Detected range specification from HDU %d to HDU %d',
                      spanstart, spanend)
            spanref = [l for l in section if l.startswith('Data:')][0]
            spanext = spanref[spanref.lower().index('see') + 4:].replace('.', '')
            spanmeta = [m for m in self.hdumeta
                        if m['extname'] == spanext][0]
            spanname = [l.split('=')[1].strip() for l in section
                        if l.startswith('EXTNAME = ')][0]
            extnames = [p.strip() for p in spanname.split(',')]
            if len(range(spanstart, spanend+1)) == len(extnames):
                for i, l in enumerate(range(spanstart, spanend+1)):
                    meta = dict()
                    meta['title'] = 'HDU{0:d}'.format(l)
                    meta['extname'] = extnames[i]
                    meta['extension'] = spanmeta['extension']
                    meta['format'] = spanmeta['format']
                    meta['keywords'] = spanmeta['keywords']
                    self.hdumeta.append(meta)
            else:
                log.warning('Range specification from HDU %d to HDU %d ' +
                            'does not have a matching EXTNAME specification',
                            spanstart, spanend)
            continue
        meta = dict()
        meta['title'] = section[0]
        if 'Empty HDU.' in section:
            meta['extension'] = 'IMAGE'
            meta['format'] = 'Empty HDU.'
        image_data = [l for l in section if l.startswith('Data:')]
        if image_data:
            meta['extension'] = 'IMAGE'
            meta['format'] = image_data[0]
        try:
            rdtc = section.index('Required Data Table Columns')
        except ValueError:
            rdtc = None
        if rdtc is not None:
            meta['extension'] = 'BINTABLE'
            table = [i for i, l in enumerate(section[rdtc:])
                     if self._tableboundary.match(l) is not None][1:3]
            columns = list(map(len, section[rdtc:][table[0]].split()))
            table_lines = section[rdtc:][table[0]+1:table[1]]
            meta['format'] = [self._extract_columns(t, columns)
                              for t in table_lines]
            for mk in meta['format']:
                if not mk[1]:
                    m = "Missing type for column %s in HDU %d of %s!"
                    if error:
                        log.critical(m, mk[0], k, metafile)
                        raise DataModelError(m % (mk[0], k, metafile))
                    else:
                        log.warning(m, mk[0], k, metafile)
                if mk[2]:
                    bad_unit = self.check_unit(mk[2], error=error)
                    if bad_unit:
                        log.debug("Non-standard (but acceptable) unit %s detected for column %s in HDU %d of %s.",
                                  bad_unit, mk[0], k, metafile)
        try:
            rhk = section.index('Required Header Keywords')
        except ValueError:
            meta['keywords'] = []
        else:
            table = [i for i, l in enumerate(section[rhk:])
                     if self._tableboundary.match(l) is not None][1:3]
            columns = list(map(len, section[rhk:][table[0]].split()))
            table_lines = section[rhk:][table[0]+1:table[1]]
            meta['keywords'] = [self._extract_columns(t, columns)
                                for t in table_lines]
            for mk in meta['keywords']:
                if not mk[2]:
                    m = "Missing type for keyword %s in HDU %d of %s!"
                    if error:
                        log.critical(m, mk[0], k, metafile)
                        raise DataModelError(m % (mk[0], k, metafile))
                    else:
                        log.warning(m, mk[0], k, metafile)
                if mk[0] == 'BUNIT':
                    bad_unit = self.check_unit(mk[1], error=error)
                    if bad_unit:
                        log.debug("Non-standard (but acceptable) unit %s detected for keyword %s in HDU %d of %s.",
                                  bad_unit, mk[0], k, metafile)
        #
        # Need to know the format by this point!
        #
        try:
            foo = meta['format']
        except KeyError:
            m = "Unable to determine format for HDU %d in %s!"
            log.critical(m, k, metafile)
            raise DataModelError(m % (k, metafile))
        #
        # See https://github.com/desihub/desidatamodel/issues/69 for
        # the detailed policy on EXTNAME.
        #
        try:
            meta['extname'] = [l.split()[2] for l in section
                               if l.startswith('EXTNAME = ')][0]
        except IndexError:
            meta['extname'] = ''
            if k > 0 or (k == 0 and meta['format'] != 'Empty HDU.'):
                m = "HDU %d in %s has no EXTNAME!"
                if error:
                    log.critical(m, k, metafile)
                    raise DataModelError(m % (k, metafile))
                else:
                    log.warning(m, k, metafile)
            else:
                if k == 0 and meta['format'] == 'Empty HDU.':
                    if len(meta['keywords']) > 0:
                        m = "HDU %d in %s should have EXTNAME = 'PRIMARY'."
                        log.warning(m, k, metafile)
        else:
            #
            # If we reach here, meta['extname'] *is* defined.
            #
            if k == 0:
                if meta['format'] == 'Empty HDU.':
                    if (len(meta['keywords']) > 0 and
                            meta['extname'] != 'PRIMARY'):
                        m = "HDU %d in %s has acceptable alternative EXTNAME = '%s'."
                        log.debug(m, k, metafile, meta['extname'])
                else:
                    if meta['extname'] == 'PRIMARY':
                        m = ("HDU %d in %s should have a more meaningful " +
                             "EXTNAME than 'PRIMARY'.")
                        log.warning(m, k, metafile)
        self.hdumeta.append(meta)
    return self.hdumeta
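
#
# Illustrative usage (sketch): summarize the metadata that a data model
# document claims for each HDU.  `model` is an instance of the owning class.
#
#     for meta in model.extract_metadata():
#         print(meta['title'], meta['extension'], meta['extname'],
#               len(meta['keywords']))
#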
def load_fiberassign(datapath, maxpass=4, hdu='FIBERASSIGN', q3c=False,
                     latest_epoch=False, last_column='NUMOBS_MORE'):
    """Load fiber assignment files into the fiberassign table.

    Tile files can appear in multiple epochs, so for a given tileid, load
    the tile file with the largest value of epoch.  In the "real world",
    a tile file appears in each epoch until it is observed, therefore
    the tile file corresponding to the actual observation is the one
    with the largest epoch.

    Parameters
    ----------
    datapath : :class:`str`
        Full path to the directory containing tile files.
    maxpass : :class:`int`, optional
        Search for pass numbers up to this value (default 4).
    hdu : :class:`int` or :class:`str`, optional
        Read a data table from this HDU (default 'FIBERASSIGN').
    q3c : :class:`bool`, optional
        If set, create q3c index on the table.
    latest_epoch : :class:`bool`, optional
        If set, search for the latest tile file among several epochs.
    last_column : :class:`str`, optional
        Do not load columns past this name (default 'NUMOBS_MORE').
    """
    fiberpath = os.path.join(datapath, 'fiberassign*.fits')
    log.info("Using tile file search path: %s.", fiberpath)
    tile_files = glob.glob(fiberpath)
    if len(tile_files) == 0:
        log.error("No tile files found!")
        return
    log.info("Found %d tile files.", len(tile_files))
    #
    # Find the latest epoch for every tile file.
    #
    latest_tiles = dict()
    if latest_epoch:
        tileidre = re.compile(r'/(\d+)/fiberassign/fiberassign\-(\d+)\.fits$')
        for f in tile_files:
            m = tileidre.search(f)
            if m is None:
                log.error("Could not match %s!", f)
                continue
            epoch, tileid = map(int, m.groups())
            if tileid in latest_tiles:
                if latest_tiles[tileid][0] < epoch:
                    latest_tiles[tileid] = (epoch, f)
            else:
                latest_tiles[tileid] = (epoch, f)
    else:
        for f in tile_files:
            #
            # fiberassign-TILEID.fits
            #
            tileid = int(re.match(r'fiberassign\-(\d+)\.fits',
                                  os.path.basename(f))[1])
            latest_tiles[tileid] = (0, f)
    log.info("Identified %d tile files for loading.", len(latest_tiles))
    #
    # Read the identified tile files.
    #
    data_index = None
    for tileid in latest_tiles:
        epoch, f = latest_tiles[tileid]
        with fits.open(f) as hdulist:
            data = hdulist[hdu].data
        log.info("Read data from %s HDU %s", f, hdu)
        for col in data.names[:data_index]:
            if data[col].dtype.kind == 'f':
                bad = np.isnan(data[col])
                if np.any(bad):
                    nbad = bad.sum()
                    log.warning("%d rows of bad data detected in column " +
                                "%s of %s.", nbad, col, f)
                    #
                    # This replacement may be deprecated in the future.
                    #
                    if col in ('TARGET_RA', 'TARGET_DEC',
                               'FIBERASSIGN_X', 'FIBERASSIGN_Y'):
                        data[col][bad] = -9999.0
                assert not np.any(np.isnan(data[col]))
                assert np.all(np.isfinite(data[col]))
        n_rows = len(data)
        if data_index is None:
            data_index = data.names.index(last_column) + 1
        data_list = ([[tileid]*n_rows] +
                     [data[col].tolist() for col in data.names[:data_index]])
        data_names = (['tileid'] +
                      [col.lower() for col in data.names[:data_index]])
        log.info("Initial column conversion complete on tileid = %d.", tileid)
        data_rows = list(zip(*data_list))
        log.info("Converted columns into rows on tileid = %d.", tileid)
        dbSession.bulk_insert_mappings(FiberAssign,
                                       [dict(zip(data_names, row))
                                        for row in data_rows])
        log.info("Inserted %d rows in %s for tileid = %d.",
                 n_rows, FiberAssign.__tablename__, tileid)
        dbSession.commit()
    if q3c:
        q3c_index('fiberassign', ra='target_ra')
    return
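
#
# Example call (illustrative): load every fiberassign-TILEID.fits file under
# a (hypothetical) directory, then create the q3c index.  The module-level
# `dbSession`, `FiberAssign`, and `q3c_index` objects must already be set up.
#
#     load_fiberassign('/data/fiberassign', q3c=True)
#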
def load_file(filepath, tcls, hdu=1, expand=None, convert=None, index=None,
              rowfilter=None, q3c=False, chunksize=50000, maxrows=0):
    """Load a data file into the database, assuming that column names map
    to database column names with no surprises.

    Parameters
    ----------
    filepath : :class:`str`
        Full path to the data file.
    tcls : :class:`sqlalchemy.ext.declarative.api.DeclarativeMeta`
        The table to load, represented by its class.
    hdu : :class:`int` or :class:`str`, optional
        Read a data table from this HDU (default 1).
    expand : :class:`dict`, optional
        If set, map FITS column names to one or more alternative column
        names.
    convert : :class:`dict`, optional
        If set, convert the data for a named (database) column using the
        supplied function.
    index : :class:`str`, optional
        If set, add a column that just counts the number of rows.
    rowfilter : callable, optional
        If set, apply this filter to the rows to be loaded.  The function
        should return :class:`bool`, with ``True`` meaning a good row.
    q3c : :class:`bool`, optional
        If set, create q3c index on the table.
    chunksize : :class:`int`, optional
        If set, load database `chunksize` rows at a time (default 50000).
    maxrows : :class:`int`, optional
        If set, stop loading after `maxrows` are loaded.  Alternatively,
        set `maxrows` to zero (0) to load all rows.
    """
    tn = tcls.__tablename__
    if filepath.endswith('.fits'):
        with fits.open(filepath) as hdulist:
            data = hdulist[hdu].data
    elif filepath.endswith('.ecsv'):
        data = Table.read(filepath, format='ascii.ecsv')
    else:
        log.error("Unrecognized data file, %s!", filepath)
        return
    if maxrows == 0:
        maxrows = len(data)
    log.info("Read data from %s HDU %s", filepath, hdu)
    try:
        colnames = data.names
    except AttributeError:
        colnames = data.colnames
    for col in colnames:
        if data[col].dtype.kind == 'f':
            bad = np.isnan(data[col][0:maxrows])
            if np.any(bad):
                nbad = bad.sum()
                log.warning("%d rows of bad data detected in column " +
                            "%s of %s.", nbad, col, filepath)
                #
                # Temporary workaround for bad flux values, see
                # https://github.com/desihub/desitarget/issues/397
                #
                if col in ('FLUX_R', 'FIBERFLUX_R', 'FIBERTOTFLUX_R'):
                    data[col][0:maxrows][bad] = -9999.0
    log.info("Integrity check complete on %s.", tn)
    if rowfilter is None:
        good_rows = np.ones((maxrows,), dtype=bool)
    else:
        good_rows = rowfilter(data[0:maxrows])
    data_list = [data[col][0:maxrows][good_rows].tolist()
                 for col in colnames]
    data_names = [col.lower() for col in colnames]
    finalrows = len(data_list[0])
    log.info("Initial column conversion complete on %s.", tn)
    if expand is not None:
        for col in expand:
            i = data_names.index(col.lower())
            if isinstance(expand[col], str):
                #
                # Just rename a column.
                #
                log.debug("Renaming column %s (at index %d) to %s.",
                          data_names[i], i, expand[col])
                data_names[i] = expand[col]
            else:
                #
                # Assume this is an expansion of an array-valued column
                # into individual columns.
                #
                del data_names[i]
                del data_list[i]
                for j, n in enumerate(expand[col]):
                    log.debug("Expanding column %d of %s (at index %d) to %s.",
                              j, col, i, n)
                    data_names.insert(i + j, n)
                    data_list.insert(i + j, data[col][:, j].tolist())
                log.debug(data_names)
        log.info("Column expansion complete on %s.", tn)
    del data
    if convert is not None:
        for col in convert:
            i = data_names.index(col)
            data_list[i] = [convert[col](x) for x in data_list[i]]
        log.info("Column conversion complete on %s.", tn)
    if index is not None:
        data_list.insert(0, list(range(1, finalrows+1)))
        data_names.insert(0, index)
        log.info("Added index column '%s'.", index)
    data_rows = list(zip(*data_list))
    del data_list
    log.info("Converted columns into rows on %s.", tn)
    for k in range(finalrows//chunksize + 1):
        data_chunk = [dict(zip(data_names, row))
                      for row in data_rows[k*chunksize:(k+1)*chunksize]]
        if len(data_chunk) > 0:
            engine.execute(tcls.__table__.insert(), data_chunk)
            log.info("Inserted %d rows in %s.",
                     min((k+1)*chunksize, finalrows), tn)
    # for k in range(finalrows//chunksize + 1):
    #     data_insert = [dict([(col, data_list[i].pop(0))
    #                          for i, col in enumerate(data_names)])
    #                    for j in range(chunksize)]
    #     session.bulk_insert_mappings(tcls, data_insert)
    #     log.info("Inserted %d rows in %s..",
    #              min((k+1)*chunksize, finalrows), tn)
    # session.commit()
    # dbSession.commit()
    if q3c:
        q3c_index(tn)
    return
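
#
# Example call (illustrative): load a (hypothetical) target catalog,
# renaming RA/DEC to database column names, adding a running `id` column,
# and keeping only rows with positive MJD.  `Target` stands in for a
# declarative table class defined elsewhere.
#
#     load_file('/data/targets.fits', Target, hdu='TARGETS',
#               expand={'RA': 'target_ra', 'DEC': 'target_dec'},
#               index='id',
#               rowfilter=lambda rows: rows['MJD'] > 0,
#               chunksize=20000)
#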
def load_fiberassign(datapath, maxpass=4, hdu='FIBERASSIGN', q3c=False,
                     latest_epoch=False, last_column='SUBPRIORITY'):
    """Load fiber assignment files into the fiberassign table.

    Tile files can appear in multiple epochs, so for a given tileid, load
    the tile file with the largest value of epoch.  In the "real world",
    a tile file appears in each epoch until it is observed, therefore
    the tile file corresponding to the actual observation is the one
    with the largest epoch.

    Parameters
    ----------
    datapath : :class:`str`
        Full path to the directory containing tile files.
    maxpass : :class:`int`, optional
        Search for pass numbers up to this value (default 4).
    hdu : :class:`int` or :class:`str`, optional
        Read a data table from this HDU (default 'FIBERASSIGN').
    q3c : :class:`bool`, optional
        If set, create q3c index on the table.
    latest_epoch : :class:`bool`, optional
        If set, search for the latest tile file among several epochs.
    last_column : :class:`str`, optional
        Do not load columns past this name (default 'SUBPRIORITY').
    """
    fiberpath = os.path.join(datapath, 'tile*.fits')
    log.info("Using tile file search path: %s.", fiberpath)
    tile_files = glob.glob(fiberpath)
    if len(tile_files) == 0:
        log.error("No tile files found!")
        return
    log.info("Found %d tile files.", len(tile_files))
    #
    # Find the latest epoch for every tile file.
    #
    latest_tiles = dict()
    if latest_epoch:
        tileidre = re.compile(r'/(\d+)/fiberassign/tile-(\d+)\.fits$')
        for f in tile_files:
            m = tileidre.search(f)
            if m is None:
                log.error("Could not match %s!", f)
                continue
            epoch, tileid = map(int, m.groups())
            if tileid in latest_tiles:
                if latest_tiles[tileid][0] < epoch:
                    latest_tiles[tileid] = (epoch, f)
            else:
                latest_tiles[tileid] = (epoch, f)
    else:
        for f in tile_files:
            #
            # tile_TILEID.fits or tile-TILEID.fits
            #
            tileid = int(re.match(r'tile[\-_](\d+)\.fits',
                                  os.path.basename(f))[1])
            latest_tiles[tileid] = (0, f)
    log.info("Identified %d tile files for loading.", len(latest_tiles))
    #
    # Read the identified tile files.
    #
    data_index = None
    for tileid in latest_tiles:
        epoch, f = latest_tiles[tileid]
        with fits.open(f) as hdulist:
            data = hdulist[hdu].data
        log.info("Read data from %s HDU %s", f, hdu)
        for col in data.names[:data_index]:
            if data[col].dtype.kind == 'f':
                bad = np.isnan(data[col])
                if np.any(bad):
                    nbad = bad.sum()
                    log.warning("%d rows of bad data detected in column " +
                                "%s of %s.", nbad, col, f)
                    #
                    # This replacement may be deprecated in the future.
                    #
                    if col in ('TARGET_RA', 'TARGET_DEC',
                               'DESIGN_X', 'DESIGN_Y'):
                        data[col][bad] = -9999.0
                assert not np.any(np.isnan(data[col]))
                assert np.all(np.isfinite(data[col]))
        n_rows = len(data)
        if data_index is None:
            data_index = data.names.index(last_column) + 1
        data_list = ([[tileid]*n_rows] +
                     [data[col].tolist() for col in data.names[:data_index]])
        data_names = (['tileid'] +
                      [col.lower() for col in data.names[:data_index]])
        log.info("Initial column conversion complete on tileid = %d.", tileid)
        data_rows = list(zip(*data_list))
        log.info("Converted columns into rows on tileid = %d.", tileid)
        dbSession.bulk_insert_mappings(FiberAssign,
                                       [dict(zip(data_names, row))
                                        for row in data_rows])
        log.info("Inserted %d rows in %s for tileid = %d.",
                 n_rows, FiberAssign.__tablename__, tileid)
        dbSession.commit()
    if q3c:
        q3c_index('fiberassign', ra='target_ra')
    return