def main(): """Entry point for the check_model script. Returns ------- :class:`int` An integer suitable for passing to :func:`sys.exit`. """ from sys import argv from argparse import ArgumentParser desc = """Check actual files against the data model for validity. """ parser = ArgumentParser(description=desc, prog=os.path.basename(argv[0])) parser.add_argument('-d', '--datamodel-dir', dest='desidatamodel', metavar='DIR', help='Override the value of DESIDATAMODEL.') parser.add_argument('-F', '--compare-files', dest='files', action='store_true', help='Compare an individual data model to an individual file.') parser.add_argument('-W', '--warning-is-error', dest='error', action='store_true', help='Data model warnings raise exceptions.') parser.add_argument('-v', '--verbose', dest='verbose', action='store_true', help='Set log level to DEBUG.') parser.add_argument('section', metavar='DIR or FILE', help='Section of the data model or individual model file.') parser.add_argument('directory', metavar='DIR or FILE', help='Check files in this top-level directory, or one individual file.') options = parser.parse_args() if options.verbose: log.setLevel(DEBUG) if 'DESIDATAMODEL' in os.environ: data_model_root = os.environ['DESIDATAMODEL'] else: if options.desidatamodel is not None: data_model_root = options.desidatamodel else: log.critical(("DESIDATAMODEL is not defined. " + "Cannot find data model files!")) return 1 log.debug("DESIDATAMODEL=%s", data_model_root) if options.files: filename = os.path.join(data_model_root, 'doc', options.section) section = os.path.join(data_model_root, 'doc', options.section.split('/')[0]) log.info("Loading individual data model: %s.", filename) files = [DataModel(filename, section)] log.info("Skipping regular expression processing.") # files[0].get_regexp(options.directory, error=options.error) log.info("Setting prototype file for %s to %s.", filename, options.directory) files[0].prototype = options.directory else: section = os.path.join(data_model_root, 'doc', options.section) log.info("Loading data model file in %s.", section) files = scan_model(section) log.info("Searching for data files in %s.", options.directory) files_to_regexp(options.directory, files, error=options.error) log.info("Identifying prototype files in %s.", options.directory) collect_files(options.directory, files) validate_prototypes(files, error=options.error) return 0
def columns(self, hdu, error=False):
    """Describe the columns of a BINTABLE HDU.

    Parameters
    ----------
    hdu : :class:`int`
        The HDU number (zero-indexed).
    error : :class:`bool`, optional
        If ``True``, failure to extract certain required metadata raises
        an exception.

    Returns
    -------
    :class:`list`
        The rows of the table.

    Raises
    ------
    :exc:`~desidatamodel.DataModelError`
        If the BINTABLE is actually a compressed image.
    :exc:`ValueError`
        If `error` and a ``TUNIT`` value does not have FITS-standard units.
    """
    hdr = self.headers[hdu]
    if 'ZBITPIX' in hdr:
        raise DataModelError(
            "HDU{0:d} is actually a compressed image!".format(hdu))
    ncol = hdr['TFIELDS']
    c = list()
    c.append(self.columns_header)
    for j in range(ncol):
        jj = '{0:d}'.format(j + 1)
        name = hdr['TTYPE' + jj].strip()
        ttype = fits_column_format(hdr['TFORM' + jj].strip())
        tunit = 'TUNIT' + jj
        if tunit in hdr:
            units = hdr[tunit].strip()
            bad_unit = self.check_unit(units, error=error)
            if bad_unit:
                log.debug("Non-standard (but acceptable) unit %s detected for column %s in HDU %d of %s.",
                          bad_unit, j, hdu, self.filename)
        else:
            units = ''
        #
        # Check TCOMMnn keyword, otherwise use TTYPE comment
        # for description.
        #
        commkey = 'TCOMM' + jj
        if commkey in hdr:
            description = escape(hdr[commkey].strip())
        else:
            description = escape(hdr.comments['TTYPE' + jj])
        c.append((name, ttype, units, description))
    return c
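

# A small, self-contained illustration (not part of the module) of the header
# cards that columns() reads for each binary-table column: TTYPEn for the
# name, TFORMn for the type, optional TUNITn for the units, and TCOMMn
# (falling back to the TTYPE comment) for the description. The column values
# below are invented for the example.
from astropy.io import fits

hdr = fits.Header()
hdr['TFIELDS'] = 2
hdr['TTYPE1'] = ('TARGETID', 'Unique target ID')
hdr['TFORM1'] = 'K'
hdr['TTYPE2'] = ('FLUX', 'Calibrated flux')
hdr['TFORM2'] = 'E'
hdr['TUNIT2'] = 'erg / (s cm2 Angstrom)'
hdr['TCOMM2'] = 'Calibrated flux in the given units'

for j in range(hdr['TFIELDS']):
    jj = '{0:d}'.format(j + 1)
    name = hdr['TTYPE' + jj].strip()
    units = hdr['TUNIT' + jj].strip() if 'TUNIT' + jj in hdr else ''
    commkey = 'TCOMM' + jj
    description = hdr[commkey].strip() if commkey in hdr else hdr.comments['TTYPE' + jj]
    print(name, hdr['TFORM' + jj], units, description)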
def image_format(self, hdr):
    """Obtain format of an image HDU.

    Parameters
    ----------
    hdr : :class:`~astropy.io.fits.Header`
        The header to parse.

    Returns
    -------
    :class:`str`
        A string describing the image format.

    Raises
    ------
    :exc:`~desidatamodel.DataModelError`
        If ``self.error`` is set and a ``BUNIT`` header with units that do
        not follow the FITS standard is detected.
    """
    n = hdr['NAXIS']
    if n == 0:
        return 'Empty HDU.'
    bitmap = {8: 'char', 16: 'int16', 32: 'int32', 64: 'int64',
              -32: 'float32', -64: 'float64'}
    if 'ZBITPIX' in hdr:
        n = hdr['ZNAXIS']
        dims = [str(hdr['ZNAXIS{0:d}'.format(k + 1)]) for k in range(n)]
        try:
            datatype = bitmap[hdr['ZBITPIX']] + ' (compressed)'
        except KeyError:
            datatype = 'BITPIX={0} (compressed)'.format(hdr['ZBITPIX'])
    else:
        dims = [str(hdr['NAXIS{0:d}'.format(k + 1)]) for k in range(n)]
        try:
            datatype = bitmap[hdr['BITPIX']]
        except KeyError:
            datatype = 'BITPIX={}'.format(hdr['BITPIX'])
    if 'BUNIT' in hdr:
        log.debug("BUNIT = '%s'", hdr['BUNIT'])
        bad_unit = self.check_unit(hdr['BUNIT'], error=self.error)
        if bad_unit:
            log.debug("Non-standard (but acceptable) unit %s detected in %s.",
                      bad_unit, self.filename)
    return 'Data: FITS image [{0}, {1}]'.format(datatype, 'x'.join(dims))
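

# Self-contained illustration of the BITPIX/NAXIS translation performed by
# image_format() above. A 2-D float32 image (BITPIX = -32, NAXIS1 = 500,
# NAXIS2 = 1000) yields 'Data: FITS image [float32, 500x1000]'. The header
# values are invented for the example.
from astropy.io import fits

hdr = fits.Header()
hdr['NAXIS'] = 2
hdr['NAXIS1'] = 500
hdr['NAXIS2'] = 1000
hdr['BITPIX'] = -32
bitmap = {8: 'char', 16: 'int16', 32: 'int32', 64: 'int64',
          -32: 'float32', -64: 'float64'}
dims = [str(hdr['NAXIS{0:d}'.format(k + 1)]) for k in range(hdr['NAXIS'])]
print('Data: FITS image [{0}, {1}]'.format(bitmap[hdr['BITPIX']],
                                           'x'.join(dims)))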
def get_regexp(self, root, error=False):
    """Obtain the regular expression used to match files on disk.

    Parameters
    ----------
    root : :class:`str`
        Path to real files on disk.
    error : :class:`bool`, optional
        If ``True``, failure to find a regular expression raises an
        exception instead of just a warning.

    Returns
    -------
    regular expression
        The regular expression found, or ``None`` if not found.
        The regular expression is also stored internally.

    Raises
    ------
    :exc:`~desidatamodel.DataModelError`
        If `error` is set and problems with the data model file are
        detected.
    """
    with open(self.filename) as dm:
        for line in dm.readlines():
            if line.startswith('See :doc:'):
                self.ref = self._cross_reference(line)
                log.debug("Cross reference detected %s -> %s.",
                          self.filename, self.ref)
                break
            if self._regexpline.match(line) is not None:
                d = os.path.dirname(self.filename).replace(self.section, root)
                for k in self._d2r:
                    d = d.replace(k, self._d2r[k])
                r = line.strip().split()[1].replace('``', '')
                self.regexp = re.compile(os.path.join(d, r))
                break
    if self.regexp is None and self.ref is not None:
        with open(self.ref) as dm:
            for line in dm.readlines():
                #
                # Hopefully cross-references are not nested.
                #
                # if line.startswith('See :doc:'):
                #     self.ref = self._cross_reference(line)
                #     break
                if self._regexpline.match(line) is not None:
                    d = os.path.dirname(self.filename).replace(self.section,
                                                               root)
                    for k in self._d2r:
                        d = d.replace(k, self._d2r[k])
                    r = line.strip().split()[1].replace('``', '')
                    self.regexp = re.compile(os.path.join(d, r))
                    break
    if self.regexp is None:
        m = "%s has no file regexp!"
        if error:
            log.critical(m, self.filename)
            raise DataModelError(m % self.filename)
        else:
            log.warning(m, self.filename)
    return self.regexp
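

# Illustration (all values invented, and the exact marker matched by
# _regexpline is an assumption) of how get_regexp() turns a regexp line from
# a model file into a compiled pattern: the second whitespace-delimited token
# is taken, the double backticks are stripped, and the model-file directory
# is mapped onto the data directory before compiling.
import os
import re

line = 'Regexp: ``zbest-64-[0-9]+\\.fits``'
model_dir = '/datamodel/doc/DESI_SPECTRO_REDUX/SPECPROD/spectra-64'
section = '/datamodel/doc/DESI_SPECTRO_REDUX'
root = '/global/cfs/cdirs/desi/spectro/redux'

d = model_dir.replace(section, root)
r = line.strip().split()[1].replace('``', '')
regexp = re.compile(os.path.join(d, r))
print(regexp.pattern)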
def main(): """Entry point for the check_model script. Returns ------- :class:`int` An integer suitable for passing to :func:`sys.exit`. """ from sys import argv from argparse import ArgumentParser desc = """Check actual files against the data model for validity. """ parser = ArgumentParser(description=desc, prog=os.path.basename(argv[0])) parser.add_argument('-d', '--datamodel-dir', dest='desidatamodel', metavar='DIR', help='Override the value of DESIDATAMODEL.') parser.add_argument( '-F', '--compare-files', dest='files', action='store_true', help='Compare an individual data model to an individual file.') parser.add_argument('-W', '--warning-is-error', dest='error', action='store_true', help='Data model warnings raise exceptions.') parser.add_argument('-v', '--verbose', dest='verbose', action='store_true', help='Set log level to DEBUG.') parser.add_argument( 'section', metavar='DIR or FILE', help='Section of the data model or individual model file.') parser.add_argument( 'directory', metavar='DIR or FILE', help='Check files in this top-level directory, or one individual file.' ) options = parser.parse_args() if options.verbose: log.setLevel(DEBUG) if 'DESIDATAMODEL' in os.environ: data_model_root = os.environ['DESIDATAMODEL'] else: if options.desidatamodel is not None: data_model_root = options.desidatamodel else: log.critical(("DESIDATAMODEL is not defined. " + "Cannot find data model files!")) return 1 log.debug("DESIDATAMODEL=%s", data_model_root) if options.files: filename = os.path.join(data_model_root, 'doc', options.section) section = os.path.join(data_model_root, 'doc', options.section.split('/')[0]) log.info("Loading individual data model: %s.", filename) files = [DataModel(filename, section)] log.info("Skipping regular expression processing.") # files[0].get_regexp(options.directory, error=options.error) log.info("Setting prototype file for %s to %s.", filename, options.directory) files[0].prototype = options.directory else: section = os.path.join(data_model_root, 'doc', options.section) log.info("Loading data model file in %s.", section) files = scan_model(section) log.info("Searching for data files in %s.", options.directory) files_to_regexp(options.directory, files, error=options.error) log.info("Identifying prototype files in %s.", options.directory) collect_files(options.directory, files) validate_prototypes(files, error=options.error) return 0
def extract_metadata(self, error=False):
    """Extract metadata from a data model file.

    Parameters
    ----------
    error : :class:`bool`, optional
        If ``True``, failure to extract certain required metadata raises an
        exception.

    Returns
    -------
    :class:`list`
        Metadata in a form similar to :class:`~desidatamodel.stub.Stub`
        metadata.

    Raises
    ------
    :exc:`~desidatamodel.DataModelError`
        If `error` is set and the HDU has no ``EXTNAME`` keyword.
    """
    metafile = self.filename
    if self.ref is not None:
        metafile = self.ref
    if self._metafile_data is None:
        with open(metafile) as f:
            self._metafile_data = f.read()
    lines = self._metafile_data.split('\n')
    hdu_sections = [i for i, l in enumerate(lines)
                    if (self._hduline.match(l) is not None or
                        self._hduspan.match(l) is not None)]
    self.hdumeta = list()
    for k in range(len(hdu_sections)):
        try:
            section = lines[hdu_sections[k]:hdu_sections[k + 1]]
        except IndexError:
            section = lines[hdu_sections[k]:]
        m = self._hduspan.match(section[0])
        if m is not None:
            #
            # Detected HDU span.
            #
            g = m.groups()
            spanstart = int(g[0])
            spanend = int(g[1])
            log.debug('Detected range specification from HDU %d to HDU %d',
                      spanstart, spanend)
            spanref = [l for l in section if l.startswith('Data:')][0]
            spanext = spanref[spanref.lower().index('see') + 4:].replace('.', '')
            spanmeta = [m for m in self.hdumeta if m['extname'] == spanext][0]
            spanname = [l.split('=')[1].strip() for l in section
                        if l.startswith('EXTNAME = ')][0]
            extnames = [p.strip() for p in spanname.split(',')]
            if len(range(spanstart, spanend + 1)) == len(extnames):
                for i, l in enumerate(range(spanstart, spanend + 1)):
                    meta = dict()
                    meta['title'] = 'HDU{0:d}'.format(l)
                    meta['extname'] = extnames[i]
                    meta['extension'] = spanmeta['extension']
                    meta['format'] = spanmeta['format']
                    meta['keywords'] = spanmeta['keywords']
                    self.hdumeta.append(meta)
            else:
                log.warning(('Range specification from HDU %d to HDU %d ' +
                             'does not have a matching EXTNAME specification'),
                            spanstart, spanend)
            continue
        meta = dict()
        meta['title'] = section[0]
        if 'Empty HDU.' in section:
            meta['extension'] = 'IMAGE'
            meta['format'] = 'Empty HDU.'
        image_data = [l for l in section if l.startswith('Data:')]
        if image_data:
            meta['extension'] = 'IMAGE'
            meta['format'] = image_data[0]
        try:
            rdtc = section.index('Required Data Table Columns')
        except ValueError:
            rdtc = None
        if rdtc is not None:
            meta['extension'] = 'BINTABLE'
            table = [i for i, l in enumerate(section[rdtc:])
                     if self._tableboundary.match(l) is not None][1:3]
            columns = list(map(len, section[rdtc:][table[0]].split()))
            table_lines = section[rdtc:][table[0] + 1:table[1]]
            meta['format'] = [self._extract_columns(t, columns)
                              for t in table_lines]
            for mk in meta['format']:
                if not mk[1]:
                    m = "Missing type for column %s in HDU %d of %s!"
                    if error:
                        log.critical(m, mk[0], k, metafile)
                        raise DataModelError(m % (mk[0], k, metafile))
                    else:
                        log.warning(m, mk[0], k, metafile)
                if mk[2]:
                    bad_unit = self.check_unit(mk[2], error=error)
                    if bad_unit:
                        log.debug("Non-standard (but acceptable) unit %s detected for column %s in HDU %d of %s.",
                                  bad_unit, mk[0], k, metafile)
        try:
            rhk = section.index('Required Header Keywords')
        except ValueError:
            meta['keywords'] = []
        else:
            table = [i for i, l in enumerate(section[rhk:])
                     if self._tableboundary.match(l) is not None][1:3]
            columns = list(map(len, section[rhk:][table[0]].split()))
            table_lines = section[rhk:][table[0] + 1:table[1]]
            meta['keywords'] = [self._extract_columns(t, columns)
                                for t in table_lines]
            for mk in meta['keywords']:
                if not mk[2]:
                    m = "Missing type for keyword %s in HDU %d of %s!"
                    if error:
                        log.critical(m, mk[0], k, metafile)
                        raise DataModelError(m % (mk[0], k, metafile))
                    else:
                        log.warning(m, mk[0], k, metafile)
                if mk[0] == 'BUNIT':
                    bad_unit = self.check_unit(mk[1], error=error)
                    if bad_unit:
                        log.debug("Non-standard (but acceptable) unit %s detected for column %s in HDU %d of %s.",
                                  bad_unit, mk[0], k, metafile)
        #
        # Need to know the format by this point!
        #
        try:
            foo = meta['format']
        except KeyError:
            m = "Unable to determine format for HDU %d in %s!"
            log.critical(m, k, metafile)
            raise DataModelError(m % (k, metafile))
        #
        # See https://github.com/desihub/desidatamodel/issues/69 for
        # the detailed policy on EXTNAME.
        #
        try:
            meta['extname'] = [l.split()[2] for l in section
                               if l.startswith('EXTNAME = ')][0]
        except IndexError:
            meta['extname'] = ''
            if (k > 0 or (k == 0 and meta['format'] != 'Empty HDU.')):
                m = "HDU %d in %s has no EXTNAME!"
                if error:
                    log.critical(m, k, metafile)
                    raise DataModelError(m % (k, metafile))
                else:
                    log.warning(m, k, metafile)
            else:
                if k == 0 and meta['format'] == 'Empty HDU.':
                    if len(meta['keywords']) > 0:
                        m = "HDU %d in %s should have EXTNAME = 'PRIMARY'."
                        log.warning(m, k, metafile)
        else:
            #
            # If we reach here, meta['extname'] *is* defined.
            #
            if k == 0:
                if meta['format'] == 'Empty HDU.':
                    if len(meta['keywords']) > 0 and meta['extname'] != 'PRIMARY':
                        m = "HDU %d in %s has acceptable alternative EXTNAME = '%s'."
                        log.debug(m, k, metafile, meta['extname'])
                else:
                    if meta['extname'] == 'PRIMARY':
                        m = "HDU %d in %s should have a more meaningful EXTNAME than 'PRIMARY'."
                        log.warning(m, k, metafile)
        self.hdumeta.append(meta)
    return self.hdumeta
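

# For reference, each entry appended to self.hdumeta above is a plain dict.
# A BINTABLE HDU described in a model file ends up looking roughly like the
# sketch below; the column and keyword values are invented for illustration.
# Column tuples follow the (name, type, units, description) layout of the
# Required Data Table Columns table, and keyword tuples follow the
# (keyword, example value, type, comment) layout of the Required Header
# Keywords table, matching the mk[...] indexing used above.
example_hdumeta_entry = {
    'title': 'HDU1',
    'extname': 'ZBEST',
    'extension': 'BINTABLE',
    'format': [('TARGETID', 'int64', '', 'Unique target ID')],
    'keywords': [('NAXIS2', '1000', 'int', 'Number of rows')],
}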
def load_zbest(datapath=None, hdu='ZBEST', q3c=False):
    """Load zbest files into the zcat table.

    This function is deprecated since there should now be a single
    redshift catalog file.

    Parameters
    ----------
    datapath : :class:`str`
        Full path to the directory containing zbest files.
    hdu : :class:`int` or :class:`str`, optional
        Read a data table from this HDU (default 'ZBEST').
    q3c : :class:`bool`, optional
        If set, create q3c index on the table.
    """
    if datapath is None:
        datapath = specprod_root()
    zbestpath = os.path.join(datapath, 'spectra-64', '*', '*',
                             'zbest-64-*.fits')
    log.info("Using zbest file search path: %s.", zbestpath)
    zbest_files = glob.glob(zbestpath)
    if len(zbest_files) == 0:
        log.error("No zbest files found!")
        return
    log.info("Found %d zbest files.", len(zbest_files))
    #
    # Read the identified zbest files.
    #
    for f in zbest_files:
        brickname = os.path.basename(os.path.dirname(f))
        with fits.open(f) as hdulist:
            data = hdulist[hdu].data
        log.info("Read data from %s HDU %s.", f, hdu)
        good_targetids = ((data['TARGETID'] != 0) & (data['TARGETID'] != -1))
        #
        # If there are too many targetids, the in_ clause will blow up.
        # Disabling this test, and crossing fingers.
        #
        # q = dbSession.query(ZCat).filter(ZCat.targetid.in_(data['TARGETID'].tolist())).all()
        # if len(q) != 0:
        #     log.warning("Duplicate TARGETID found in %s.", f)
        #     for z in q:
        #         log.warning("Duplicate TARGETID = %d.", z.targetid)
        #         good_targetids = good_targetids & (data['TARGETID'] != z.targetid)
        data_list = [data[col][good_targetids].tolist() for col in data.names]
        data_names = [col.lower() for col in data.names]
        log.info("Initial column conversion complete on brick = %s.",
                 brickname)
        #
        # Expand COEFF
        #
        col = 'COEFF'
        expand = ('coeff_0', 'coeff_1', 'coeff_2', 'coeff_3', 'coeff_4',
                  'coeff_5', 'coeff_6', 'coeff_7', 'coeff_8', 'coeff_9',)
        i = data_names.index(col.lower())
        del data_names[i]
        del data_list[i]
        for j, n in enumerate(expand):
            log.debug("Expanding column %d of %s (at index %d) to %s.",
                      j, col, i, n)
            data_names.insert(i + j, n)
            data_list.insert(i + j, data[col][:, j].tolist())
        log.debug(data_names)
        #
        # zbest files don't contain the same columns as zcatalog.
        #
        for col in ZCat.__table__.columns:
            if col.name not in data_names:
                data_names.append(col.name)
                data_list.append([0] * len(data_list[0]))
        data_rows = list(zip(*data_list))
        log.info("Converted columns into rows on brick = %s.", brickname)
        try:
            dbSession.bulk_insert_mappings(ZCat, [dict(zip(data_names, row))
                                                  for row in data_rows])
        except IntegrityError as e:
            log.error("Integrity Error detected!")
            log.error(e)
            dbSession.rollback()
        else:
            log.info("Inserted %d rows in %s for brick = %s.",
                     len(data_rows), ZCat.__tablename__, brickname)
            dbSession.commit()
    if q3c:
        q3c_index('zcat')
    return
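

# Self-contained sketch of the COEFF expansion performed above: a single
# array-valued column is replaced by one scalar column per element, keeping
# the original column position. The toy data below are invented.
import numpy as np

data_names = ['targetid', 'coeff', 'z']
data_list = [[1, 2], np.arange(20).reshape(2, 10), [0.5, 1.2]]

i = data_names.index('coeff')
del data_names[i]
coeff = data_list.pop(i)
for j in range(10):
    data_names.insert(i + j, 'coeff_{0:d}'.format(j))
    data_list.insert(i + j, coeff[:, j].tolist())

print(data_names[:4])   # ['targetid', 'coeff_0', 'coeff_1', 'coeff_2']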
def load_file(filepath, tcls, hdu=1, expand=None, convert=None, index=None,
              rowfilter=None, q3c=False, chunksize=50000, maxrows=0):
    """Load a data file into the database, assuming that column names map
    to database column names with no surprises.

    Parameters
    ----------
    filepath : :class:`str`
        Full path to the data file.
    tcls : :class:`sqlalchemy.ext.declarative.api.DeclarativeMeta`
        The table to load, represented by its class.
    hdu : :class:`int` or :class:`str`, optional
        Read a data table from this HDU (default 1).
    expand : :class:`dict`, optional
        If set, map FITS column names to one or more alternative column names.
    convert : :class:`dict`, optional
        If set, convert the data for a named (database) column using the
        supplied function.
    index : :class:`str`, optional
        If set, add a column that just counts the number of rows.
    rowfilter : callable, optional
        If set, apply this filter to the rows to be loaded.  The function
        should return :class:`bool`, with ``True`` meaning a good row.
    q3c : :class:`bool`, optional
        If set, create q3c index on the table.
    chunksize : :class:`int`, optional
        If set, load database `chunksize` rows at a time (default 50000).
    maxrows : :class:`int`, optional
        If set, stop loading after `maxrows` are loaded.  Alternatively,
        set `maxrows` to zero (0) to load all rows.
    """
    tn = tcls.__tablename__
    if filepath.endswith('.fits'):
        with fits.open(filepath) as hdulist:
            data = hdulist[hdu].data
    elif filepath.endswith('.ecsv'):
        data = Table.read(filepath, format='ascii.ecsv')
    else:
        log.error("Unrecognized data file, %s!", filepath)
        return
    if maxrows == 0:
        maxrows = len(data)
    log.info("Read data from %s HDU %s", filepath, hdu)
    try:
        colnames = data.names
    except AttributeError:
        colnames = data.colnames
    for col in colnames:
        if data[col].dtype.kind == 'f':
            bad = np.isnan(data[col][0:maxrows])
            if np.any(bad):
                nbad = bad.sum()
                log.warning("%d rows of bad data detected in column " +
                            "%s of %s.", nbad, col, filepath)
                #
                # Temporary workaround for bad flux values, see
                # https://github.com/desihub/desitarget/issues/397
                #
                if col in ('FLUX_R', 'FIBERFLUX_R', 'FIBERTOTFLUX_R'):
                    data[col][0:maxrows][bad] = -9999.0
    log.info("Integrity check complete on %s.", tn)
    if rowfilter is None:
        # np.bool was removed from newer numpy releases; the builtin bool
        # is equivalent here.
        good_rows = np.ones((maxrows,), dtype=bool)
    else:
        good_rows = rowfilter(data[0:maxrows])
    data_list = [data[col][0:maxrows][good_rows].tolist() for col in colnames]
    data_names = [col.lower() for col in colnames]
    finalrows = len(data_list[0])
    log.info("Initial column conversion complete on %s.", tn)
    if expand is not None:
        for col in expand:
            i = data_names.index(col.lower())
            if isinstance(expand[col], str):
                #
                # Just rename a column.
                #
                log.debug("Renaming column %s (at index %d) to %s.",
                          data_names[i], i, expand[col])
                data_names[i] = expand[col]
            else:
                #
                # Assume this is an expansion of an array-valued column
                # into individual columns.
                #
                del data_names[i]
                del data_list[i]
                for j, n in enumerate(expand[col]):
                    log.debug("Expanding column %d of %s (at index %d) to %s.",
                              j, col, i, n)
                    data_names.insert(i + j, n)
                    data_list.insert(i + j, data[col][:, j].tolist())
                log.debug(data_names)
        log.info("Column expansion complete on %s.", tn)
    del data
    if convert is not None:
        for col in convert:
            i = data_names.index(col)
            data_list[i] = [convert[col](x) for x in data_list[i]]
    log.info("Column conversion complete on %s.", tn)
    if index is not None:
        data_list.insert(0, list(range(1, finalrows + 1)))
        data_names.insert(0, index)
        log.info("Added index column '%s'.", index)
    data_rows = list(zip(*data_list))
    del data_list
    log.info("Converted columns into rows on %s.", tn)
    for k in range(finalrows // chunksize + 1):
        data_chunk = [dict(zip(data_names, row))
                      for row in data_rows[k * chunksize:(k + 1) * chunksize]]
        if len(data_chunk) > 0:
            engine.execute(tcls.__table__.insert(), data_chunk)
            log.info("Inserted %d rows in %s.",
                     min((k + 1) * chunksize, finalrows), tn)
    # for k in range(finalrows//chunksize + 1):
    #     data_insert = [dict([(col, data_list[i].pop(0))
    #                          for i, col in enumerate(data_names)])
    #                    for j in range(chunksize)]
    #     session.bulk_insert_mappings(tcls, data_insert)
    #     log.info("Inserted %d rows in %s..",
    #              min((k+1)*chunksize, finalrows), tn)
    #     session.commit()
    # dbSession.commit()
    if q3c:
        q3c_index(tn)
    return
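

# A usage sketch for load_file(), kept hypothetical: the HDU name, the column
# names passed to expand/convert/rowfilter, and the use of ZCat as the target
# table are examples only, but the keyword arguments follow the docstring
# above. The wrapper is defined, not executed.
def load_example_zcatalog(zcatalog_file):
    """Hypothetical wrapper showing typical load_file() arguments."""
    coeff_columns = tuple('coeff_{0:d}'.format(j) for j in range(10))
    load_file(zcatalog_file, ZCat, hdu='ZCATALOG',
              expand={'COEFF': coeff_columns},
              convert={'spectype': lambda x: x.strip()},
              rowfilter=lambda rows: rows['TARGETID'] > 0,
              chunksize=10000, q3c=True)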