def update_full_archfiles_db3(dat, logger, msid_files, opt):
    # Update the archfiles.db3 database to include the associated archive files
    server_file = msid_files['archfiles'].abs
    logger.debug(f'Updating {server_file}')

    def as_python(val):
        try:
            return val.item()
        except AttributeError:
            return val

    with timing_logger(logger, f'Updating {server_file}', 'info', 'info'):
        with DBI(dbi='sqlite', server=server_file) as db:
            for archfile in dat['archfiles']:
                vals = {name: as_python(archfile[name]) for name in archfile.dtype.names}
                logger.debug(f'Inserting {vals["filename"]}')
                if not opt.dry_run:
                    try:
                        db.insert(vals, 'archfiles')
                    except sqlite3.IntegrityError as err:
                        # Expected exception for archfiles already in the table
                        assert 'UNIQUE constraint failed: archfiles.filename' in str(err)

            if not opt.dry_run:
                db.commit()
def add_cmd(self, **cmd):
    """
    Add command in correct order to the commands list.

    TODO: use scs and step for further sorting?
    """
    cmd_date = cmd["date"]

    logger.debug("Adding command %s", cmd)

    # Prevent adding command before current command since the command
    # interpreter is a one-pass process.
    if cmd_date < self.date:
        raise ValueError("cannot insert command {} prior to current command {}"
                         .format(cmd, self.curr_cmd))

    # Insert command at first place where new command date is strictly
    # less than existing command date.  This implementation is linear, and
    # could be improved, though in practice commands are often inserted
    # close to the original.
    cmds = self.cmds
    for i_cmd in range(self.i_cmd + 1, len(cmds)):
        if cmd_date < cmds[i_cmd]["date"]:
            cmds.insert(i_cmd, cmd)
            break
    else:
        cmds.append(cmd)
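# Standalone illustration (plain dicts, made-up dates) of the for/else insertion
# pattern used in add_cmd: scan forward for the first later command, insert there,
# and fall through to append when the new command is the latest of all.
cmds = [{"date": "2022:001"}, {"date": "2022:003"}, {"date": "2022:005"}]
new_cmd = {"date": "2022:004"}

for i_cmd in range(len(cmds)):
    if new_cmd["date"] < cmds[i_cmd]["date"]:
        cmds.insert(i_cmd, new_cmd)
        break
else:
    cmds.append(new_cmd)

assert [c["date"] for c in cmds] == ["2022:001", "2022:003", "2022:004", "2022:005"]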
def _get_stat_data_from_archive(filename, stat, tstart, tstop, last_row1, logger):
    """
    Return stat table rows in the range tstart <= time < tstop.

    Also returns the corresponding table row indexes.

    :param filename: HDF5 file to read
    :param stat: stat (5min or daily)
    :param tstart: min time
    :param tstop: max time
    :param last_row1: row1 for previous index table entry
    :param logger: logger
    :return: table rows, row0, row1
    """
    dt = STATS_DT[stat]

    logger.debug(f'_get_stat_data({filename}, {stat}, {DateTime(tstart).fits}, '
                 f'{DateTime(tstop).fits}, {last_row1})')

    with tables.open_file(filename, 'r') as h5:
        # Check if tstart is beyond the end of the table.  If so, return an empty table
        table = h5.root.data
        last_index = table[-1]['index']
        last_time = (last_index + 0.5) * dt
        if tstart > last_time:
            logger.debug(f'No available stats data {DateTime(tstart).fits} > '
                         f'{DateTime(last_time).fits} (returning empty table)')
            row0 = row1 = len(table)
            table_rows = table[row0:row1]
        else:
            # Compute approx number of rows from the end for tstart.  Normally the index value
            # goes in lock step with row, but it can happen that an index is missed because of
            # missing data.  But if we back up by delta_rows, we are guaranteed to get to at
            # least the row corresponding to tstart.
            delta_rows = int((last_time - tstart) / dt) + 10

            times = (table[-delta_rows:]['index'] + 0.5) * dt

            # In the worst case of starting to sync a client archive for a rarely-sampled
            # content like cpe1eng or pcad7eng (AOSPASA2CV), we need to include an extra ``dt``
            # on both ends to ensure that the first / last rows are caught.  If the last
            # full-res sample is either before or after the stat mid-point timestamp then the
            # stat sample may get dropped.  This happened in real life for AOSPASA2CV.
            # Having extra rows on the front is OK because they just get clipped, and an extra
            # row on the back is OK because of clipping on the next update (and in normal
            # processing we always want the sync archive to have all recent data).
            sub_row0, sub_row1 = np.searchsorted(times, [tstart - dt, tstop + dt])
            sub_row_offset = len(table) - delta_rows

            row0 = sub_row0 + sub_row_offset
            row1 = sub_row1 + sub_row_offset

            # If we have the last value of row1 (from previous sync entry) then use
            # that instead of the computed value for row0.
            if last_row1 is not None:
                row0 = last_row1

            table_rows = table[row0:row1]  # returns np.ndarray (structured array)

    return table_rows, row0, row1
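# Illustrative sketch (NumPy only, made-up numbers) of the index arithmetic above:
# stat ``index`` values map to bin mid-point times via (index + 0.5) * dt, and the
# extra ``dt`` of padding widens the searchsorted window so boundary bins are kept.
import numpy as np

dt = 328.0                                  # assumed 5min stat bin width in seconds
index = np.array([1000, 1001, 1003, 1004])  # note the gap at 1002 (missing data)
times = (index + 0.5) * dt                  # bin mid-point times

tstart, tstop = 1001.2 * dt, 1003.8 * dt
row0, row1 = np.searchsorted(times, [tstart - dt, tstop + dt])
print(index[row0:row1])                     # -> [1000 1001 1003 1004] (edge bins kept)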
def sync_full_archive(opt, msid_files, logger, content, index_tbl):
    """
    Sync the archive for ``content``.

    :param opt:
    :param msid_files:
    :param logger:
    :param content:
    :param index_tbl: index of sync file entries
    :return:
    """
    # Get the last row of data from the length of the TIME.col (or archfiles?)
    ft = fetch.ft
    ft['content'] = content
    ft['msid'] = 'TIME'
    ft['interval'] = 'full'

    # If no TIME.h5 file then no point in going further
    time_file = Path(msid_files['msid'].abs)
    if not time_file.exists():
        logger.debug(f'Skipping full data for {content}: no {time_file} file')
        return

    logger.info('')
    logger.info(f'Processing full data for {content}')

    # Get the 0-based index of last available full data row
    with tables.open_file(str(time_file), 'r') as h5:
        last_row_idx = len(h5.root.data) - 1

    # Look for index table rows that have new data => the row ends after the last existing
    # data.  Note: row0 and row1 correspond to the slice row0:row1, so up to but
    # not including the row indexed row1 (0-based).  So for 3 existing rows,
    # last_row_idx=2 so to get the new row with index=3 you need row1=4, or equivalently
    # row1 > n_rows.  By def'n we know that row0 <= 3 at this point.
    ok = index_tbl['row1'] > last_row_idx + 1

    if np.count_nonzero(ok) == 0:
        logger.info(f'No new sync data for {content}: no new rows in index table')

    index_tbl = index_tbl[ok]

    try:
        dats = get_full_data_sets(ft, index_tbl, logger, opt)
    except urllib.error.URLError as err:
        if 'timed out' in str(err):
            msg = f' ERROR: timed out getting full data for {content}'
            logger.error(msg)
            process_errors.append(msg)
            dats = []
        else:
            raise

    if dats:
        dat, msids = concat_data_sets(dats, ['data', 'quality'])
        with DelayedKeyboardInterrupt(logger):
            update_full_h5_files(dat, logger, msid_files, msids, opt)
            update_full_archfiles_db3(dat, logger, msid_files, opt)
def main():
    global opt, ft, msid_files, logger

    opt, args = get_options()
    ft = fetch.ft
    msid_files = pyyaks.context.ContextDict('add_derived.msid_files',
                                            basedir=opt.data_root)
    msid_files.update(file_defs.msid_files)
    logger = pyyaks.logger.get_logger(name='engarchive', level=pyyaks.logger.VERBOSE,
                                      format="%(asctime)s %(message)s")

    # Get the derived parameter classes
    dp_classes = (getattr(derived, x) for x in dir(derived) if x.startswith('DP_'))
    dp_classes = [x for x in dp_classes
                  if hasattr(x, '__base__') and issubclass(x, derived.DerivedParameter)]

    content_defs = {}
    for dp_class in dp_classes:
        colname = dp_class.__name__.upper()
        dp = dp_class()
        content = dp.content
        if opt.content == [] or any(re.match(x + r'\d+', content) for x in opt.content):
            dpd = content_defs.setdefault(content, {})
            dpd.setdefault('classes', {'TIME': None})
            dpd['content'] = content
            dpd['classes'][colname] = dp_class
            dpd['mnf_step'] = dp.mnf_step
            dpd['time_step'] = dp.time_step

    for content, content_def in content_defs.items():
        ft['content'] = content
        logger.info('CONTENT = {}'.format(content))

        # Make content directory
        if not os.path.exists(msid_files['contentdir'].rel):
            logger.info('Making directory {}'.format(msid_files['contentdir'].rel))
            os.mkdir(msid_files['contentdir'].rel)

        # Make the archfiles.db3 file (if needed)
        make_archfiles_db(msid_files['archfiles'].abs, content_def)

        for colname in content_def['classes']:
            ft['msid'] = colname
            logger.debug('MSID = {}'.format(colname))
            # Create colnames and colnames_all pickle files (if needed) and add colname
            add_colname(msid_files['colnames'].rel, colname)
            add_colname(msid_files['colnames_all'].rel, colname)

            make_msid_file(colname, content, content_def)

        add_colname(msid_files['colnames_all'].rel, 'QUALITY')
def __set__(self, SC, value):
    date = SC.date
    logger.debug("%s %s=%s", date, self.name, value)
    self.value = value
    self.values.append(value)
    self.dates.resize(len(self.values))
    self.dates[-1] = date
    SC.set_state_value(date, self.name, value)
def _sync_stat_archive(opt, msid_files, logger, content, stat, index_tbl):
    """
    Actual worker for syncing the stat archive for ``content``.
    """
    # Get the last row of data from the length of the TIME.col (or archfiles?)
    ft = fetch.ft
    ft['content'] = content
    ft['interval'] = stat

    stats_dir = Path(msid_files['statsdir'].abs)
    if not stats_dir.exists():
        logger.debug(f'Skipping {stat} data for {content}: no directory')
        return

    logger.info('')
    logger.info(f'Processing {stat} data for {content}')

    # Get the MSIDs that are in client archive
    msids = [str(fn.name)[:-3] for fn in stats_dir.glob('*.h5')]
    if not msids:
        logger.debug(f'Skipping {stat} data for {content}: no stats h5 files')
        return
    else:
        logger.debug(f'Stat msids are {msids}')

    last_date_id, last_date_id_file = get_last_date_id(
        msid_files, msids, stat, logger)
    logger.verbose(f'Got {last_date_id} as last date_id that was applied to archive')

    # Get list of applicable dat objects (new data, before opt.date_stop).  Also
    # return ``date_id`` which is the date_id of the final data set in the list.
    # This will be written as the new ``last_date_id``.
    try:
        dats, date_id = get_stat_data_sets(ft, index_tbl, last_date_id, logger, opt)
    except urllib.error.URLError as err:
        if 'timed out' in str(err):
            msg = f' ERROR: timed out getting {stat} data for {content}'
            logger.error(msg)
            process_errors.append(msg)
            return
        else:
            raise

    if not dats:
        return

    dat, msids = concat_data_sets(dats, ['data'])
    with DelayedKeyboardInterrupt(logger):
        with timing_logger(logger, f'Applying updates to {len(msids)} h5 files'):
            for msid in msids:
                fetch.ft['msid'] = msid
                stat_file = msid_files['stats'].abs
                if os.path.exists(stat_file):
                    append_stat_col(dat, stat_file, msid, date_id, opt, logger)

            logger.debug(f'Updating {last_date_id_file} with {date_id}')
            if not opt.dry_run:
                with open(last_date_id_file, 'w') as fh:
                    fh.write(f'{date_id}')
def func_depend(dep, deptype):
    """
    For a ``dep`` given as (func, args, kwargs), evaluate func(*args, **kwargs)
    in boolean context.

    For the ``depends`` list a func() return of False raises an exception
    indicating that the task dependencies are not met.  For ``targets`` a
    func() return of False results in check_depend returning False.
    """
    if isinstance(dep, (list, tuple)):
        func, args, kwargs = dep
        if func(*args, **kwargs):
            logger.debug('Func %s succeeded' % func.__name__)
        else:
            logger.debug('Func %s failed' % func.__name__)
            if deptype == 'depends':
                raise DependFuncFailure('Depend function %s false' % func.__name__)
            else:
                return False
def get_last_date_id(msid_files, msids, stat, logger):
    """
    Get the last date_id used for syncing the client archive.  First try the
    last_date_id file.  If this does not exist then infer a reasonable value
    by looking at stat data for ``msids``.

    :param msid_files:
    :param msids:
    :param stat:
    :param logger:
    :return: last_date_id, last_date_id_file
    """
    last_date_id_file = msid_files['last_date_id'].abs

    if Path(last_date_id_file).exists():
        logger.verbose(f'Reading {last_date_id_file} to get last update time')
        with open(last_date_id_file, 'r') as fh:
            last_date_id = fh.read()
    else:
        logger.verbose('Reading stat h5 files to get last update time')
        times = []
        for msid in msids:
            fetch.ft['msid'] = msid
            filename = msid_files['stats'].abs
            logger.debug(f'Reading {filename} to check stat times')
            with tables.open_file(filename, 'r') as h5:
                index = h5.root.data.cols.index[-1]
                times.append((index + 0.5) * STATS_DT[stat])

        # Get the least recent stats data available and then go back 5 days to be
        # sure nothing gets missed.  Except for ephemeris files that are weird:
        # when they appear in the archive they include weeks of data in the past
        # and possibly future data.
        last_time = min(times)
        lookback = 30 if re.search(r'ephem[01]$', fetch.ft['content'].val) else 5
        last_date_id = get_date_id(DateTime(last_time - lookback * 86400).fits)

    return last_date_id, last_date_id_file
def get_cmds(timeline_loads, mp_dir='/data/mpcrit1/mplogs'):
    """
    Get backstop commands corresponding to the supplied timeline load segments.
    The timeline load segments must be ordered by 'id'.

    Return cmds in the format defined by Ska.ParseCM.read_backstop().
    """
    if np.min(np.diff(timeline_loads['id'])) < 1:
        raise ValueError('Timeline loads id not monotonically increasing')

    cmds = []
    for tl in timeline_loads:
        bs_file = Ska.File.get_globfiles(os.path.join(mp_dir + tl.mp_dir,
                                                      '*.backstop'))[0]
        if bs_file not in BACKSTOP_CACHE:
            bs_cmds = read_backstop(bs_file)
            logger.info('Read {} commands from {}'.format(len(bs_cmds), bs_file))
            BACKSTOP_CACHE[bs_file] = bs_cmds
        else:
            bs_cmds = BACKSTOP_CACHE[bs_file]

        # Only store commands for this timeline (match SCS and date)
        bs_cmds = [x for x in bs_cmds
                   if tl['datestart'] <= x['date'] <= tl['datestop']
                   and x['scs'] == tl['scs']]

        for bs_cmd in bs_cmds:
            bs_cmd['timeline_id'] = tl['id']

        logger.info(' Got {} backstop commands for timeline_id={} and SCS={}'
                    .format(len(bs_cmds), tl['id'], tl['scs']))
        cmds.extend(bs_cmds)

    # Sort by date and SCS step number.
    cmds = sorted(cmds, key=lambda y: (y['date'], y['step']))
    logger.debug('Read total of {} commands'.format(len(cmds)))

    return cmds
def append_stat_col(dat, stat_file, msid, date_id, opt, logger):
    """
    Append ``dat`` to the appropriate stats h5 file.

    :param dat:
    :param stat_file:
    :param msid:
    :param date_id:
    :param opt:
    :param logger:
    :return: None
    """
    vals = {key: dat[f'{msid}.{key}'] for key in ('data', 'row0', 'row1')}
    logger.debug(f'append_stat_col msid={msid} date_id={date_id}, '
                 f'row0,1 = {vals["row0"]} {vals["row1"]}')

    mode = 'r' if opt.dry_run else 'a'
    with tables.open_file(stat_file, mode=mode) as h5:
        last_row_idx = len(h5.root.data) - 1

        # Check if there is any new data in this chunk
        if vals['row1'] - 1 <= last_row_idx:
            logger.debug(f'Skipping {date_id} for {msid}: no new data '
                         f'row1={vals["row1"]} last_row_idx={last_row_idx}')
            return

        # If this row begins before the end of current data then chop the
        # beginning of data for this row.
        if vals['row0'] <= last_row_idx:
            idx0 = last_row_idx + 1 - vals['row0']
            logger.debug(f'Chopping {idx0} rows from data')
            vals['data'] = vals['data'][idx0:]
            vals['row0'] += idx0

        if vals['row0'] != len(h5.root.data):
            raise RowMismatchError(
                f'ERROR: unexpected discontinuity for stat msid={msid} '
                f'content={fetch.ft["content"]}\n'
                f'Looks like your archive is in a bad state, CONTACT '
                f'your local Ska expert with this info:\n'
                f'  First row0 in new data {vals["row0"]} != '
                f'length of existing data {len(h5.root.data)}')

        logger.debug(f'Appending {len(vals["data"])} rows to {stat_file}')
        if not opt.dry_run:
            h5.root.data.append(vals['data'])
def check_depend(depends=None, targets=None):
    """Check that dependencies are satisfied.

    A dependency in the ``depends`` or ``targets`` list can be either a file
    name as a string or a renderable object (file or value) with an mtime
    attribute.  A file name is treated in the usual sense of depend and target
    files.  A missing depend file raises an exception and a missing target
    means check_depend returns False.  In addition all targets must be newer
    than all depends.

    :param depends: list of file or value dependencies
    :param targets: list of file or value targets

    :returns: dependencies_satisfied, info_message
    """
    # Lists of mod time for depend and target files.  Seed the list with a
    # fake very OLD and NEW file (respectively) so the final min/max comparison
    # always works.
    mtimes = dict(depends=[1], targets=[2 ** 31])
    deptypes = dict(depends=depends, targets=targets)
    statuses = {}

    # Step through all depends and targets and determine existence and mod time.
    # Collect this status and informational messages in statuses[deptype]
    for deptype in ('depends', 'targets'):
        statuses[deptype] = []
        deps = deptypes[deptype]
        if not deps:
            continue

        for dep in deps:
            # Check if dep is not a ContextValue.  If so interpret as a filename
            if not hasattr(dep, 'mtime'):
                dep = pyyaks.context.ContextValue(val=dep, name=dep,
                                                  parent=pyyaks.context.ContextDict(basedir='.'))

            mtime = dep.mtime
            info = '%s %s %s = %s' % (deptype.title()[:-1], dep.type, dep.fullname, dep.abs)
            if mtime is None:
                statuses[deptype].append((False, info + ' does not exist'))
            else:
                statuses[deptype].append((True, info + ' (%s)' % time.ctime(mtime)))
                mtimes[deptype].append(mtime)

    # Do all depends exist?  If not raise an exception which will trigger task failure
    if not all(x[0] for x in statuses['depends']):
        msg = 'Dependencies missing:\n' + '\n'.join(x[1] for x in statuses['depends'])
        logger.debug(msg)
        raise DependMissing(msg)

    # Do all targets exist?  If not return False.  This is a normal situation
    # before the task is run but will raise an exception after the task is run.
    if not all(x[0] for x in statuses['targets']):
        msg = 'Targets missing:\n' + '\n'.join(x[1] for x in statuses['targets'])
        logger.debug(msg)
        return False, msg

    # Are all targets as old as all depends?  Allow for equality since target files could be
    # created within the same second (particularly for "touch" files).
    min_targets = min(mtimes['targets'])
    max_depends = max(mtimes['depends'])
    ok = min_targets >= max_depends

    msg = 'Depends and targets info:\n' if ok else 'Depend(s) are newer than target(s):\n'
    msg += '\n'.join(x[1] for x in (statuses['depends'] + statuses['targets']))
    logger.debug(msg)

    return ok, msg
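# A hedged usage sketch for check_depend (hypothetical file names, and assuming the
# module-level imports, logger and exception classes it relies on are present).
# A missing depend raises DependMissing, while a missing or stale target just
# returns ok=False so the caller knows the task must run.
ok, msg = check_depend(depends=['in.dat'], targets=['out.dat'])
if not ok:
    print('Task needs to run:')
    print(msg)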
def teardown(self):
    for envvar in self.env:
        del os.environ[envvar]
    os.environ.update(self.origenv)
    logger.debug('Restored local environment')
def teardown(self):
    os.chdir(self.origdir)
    logger.debug('Restored directory to "%s"' % self.origdir)
def setup(self):
    self.origenv = os.environ.copy()
    os.environ.update(self.env)
    logger.debug('Updated local environment')
def update_sync_data_stat(content, logger, row, stat):
    """
    Update stats (5min, daily) sync data for index table ``row``

    :param content: content name (e.g. acis4eng)
    :param logger: logger
    :param row: one row of the full-res index table
    :param stat: stat interval (5min or daily)
    :return:
    """
    ft = fetch.ft
    ft['interval'] = stat

    outfile = Path(sync_files['data'].abs)
    if outfile.exists():
        logger.verbose(f'Skipping {outfile}, already exists')
        return

    # First get the times corresponding to row0 and row1 in the full resolution archive
    ft['msid'] = 'TIME'
    with tables.open_file(fetch.msid_files['msid'].abs, 'r') as h5:
        table = h5.root.data
        tstart = table[row['row0']]
        # Ensure that table row1 (for tstop) doesn't fall off the edge since the last
        # index file row will have row1 exactly equal to the table length.
        row1 = min(row['row1'], len(table) - 1)
        tstop = table[row1]

    out = {}
    msids = list(fetch.all_colnames[content] - set(fetch.IGNORE_COLNAMES))

    # Get dict of last sync repo row for each MSID.  This is keyed as {msid: last_row1},
    # where row1 is (as always) the slice row1.
    last_rows_filename = sync_files['last_rows'].abs
    if Path(last_rows_filename).exists():
        logger.verbose(f'Reading {last_rows_filename}')
        last_rows = pickle.load(open(last_rows_filename, 'rb'))
    else:
        last_rows = {}

    # Go through each MSID and get the raw HDF5 table data corresponding to the
    # time range tstart:tstop found above.
    n_rows_set = set()
    n_msids = 0
    for msid in msids:
        last_row1 = last_rows.get(msid)
        ft['msid'] = msid
        filename = fetch.msid_files['stats'].abs
        if not Path(filename).exists():
            logger.debug(f'No {stat} stat data for {msid} - skipping')
            continue

        n_msids += 1
        stat_rows, row0, row1 = _get_stat_data_from_archive(
            filename, stat, tstart, tstop, last_row1, logger)
        logger.verbose(f'Got stat rows {row0} {row1} for stat {stat} {msid}')
        n_rows_set.add(row1 - row0)
        if row1 > row0:
            out[f'{msid}.data'] = stat_rows
            out[f'{msid}.row0'] = row0
            out[f'{msid}.row1'] = row1
            last_rows[msid] = row1

    n_rows = n_rows_set.pop() if len(n_rows_set) == 1 else n_rows_set

    outfile.parent.mkdir(exist_ok=True, parents=True)
    # TODO: increase compression to max (gzip?)
    logger.info(
        f'Writing {outfile} with {n_rows} rows of data and {n_msids} msids')
    with gzip.open(outfile, 'wb') as fh:
        pickle.dump(out, fh)

    # Save the row1 value for each MSID to use as row0 for the next update
    logger.verbose(f'Writing {last_rows_filename}')
    with open(last_rows_filename, 'wb') as fh:
        pickle.dump(last_rows, fh)
def append_h5_col(opt, msid, vals, logger, msid_files):
    """Append new values to an HDF5 MSID data table.

    :param opt:
    :param msid:
    :param vals: dict with `data`, `quality`, `row0` and `row1` keys
    :param logger:
    :param msid_files:
    """
    fetch.ft['msid'] = msid

    msid_file = Path(msid_files['msid'].abs)
    if not msid_file.exists():
        logger.debug(f'Skipping MSID update no {msid_file}')
        return

    mode = 'r' if opt.dry_run else 'a'
    with tables.open_file(str(msid_file), mode=mode) as h5:
        # If the vals[] data begins before the end of current data then chop the
        # beginning of data for this row.
        last_row_idx = len(h5.root.data) - 1
        if vals['row0'] <= last_row_idx:
            idx0 = last_row_idx + 1 - vals['row0']
            logger.debug(f'Chopping {idx0} rows from data')
            for key in ('data', 'quality'):
                vals[key] = vals[key][idx0:]
            vals['row0'] += idx0

        n_vals = len(vals['data'])
        logger.verbose(f'Appending {n_vals} rows to {msid_file}')

        # Normally at this point there is always data to append since we got here
        # by virtue of the TIME.h5 file being incomplete relative to available sync
        # data.  However, user might have manually rsynced a file as part of adding
        # a new MSID, in which case it might be up to date and there is no req'd action.
        if n_vals == 0:
            return

        if vals['row0'] != len(h5.root.data):
            raise RowMismatchError(
                f'ERROR: unexpected discontinuity for full msid={msid} '
                f'content={fetch.ft["content"]}\n'
                f'Looks like your archive is in a bad state, CONTACT '
                f'your local Ska expert with this info:\n'
                f'  First row0 in new data {vals["row0"]} != '
                f'length of existing data {len(h5.root.data)}')

        # For the TIME column include special processing to effectively remove
        # existing rows that are superseded by new rows in time.  This is done by
        # marking the TIME value as bad quality.  This process happens regularly
        # for ephemeris content, which gets updated once weekly and has substantial
        # overlaps in the archive data.  Here we only worry about the beginning of
        # new data because anything in the middle will have already been marked
        # bad by update_archive.py.
        if msid == 'TIME':
            time0 = vals['data'][0]
            idx1 = len(h5.root.data) - 1
            ii = 0
            while h5.root.data[idx1 - ii] - time0 > -0.0001:
                h5.root.quality[idx1 - ii] = True
                ii += 1
            if ii > 0:
                logger.verbose(f'Excluded {ii} rows due to overlap')

        if not opt.dry_run:
            h5.root.data.append(vals['data'])
            h5.root.quality.append(vals['quality'])
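# Small NumPy-only sketch (made-up values) of the TIME overlap handling above: any
# existing time at or after the first new time gets its quality flag set to True so
# the superseded rows are ignored, mirroring the while loop in append_h5_col.
import numpy as np

existing_times = np.array([100.0, 200.0, 300.0, 400.0])
quality = np.zeros(len(existing_times), dtype=bool)
new_times = np.array([300.0, 400.0, 500.0])     # overlaps the last two existing rows

time0 = new_times[0]
idx1 = len(existing_times) - 1
ii = 0
while existing_times[idx1 - ii] - time0 > -0.0001:
    quality[idx1 - ii] = True                   # mark superseded row as bad quality
    ii += 1

print(quality)                                  # -> [False False  True  True]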
def update_sync_data_full(content, logger, row):
    """
    Update full-resolution sync data including archfiles for index table ``row``

    This generates a gzipped pickle file with a dict that has sync update values
    for all available MSIDs in this chunk of ``content`` telemetry.  This has
    `archfiles` (structured ndarray of rows) to store archfiles rows and then
    {msid}.quality, {msid}.data, {msid}.row0 and {msid}.row1.

    :param content: content type
    :param logger: global logger
    :param row: archfile row
    :return: None
    """
    ft = fetch.ft
    ft['interval'] = 'full'

    outfile = Path(sync_files['data'].abs)
    if outfile.exists():
        logger.verbose(f'Skipping {outfile}, already exists')
        return

    out = {}
    msids = list(fetch.all_colnames[content]) + ['TIME']

    # row['filetime0'] and row['filetime1'] are the *inclusive* `filetime` stamps
    # for the archfiles to be included in this row.  They do not overlap, so
    # the selection below must be equality.
    with DBI(dbi='sqlite', server=fetch.msid_files['archfiles'].abs) as dbi:
        query = (f'select * from archfiles '
                 f'where filetime >= {row["filetime0"]} '
                 f'and filetime <= {row["filetime1"]} '
                 f'order by filetime ')
        archfiles = dbi.fetchall(query)
        out['archfiles'] = archfiles

    # Row slice indexes into full-resolution MSID h5 files.  All MSIDs share the
    # same row0:row1 range.
    row0 = row['row0']
    row1 = row['row1']

    # Go through each MSID and collect values
    n_msids = 0
    for msid in msids:
        ft['msid'] = msid
        filename = fetch.msid_files['msid'].abs
        if not Path(filename).exists():
            logger.debug(f'No MSID file for {msid} - skipping')
            continue

        n_msids += 1
        with tables.open_file(filename, 'r') as h5:
            out[f'{msid}.quality'] = h5.root.quality[row0:row1]
            out[f'{msid}.data'] = h5.root.data[row0:row1]
            out[f'{msid}.row0'] = row0
            out[f'{msid}.row1'] = row1

    n_rows = row1 - row0
    logger.info(
        f'Writing {outfile} with {n_rows} rows of data and {n_msids} msids')

    outfile.parent.mkdir(exist_ok=True, parents=True)
    # TODO: increase compression to max (gzip?)
    with gzip.open(outfile, 'wb') as fh:
        pickle.dump(out, fh)
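# Hedged sketch of how a client could read back a file written by
# update_sync_data_full.  The path is a placeholder; the dict keys
# ('archfiles', '{msid}.data', '{msid}.quality', '{msid}.row0', '{msid}.row1')
# follow the docstring above, and TIME is always included in ``msids``.
import gzip
import pickle

with gzip.open('data.pkl.gz', 'rb') as fh:      # placeholder file name
    out = pickle.load(fh)

archfiles = out['archfiles']                    # structured ndarray of archfiles rows
msid = 'TIME'
data = out[f'{msid}.data']
quality = out[f'{msid}.quality']
row0, row1 = out[f'{msid}.row0'], out[f'{msid}.row1']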
def get_cmds(start, stop, mp_dir='/data/mpcrit1/mplogs'):
    """
    Get backstop commands corresponding to the supplied timeline load segments.
    The timeline load segments must be ordered by 'id'.

    Return cmds in the format defined by Ska.ParseCM.read_backstop().
    """
    # Get timeline_loads within date range.  Also get non-load commands
    # within the date range covered by the timelines.
    server = os.path.join(os.environ['SKA'], 'data', 'cmd_states', 'cmd_states.db3')
    with Ska.DBI.DBI(dbi='sqlite', server=server) as db:
        timeline_loads = db.fetchall("""SELECT * from timeline_loads
                                        WHERE datestop > '{}' AND datestart < '{}'
                                        ORDER BY id"""
                                     .format(start.date, stop.date))

        # Get non-load commands (from autonomous or ground SCS107, NSM, etc) in the
        # time range that the timelines span.
        tl_datestart = min(timeline_loads['datestart'])
        nl_cmds = db.fetchall('SELECT * from cmds where timeline_id IS NULL and '
                              'date >= "{}" and date <= "{}"'
                              .format(tl_datestart, stop.date))

        # Private method from cmd_states.py fetches the actual int/float param values
        # and returns list of dict.
        nl_cmds = _tl_to_bs_cmds(nl_cmds, None, db)
        nl_cmds = fix_nonload_cmds(nl_cmds)
        logger.info(f'Found {len(nl_cmds)} non-load commands between {tl_datestart} : {stop.date}')

    logger.info('Found {} timelines included within {} to {}'
                .format(len(timeline_loads), start.date, stop.date))

    if np.min(np.diff(timeline_loads['id'])) < 1:
        raise ValueError('Timeline loads id not monotonically increasing')

    cmds = []
    orbit_cmds = []
    orbit_cmd_files = set()

    for tl in timeline_loads:
        bs_file = Ska.File.get_globfiles(os.path.join(mp_dir + tl.mp_dir,
                                                      '*.backstop'))[0]
        if bs_file not in BACKSTOP_CACHE:
            bs_cmds = read_backstop(bs_file)
            logger.info('Read {} commands from {}'.format(len(bs_cmds), bs_file))
            BACKSTOP_CACHE[bs_file] = bs_cmds
        else:
            bs_cmds = BACKSTOP_CACHE[bs_file]

        # Process ORBPOINT (orbit event) pseudo-commands in backstop.  These
        # have scs=0 and need to be treated separately since during a replan
        # or shutdown we still want these ORBPOINT to be in the cmds archive
        # and not be excluded by timeline intervals.
        if bs_file not in orbit_cmd_files:
            bs_orbit_cmds = [x for x in bs_cmds if x['type'] == 'ORBPOINT']
            for orbit_cmd in bs_orbit_cmds:
                orbit_cmd['timeline_id'] = tl['id']
                if 'EVENT_TYPE' not in orbit_cmd['params']:
                    orbit_cmd['params']['EVENT_TYPE'] = orbit_cmd['params']['TYPE']
                    del orbit_cmd['params']['TYPE']
            orbit_cmds.extend(bs_orbit_cmds)
            orbit_cmd_files.add(bs_file)

        # Only store commands for this timeline (match SCS and date)
        bs_cmds = [x for x in bs_cmds
                   if tl['datestart'] <= x['date'] <= tl['datestop']
                   and x['scs'] == tl['scs']]

        for bs_cmd in bs_cmds:
            bs_cmd['timeline_id'] = tl['id']

        logger.info(' Got {} backstop commands for timeline_id={} and SCS={}'
                    .format(len(bs_cmds), tl['id'], tl['scs']))
        cmds.extend(bs_cmds)

    orbit_cmds = get_unique_orbit_cmds(orbit_cmds)
    logger.debug('Read total of {} orbit commands'.format(len(orbit_cmds)))

    cmds.extend(nl_cmds)
    cmds.extend(orbit_cmds)

    # Sort by date and SCS step number.
    cmds = sorted(cmds, key=lambda y: (y['date'], y['step']))
    logger.debug('Read total of {} commands ({} non-load commands)'
                 .format(len(cmds), len(nl_cmds)))

    return cmds
def update_index_file(index_file, opt, logger):
    """Update the top-level index file of data available in the sync archive

    :param index_file: Path of index ECSV file
    :param opt: options
    :param logger: output logger
    :return: index table (astropy Table)
    """
    if index_file.exists():
        # Start time of last update contained in the sync repo (if it exists), but do not look
        # back more than max_lookback days.  This is relevant for rarely sampled
        # content like cpe1eng.
        filetime0 = (DateTime(opt.date_stop) - opt.max_lookback).secs

        index_tbl = Table.read(index_file)
        if len(index_tbl) == 0:
            # Need to start with a fresh index_tbl since the string column will end up
            # with a length=1 string (date_id) and add_row later will give the wrong result.
            index_tbl = None
        else:
            filetime0 = max(filetime0, index_tbl['filetime1'][-1])
    else:
        # For initial index file creation use the --date-start option
        index_tbl = None
        filetime0 = DateTime(opt.date_start).secs

    max_secs = int(opt.max_days * 86400)
    time_stop = DateTime(opt.date_stop).secs

    # Step through the archfile files entries and collect them into groups of up
    # to --max-days based on file time stamp (which is an integer in CXC secs).
    rows = []
    filename = fetch.msid_files['archfiles'].abs
    logger.debug(f'Opening archfiles {filename}')
    with DBI(dbi='sqlite', server=filename) as dbi:
        while True:
            filetime1 = min(filetime0 + max_secs, time_stop)
            logger.verbose(f'select from archfiles '
                           f'filetime > {DateTime(filetime0).fits[:-4]} {filetime0} '
                           f'filetime <= {DateTime(filetime1).fits[:-4]} {filetime1} ')
            archfiles = dbi.fetchall(f'select * from archfiles '
                                     f'where filetime > {filetime0} '
                                     f'and filetime <= {filetime1} '
                                     f'order by filetime ')

            # Found new archfiles?  If so get a new index table row for them.
            if len(archfiles) > 0:
                rows.append(get_row_from_archfiles(archfiles))
                filedates = DateTime(archfiles['filetime']).fits
                logger.verbose(f'Got {len(archfiles)} archfiles rows from '
                               f'{filedates[0]} to {filedates[-1]}')

            filetime0 = filetime1

            # Stop if already queried out to the end of desired time range
            if filetime1 >= time_stop:
                break

    if not rows:
        logger.info(f'No updates available for content {fetch.ft["content"]}')
        return index_tbl

    # Create table from scratch or add new rows.  In normal processing there
    # will just be one row per run.
    if index_tbl is None:
        index_tbl = Table(rows)
    else:
        for row in rows:
            index_tbl.add_row(row)

    if not index_file.parent.exists():
        logger.info(f'Making directory {index_file.parent}')
        index_file.parent.mkdir(exist_ok=True, parents=True)

    msg = check_index_tbl_consistency(index_tbl)
    if msg:
        msg += '\n'
        msg += '\n'.join(index_tbl.pformat(max_lines=-1, max_width=-1))
        logger.error(f'Index table inconsistency: {msg}')
        return None

    logger.info(f'Writing {len(rows)} row(s) to index file {index_file}')
    index_tbl.write(index_file, format='ascii.ecsv')

    return index_tbl
def add_h5_cmds(h5file, idx_cmds):
    """
    Add `idx_cmds` to HDF5 file `h5file` of indexed spacecraft commands.
    If file does not exist then create it.
    """
    # Note: reading this file uncompressed is about 5 times faster, so sacrifice file size
    # for read speed and do not use compression.
    h5 = tables.open_file(h5file, mode='a')

    # Convert cmds (list of tuples) to numpy structured array.  This also works for an
    # existing structured array.
    cmds = np.array(idx_cmds, dtype=CMDS_DTYPE)

    # TODO: make sure that changes in non-load commands triggers an update

    try:
        h5d = h5.root.data
        logger.info('Opened h5 cmds table {}'.format(h5file))
    except tables.NoSuchNodeError:
        h5.create_table(h5.root, 'data', cmds, "cmds", expectedrows=2e6)
        logger.info('Created h5 cmds table {}'.format(h5file))
    else:
        date0 = min(idx_cmd[1] for idx_cmd in idx_cmds)
        h5_date = h5d.cols.date[:]
        idx_recent = np.searchsorted(h5_date, date0)
        logger.info('Selecting commands from h5d[{}:]'.format(idx_recent))
        logger.info(' {}'.format(str(h5d[idx_recent])))
        h5d_recent = h5d[idx_recent:]  # recent h5d entries

        # Define the column names that specify a complete and unique row
        key_names = ('date', 'type', 'tlmsid', 'scs', 'step', 'timeline_id', 'vcdu')

        h5d_recent_vals = [tuple(
            row[x].decode('ascii') if isinstance(row[x], bytes) else str(row[x])
            for x in key_names)
            for row in h5d_recent]

        idx_cmds_vals = [tuple(str(x) for x in row[1:]) for row in idx_cmds]

        diff = difflib.SequenceMatcher(a=h5d_recent_vals, b=idx_cmds_vals, autojunk=False)
        blocks = diff.get_matching_blocks()
        logger.info('Matching blocks for existing HDF5 and timeline commands')
        for block in blocks:
            logger.info(' {}'.format(block))
        opcodes = diff.get_opcodes()
        logger.info('Diffs between existing HDF5 and timeline commands')
        for opcode in opcodes:
            logger.info(' {}'.format(opcode))

        # Find the first matching block that is sufficiently long
        for block in blocks:
            if block.size > MIN_MATCHING_BLOCK_SIZE:
                break
        else:
            raise ValueError('No matching blocks at least {} long'
                             .format(MIN_MATCHING_BLOCK_SIZE))

        # Index into idx_cmds at the end of the large matching block.  block.b is the
        # beginning of the match.
        idx_cmds_idx = block.b + block.size

        if idx_cmds_idx < len(cmds):
            # Index into h5d at the point of the first diff after the large matching block
            h5d_idx = block.a + block.size + idx_recent

            if h5d_idx < len(h5d):
                logger.debug('Deleted relative cmds indexes {} .. {}'
                             .format(h5d_idx - idx_recent, len(h5d) - idx_recent))
                logger.debug('Deleted cmds indexes {} .. {}'.format(h5d_idx, len(h5d)))
                h5d.truncate(h5d_idx)

            h5d.append(cmds[idx_cmds_idx:])
            logger.info('Added {} commands to HDF5 cmds table'.format(len(cmds[idx_cmds_idx:])))
        else:
            logger.info('No new timeline commands, HDF5 cmds table not updated')

    h5.flush()
    logger.info('Updated HDF5 cmds table {}'.format(h5file))
    h5.close()
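# Self-contained sketch (toy key tuples, standard difflib only) of how the
# matching-block logic above finds the splice point between existing HDF5 rows
# and incoming commands.
import difflib

existing = [('2022:001', 'CMD_A'), ('2022:002', 'CMD_B'), ('2022:003', 'CMD_C')]
incoming = [('2022:002', 'CMD_B'), ('2022:003', 'CMD_C'), ('2022:004', 'CMD_D')]

diff = difflib.SequenceMatcher(a=existing, b=incoming, autojunk=False)
block = diff.get_matching_blocks()[0]           # Match(a=1, b=0, size=2)

# Existing rows from block.a + block.size onward would be truncated, and
# incoming rows from block.b + block.size onward appended.
print(block)
print(incoming[block.b + block.size:])          # [('2022:004', 'CMD_D')]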
def add_h5_cmds(h5file, idx_cmds):
    """
    Add `idx_cmds` to HDF5 file `h5file` of indexed spacecraft commands.
    If file does not exist then create it.
    """
    # Note: reading this file uncompressed is about 5 times faster, so sacrifice file size
    # for read speed and do not use compression.
    h5 = tables.openFile(h5file, mode='a')

    # Convert cmds (list of tuples) to numpy structured array.  This also works for an
    # existing structured array.
    cmds = np.array(idx_cmds, dtype=CMDS_DTYPE)

    try:
        h5d = h5.root.data
        logger.info('Opened h5 cmds table {}'.format(h5file))
    except tables.NoSuchNodeError:
        h5.createTable(h5.root, 'data', cmds, "cmds", expectedrows=2e6)
        logger.info('Created h5 cmds table {}'.format(h5file))
    else:
        date0 = min(idx_cmd[1] for idx_cmd in idx_cmds)
        h5_date = h5d.cols.date[:]
        idx_recent = np.searchsorted(h5_date, date0)
        logger.info('Selecting commands from h5d[{}:]'.format(idx_recent))
        logger.info(' {}'.format(str(h5d[idx_recent])))
        h5d_recent = h5d[idx_recent:]  # recent h5d entries

        # Define the column names that specify a complete and unique row
        key_names = ('date', 'type', 'tlmsid', 'scs', 'step', 'timeline_id')

        h5d_recent_vals = [tuple(str(row[x]) for x in key_names) for row in h5d_recent]
        idx_cmds_vals = [tuple(str(x) for x in row[1:]) for row in idx_cmds]

        diff = difflib.SequenceMatcher(a=h5d_recent_vals, b=idx_cmds_vals, autojunk=False)
        blocks = diff.get_matching_blocks()
        logger.info('Matching blocks for existing HDF5 and timeline commands')
        for block in blocks:
            logger.info(' {}'.format(block))
        opcodes = diff.get_opcodes()
        logger.info('Diffs between existing HDF5 and timeline commands')
        for opcode in opcodes:
            logger.info(' {}'.format(opcode))

        # Find the first matching block that is sufficiently long
        for block in blocks:
            if block.size > MIN_MATCHING_BLOCK_SIZE:
                break
        else:
            raise ValueError('No matching blocks at least {} long'
                             .format(MIN_MATCHING_BLOCK_SIZE))

        # Index into idx_cmds at the end of the large matching block.  block.b is the
        # beginning of the match.
        idx_cmds_idx = block.b + block.size

        if idx_cmds_idx < len(cmds):
            # Index into h5d at the point of the first diff after the large matching block
            h5d_idx = block.a + block.size + idx_recent

            if h5d_idx < len(h5d):
                logger.debug('Deleted relative cmds indexes {} .. {}'
                             .format(h5d_idx - idx_recent, len(h5d) - idx_recent))
                logger.debug('Deleted cmds indexes {} .. {}'.format(h5d_idx, len(h5d)))
                h5d.truncate(h5d_idx)

            h5d.append(cmds[idx_cmds_idx:])
            logger.info('Added {} commands to HDF5 cmds table'.format(len(cmds[idx_cmds_idx:])))
        else:
            logger.info('No new timeline commands, HDF5 cmds table not updated')

    h5.flush()
    logger.info('Updated HDF5 cmds table {}'.format(h5file))
    h5.close()