def _walk_files(self, root_name):
    rule_type = 'self_tree'
    for dir_name, subdirs, files in os.walk(root_name):
        for file_name in files:
            if self.config.signalled():
                logging.info("signal set, leaving tp._walk_files")
                return
            done_with_file = False
            finfo = None
            processed_im_root_file = False
            full = dir_name + '/' + file_name
            if self._pass == 0:
                # might have metadata from an input mgr download
                if not dir_name.startswith(self.config.input):
                    raise Exception("logic error pass 0")
                i_im = 0
                for im in self.input_mgrs:
                    finfo = im.get_downloaded_finfo(full)
                    if finfo:
                        if 'rules_run' in finfo and finfo['rules_run']:
                            done_with_file = True
                        else:
                            processed_im_root_file = True
                            finfo['source_im'] = i_im
                            self.track_file(finfo)
                        break
                    i_im += 1
                if not finfo:
                    # this should be a file unpacked from a downloaded file
                    if full in self.file_info:
                        finfo = self.file_info[full]
                    else:
                        finfo = u.local_metadata(dir_name, file_name)
                        self.track_file(finfo)
            elif full in self.file_info:
                finfo = self.file_info[full]
            else:
                # new file in output, created by an action
                finfo = u.local_metadata(dir_name, file_name)
                self.track_file(finfo)
            if self._always_unpack:
                self._unpack_if_archive(finfo)
            self._file_action(rule_type, finfo)
            if processed_im_root_file:
                Config.log(finfo['full'], tag='TP_IM_ROOT_FILE_PROCESSED')
                self._root_files_processed += 1
                if self._root_files_processed >= self._root_file_limit:
                    msg = "limit on root files processed per run (%i) reached." % self._root_file_limit
                    logging.info(msg)
                    self.config.add_to_final_summary(msg)
                    return
def copy_with_metadata(self, finfo, dest):
    (dest_dir, dest_file) = os.path.split(dest)
    # prohibit destinations outside our output root
    rel_path = u.make_rel_path(self.config.output, dest_dir, strict=False)
    if not rel_path:
        logging.warning(
            "disallowing copy dest '%s', outside of output_root '%s'" %
            (dest_dir, self.config.output))
        return None
    # no self-copy
    if dest == finfo['full']:
        logging.warning("not copying file '%s' onto itself!" % dest)
        return None
    u.ensure_path(dest_dir)
    Config.log("%s to %s" % (finfo['full'], dest), tag='COPY_WITH_METADATA')
    if not os.path.exists(finfo['full']):
        msg = "file '%s' does not exist" % finfo['full']
        Config.log(msg, tag='COPY_WITH_METADATA_ERROR')
        return None
    shutil.copyfile(finfo['full'], dest)
    # set metadata for the new file
    local = u.local_metadata(dest_dir, dest_file)
    newfi = copy.deepcopy(finfo)  # TODO replace with unified metadata-copy system
    newfi['name'] = dest_file
    newfi['path'] = dest_dir
    newfi['full'] = dest
    newfi['size'] = local['size']
    newfi['modified'] = local['modified']
    # clear transient metadata not applicable to the new file
    u.remove_no_copy_metadata(newfi)
    newfi['rules_run'] = False
    return newfi
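# Illustrative sketch (not part of the original module): the copy-with-metadata
# pattern above in self-contained form. Duplicate a file, clone its metadata dict,
# then overwrite only the fields tied to the new location. The finfo keys mirror
# the ones used in this module; the helper name and os/shutil calls below are
# assumptions for illustration, not the project's u.* API.
import copy
import os
import shutil

def copy_finfo_sketch(finfo, dest):
    dest_dir, dest_name = os.path.split(dest)
    if dest_dir:
        os.makedirs(dest_dir, exist_ok=True)
    shutil.copyfile(finfo['full'], dest)
    newfi = copy.deepcopy(finfo)
    newfi.update({
        'name': dest_name,
        'path': dest_dir,
        'full': dest,
        'size': os.path.getsize(dest),
        'modified': os.path.getmtime(dest),
        'rules_run': False,   # the new copy has not had rules applied yet
    })
    return newfi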
def default_template_file_action(self, dir_name, file_name, dest_rel_path=None, dest_name=None):
    template_full = dir_name + '/' + file_name
    Config.log("default_template_file_action '%s'" % template_full,
               tag='DEFAULT_TEMPLATE_FILE_ACTION')
    if dest_name:
        rel_path = dest_rel_path
        dest_path = u.pathify(self.output_root, dest_rel_path)
    else:
        rel_path = u.make_rel_path(self.site_root, dir_name)
        dest_path = u.pathify(self.output_root, rel_path)
        dest_name = file_name
    u.ensure_path(dest_path)
    dest_full = u.pathify(dest_path, dest_name)
    info = {
        'name': dest_name,
        'path': dest_path,
        'rel_path': rel_path,
        'full': dest_full,
        'key': u.make_key(rel_path, dest_name)
    }
    if self.config.is_template_type(file_name):
        template = open(template_full).read()
        output = u.debracket(template, self.interpret)
        if not self.config.is_special_file(info['key']):
            open(dest_full, 'w').write(output)
            local = u.local_metadata(dest_path, dest_name)
            info['size'] = local['size']
            info['modified'] = local['modified']
            info['md5'] = u.md5(dest_full)
            self.track_file(info)
    else:
        shutil.copyfile(template_full, dest_full)
        local = u.local_metadata(dest_path, dest_name)
        info['size'] = local['size']
        info['modified'] = local['modified']
        info['md5'] = u.md5(dest_full)
        self.track_file(info)
def metadata_from_local(self, clear_first=False):
    if clear_first:
        self._local_files.clear()
    # ftp does not look at subfolders
    for file_name in u.plain_files(self._dest_root, '.'):
        if file_name in self._local_files:
            continue  # no overwrite
        finfo = u.local_metadata(self._dest_root, file_name)
        self._local_files[finfo['name']] = finfo
    self.write_metadata()
    msg = "im_meta_from_local completed for im '%s'" % self._label
    Config.log(msg, tag='FTP_META_FROM_LOCAL')
def metadata_from_local(self, clear_first=False):
    if clear_first:
        self._local_files.clear()
    for tag, settings in self._url_patterns.items():
        for dir_name, subdirs, files in os.walk(settings['path']):
            for file_name in files:
                key_path = self._full_path_to_file_key_path(dir_name)
                file_key = os.path.join(key_path, file_name)
                if file_key in self._local_files:
                    continue
                finfo = u.local_metadata(dir_name, file_name)
                finfo['file_key'] = file_key
                self._local_files[file_key] = finfo
    self.write_metadata()
    msg = "im_meta_from_local completed for im 'url'"
    Config.log(msg, tag='URL_META_FROM_LOCAL')
def get_tree_info(self, bucket=None, remote_root=''):
    self.tree_info.clear()
    for dir_name, subdirs, files in os.walk(self._file_dest_root):
        for file_name in files:
            full_src = dir_name + '/' + file_name
            rel_path = u.make_rel_path(self._file_dest_root, dir_name,
                                       strict=False, no_leading_slash=True)
            local_meta = u.local_metadata(dir_name, file_name)
            local_meta['key'] = u.make_key(rel_path, file_name)
            local_meta['md5'] = u.md5(full_src)
            # important: must never expose 'full' outside this class - it's a private
            # implementation detail. Same for 'path'. Only 'key' is public
            del local_meta['full']
            del local_meta['path']
            self.tree_info[local_meta['key']] = local_meta
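# Illustrative sketch (not part of the original module) of the rel_path/key
# convention assumed above: a key is the path relative to the dest root with no
# leading slash plus the file name, so it is safe to expose while 'full' and
# 'path' stay private. These helpers only approximate what u.make_rel_path and
# u.make_key appear to do; the real u.* signatures are not shown in this file.
import os

def make_rel_path_sketch(root, dir_name):
    rel = os.path.relpath(dir_name, root)
    return '' if rel == '.' else rel

def make_key_sketch(rel_path, file_name):
    return file_name if not rel_path else rel_path + '/' + file_name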
def track_file(self, finfo):
    full = finfo['full']
    Config.log(full, tag='TP_TRACK_FILE')
    if 'md5' not in finfo:
        finfo['md5'] = u.md5(finfo['full'])
    if full in self.file_info:
        # don't replace finfo unless the file has actually changed
        old_finfo = self.file_info[full]
        if finfo['md5'] == old_finfo['md5']:
            Config.log(full, tag='TP_TRACK_FILE_UNCHANGED')
            return
    self.file_info[full] = finfo
    if self._track_file_callback and self.will_upload(finfo['full']):
        if 'rel_path' not in finfo:
            finfo['rel_path'] = u.make_rel_path(self.config.output, finfo['path'],
                                                no_leading_slash=True)
        if 'key' not in finfo:
            finfo['key'] = u.make_key(finfo['rel_path'], finfo['name'])
        if ('size' not in finfo) or ('modified' not in finfo):
            logging.error('track_file (%s): finfo missing size and/or modified' % finfo['full'])
            tmp = u.local_metadata(finfo['path'], finfo['name'])
            finfo['size'] = tmp['size']
            finfo['modified'] = tmp['modified']
        self._track_file_callback(finfo)
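# Self-contained sketch (not part of the original module) of the md5-based change
# detection used by track_file and upload_tree: hash the file content and compare
# against a cached digest, only treating the file as changed when the digests differ.
# The helper names here are illustrative stand-ins, not the project's u.md5 API.
import hashlib

def file_md5_sketch(path, chunk_size=65536):
    """Hash a file in chunks so large files don't have to fit in memory."""
    h = hashlib.md5()
    with open(path, 'rb') as f:
        for chunk in iter(lambda: f.read(chunk_size), b''):
            h.update(chunk)
    return h.hexdigest()

def has_changed_sketch(path, cached_md5):
    """True when there is no cached digest or the file content no longer matches it."""
    return not cached_md5 or file_md5_sketch(path) != cached_md5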
def sync_to_upstream_dest_mgr(self, upstream, refresh_me, refresh_upstream, tmp_folder, fixer=None):
    msg = "refresh_me = %s, refresh_upstream = %s, upstream root = '%s', my root = '%s', tmp_folder = %s" % \
        (refresh_me, refresh_upstream, upstream._file_dest_root, self._file_dest_root, tmp_folder)
    Config.log(msg, tag='FILE_DEST_SYNC_TO_UPSTREAM_STARTING')
    do_refresh = {'refresh_dest_meta': True}
    dont_refresh = {'refresh_dest_meta': False}
    smart_refresh = {'refresh_dest_meta': True, 'skip_refresh_if_tree_unchanged': True}
    if refresh_me == 'full':
        self.sync_tree_info(options=do_refresh)
    elif refresh_me == 'smart':
        self.sync_tree_info(options=smart_refresh)
    else:
        self.sync_tree_info(options=dont_refresh)
    if refresh_upstream == 'full':
        upstream.sync_tree_info(options=do_refresh)
    elif refresh_upstream == 'smart':
        upstream.sync_tree_info(options=smart_refresh)
    else:
        upstream.sync_tree_info(options=dont_refresh)
    u.clear_folder(tmp_folder)
    start = u.timestamp_now()
    for key, finfo in upstream.tree_info_items():
        src = os.path.join(upstream._file_dest_root, key)
        if not os.path.exists(src):
            msg = "file '%s' does not exist" % src
            Config.log(msg, tag='FILE_DEST_SYNC_TO_UPSTREAM_METADATA_ERROR')
            continue
        if self.config.is_template_type(finfo['name']) and fixer:
            # copy and fix up
            dest = os.path.join(tmp_folder, key)
            u.ensure_path_for_file(dest)
            shutil.copyfile(src, dest)
            fixer(dest)
            # make new metadata and copy to self
            newfi = copy.deepcopy(finfo)
            path, file = os.path.split(dest)
            local_meta = u.local_metadata(path, file)
            newfi['size'] = local_meta['size']
            newfi['modified'] = local_meta['modified']
            newfi['md5'] = u.md5(dest)
            self.tree_info[key] = newfi
            src = dest
            dest = os.path.join(self._file_dest_root, key)
            u.ensure_path_for_file(dest)
            shutil.copyfile(src, dest)
            # we could remove the fixed-up file now, but clear_folder at the end is probably faster
        else:
            # file not subject to fixup: copy only if missing, older, or different md5/size
            copyit = False
            if key in self.tree_info:
                # compare metadata to decide whether to copy
                myfi = self.tree_info[key]
                if myfi['md5'] != finfo['md5'] or \
                        myfi['modified'] < finfo['modified'] or \
                        myfi['size'] != finfo['size']:
                    copyit = True
            else:
                copyit = True
            if copyit:
                # REVIEW - deepcopy probably safe here because we're copying from
                # one dest mgr to another
                self.tree_info[key] = copy.deepcopy(finfo)
                dest = os.path.join(self._file_dest_root, key)
                u.ensure_path_for_file(dest)
                shutil.copyfile(src, dest)
    # delete from me if not in upstream
    to_delete = {}
    for key, finfo in self.tree_info_items():
        if key not in upstream.tree_info:
            to_delete[key] = os.path.join(self._file_dest_root, finfo['key'])
    for key, full in to_delete.items():
        os.remove(full)
        del self.tree_info[key]
    self.write_tree_info()
    # clearing tmp_folder here would save space, but the savings should be small, and it
    # might be handy to have the files around for debug. could be a config option.
    # u.clear_folder(tmp_folder)
    elapsed = u.timestamp_now() - start
    msg = "done, elapsed %f seconds" % elapsed
    Config.log(msg, tag='FILE_DEST_SYNC_TO_UPSTREAM_FINISHED')
def sync_tree_info(self, options=None):
    if self._synced_tree_info:
        Config.log(self.config_section, tag='FILE_DEST_META_ALREADY_SYNCED')
        return
    start = u.timestamp_now()
    logging.info("starting FileDest metadata sync")
    if options is None:
        options = self._default_sync_options
    # need to read the persisted file, as that's the only place non-file-system metadata can live
    self.read_tree_info()
    # determine whether to force a full refresh, refresh only if the tree has changed,
    # or simply trust the persisted metadata
    do_full_refresh = False
    if 'refresh_dest_meta' in options:
        do_full_refresh = options['refresh_dest_meta']
    # computing all those md5s takes a long time, so optionally skip it if the most
    # recent modification time for _file_dest_root is unchanged since we last did it
    if 'skip_refresh_if_tree_unchanged' in options:
        last_mod = u.dir_last_modified(self._file_dest_root)
        expected_last_mod = self._tree_last_modified
        do_full_refresh = last_mod != expected_last_mod
        msg = "do_full_refresh = '%s', last_mod = '%f', expected_last_mod = '%f'" % (
            do_full_refresh, last_mod, expected_last_mod)
        Config.log(msg, tag='FILE_DEST_META_TREE_UNCHANGED_TEST')
    if do_full_refresh:
        # physically walk the tree as it might not match persisted data
        for dir_name, subdirs, files in os.walk(self._file_dest_root):
            for file_name in files:
                full_src = dir_name + '/' + file_name
                setit = False
                rel_path = u.make_rel_path(self._file_dest_root, dir_name,
                                           strict=False, no_leading_slash=True)
                key = u.make_key(rel_path, file_name)
                local_meta = u.local_metadata(dir_name, file_name)
                local_meta['md5'] = u.md5(full_src)
                if key in self.tree_info:
                    saved_meta = self.tree_info[key]
                    if 'md5' not in saved_meta:
                        saved_meta['md5'] = 'ERROR! md5 MISSING FROM tree_info!'
                    if local_meta['md5'] == saved_meta['md5']:
                        # sanity check
                        if local_meta['size'] != saved_meta['size']:
                            msg = "key '%s', saved: size %i, read: size %i" % (
                                key, saved_meta['size'], local_meta['size'])
                            Config.log(msg, tag='FILE_DEST_META_ERROR_NONFATAL')
                        # otherwise the file is unchanged; nothing to update
                    else:
                        msg = "key '%s', md5 mismatch. saved: '%s', read: '%s'" % (
                            key, saved_meta['md5'], local_meta['md5'])
                        Config.log(msg, tag='FILE_DEST_META_ERROR_FATAL')
                        setit = True
                else:
                    msg = "key '%s' not found in saved, adding" % key
                    Config.log(msg, tag='FILE_DEST_META_NEW_FILE')
                    setit = True
                if setit:
                    local_meta['key'] = key
                    # important: must never expose 'full' outside this class - it's a private
                    # implementation detail. Same for 'path'. Only 'key' is public
                    del local_meta['full']
                    del local_meta['path']
                    self.tree_info[key] = local_meta
                self.tree_info[key]['_found_file_'] = True
        missing = []
        for key in self.tree_info:
            if '_found_file_' in self.tree_info[key]:
                del self.tree_info[key]['_found_file_']
            else:
                missing.append(key)
                msg = "no file matching key '%s', deleting" % key
                Config.log(msg, tag='FILE_DEST_META_MISSING')
        for key in missing:
            del self.tree_info[key]
        self.write_tree_info()
        act = "completed"
    else:
        # trust the persisted file (faster)
        act = "bypassed"
    elapsed = u.timestamp_now() - start
    msg = "%s confirmation of tree info in %f seconds" % (act, elapsed)
    Config.log(msg, tag='FILE_DEST_SYNC')
    self._synced_tree_info = True
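# Self-contained sketch (not part of the original module) of the
# skip_refresh_if_tree_unchanged idea above: find the newest mtime under a root and
# compare it with the value remembered from the previous sync; only re-hash the tree
# when they differ. This stand-in only approximates u.dir_last_modified, whose exact
# semantics are not shown in this file.
import os

def dir_last_modified_sketch(root):
    """Newest modification time of any file under root (0.0 if the tree is empty)."""
    latest = 0.0
    for dir_name, _subdirs, files in os.walk(root):
        for file_name in files:
            latest = max(latest, os.path.getmtime(os.path.join(dir_name, file_name)))
    return latest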
def upload_tree(self, local_root, remote_root='', options=None, local_tree_meta=None):
    logging.info("starting FileDest upload")
    if not options:
        options = {'use_md5': True}
    start = u.timestamp_now()
    self._upload_count = 0
    # refresh and save data for files already on dest
    self.sync_tree_info(options=self._default_sync_options)
    for dir_name, subdirs, files in os.walk(local_root):
        rel_path = u.make_rel_path(local_root, dir_name)
        if not rel_path.startswith('/tmp'):
            for file_name in files:
                local_file = dir_name + '/' + file_name
                key = u.make_key(rel_path, file_name)
                local_md5 = u.md5(local_file)
                local_meta = None
                if local_tree_meta and local_file in local_tree_meta:
                    local_meta = local_tree_meta[local_file]
                else:
                    local_meta = u.local_metadata(dir_name, file_name)
                size = local_meta['size']
                cached_info = None
                if key in self.tree_info:
                    cached_info = self.tree_info[key]
                do_upload = True
                if 'use_md5' in options:
                    if cached_info and not self.is_pending(key):
                        if 'md5' in self.tree_info[key]:
                            remote_md5 = self.tree_info[key]['md5']
                            do_upload = do_upload and remote_md5 != local_md5
                        else:
                            err = "no md5 value for existing key '%s' (old version?)" % key
                            logging.error(err)
                    else:
                        Config.log("file '%s' is not in FileDest" % key, tag='DEST_NO_EXISTING_FILE')
                if self._max_upload_size >= 0 and size > self._max_upload_size:
                    logging.debug("file '%s' size (%i) > limit (%i), won't upload" %
                                  (key, size, self._max_upload_size))
                    do_upload = False
                if do_upload:
                    extra_args = {
                        'Metadata': {'md5': local_md5}
                    }
                    logging.debug("FileDest object upload starting, key = '%s', %i bytes" % (key, size))
                    # time this upload separately so the overall elapsed time below stays correct
                    upload_start = u.timestamp_now()
                    self._upload(dir_name, file_name, key, extra_args=extra_args)
                    rate = size / (u.timestamp_now() - upload_start)
                    Config.log("key = '%s', %f bytes/sec" % (key, rate), tag='FILE_DEST_UPLOAD_OK')
                    # add metadata to our repos
                    info = {
                        'new': True,
                        'name': file_name,
                        'rel_path': rel_path,
                        'key': key,
                        'size': local_meta['size'],
                        'modified': local_meta['modified'],
                        # 'mod_dt': last_mod,
                        # 'e_tag': obj.e_tag,
                        'md5': local_md5
                    }
                    # transfer meta (e.g. thumbnail info) if it exists
                    if local_tree_meta and local_file in local_tree_meta:
                        self.transfer_metadata(local_tree_meta[local_file],
                                               local_root=self.local_root, dest=info)
                    self.tree_info[key] = info
                    self._upload_count += 1
                else:
                    Config.log("key = '%s'" % key, tag='FILE_DEST_UPLOAD_NO_CHANGE')
    self.write_tree_info()
    elapsed = u.timestamp_now() - start
    logging.info("FileDest.upload_tree finished in %f seconds, uploaded %i files" %
                 (elapsed, self._upload_count))
    return self._upload_count
def panoply(interp, finfo, argstr):
    Config.log("file '%s' argstr '%s'" % (finfo['full'], argstr), tag='PANOPLY')
    try:
        interpret = u.interpret_method(interp)
        argstr = u.debracket(argstr, interpret, finfo=finfo)
        args = u.parse_arg_string(argstr)
        if 'action' not in args:
            raise Exception("panoply: 'action' arg is required")
        if 'dest' not in args:
            raise Exception("panoply: 'dest' arg is required")
        action = args['action']
        dest = u.debracket(args['dest'], interpret, finfo=finfo)
        panoply_templates = Config.main.template_root + '/panoply'
        src = finfo['full']
        if not src.endswith('.nc'):
            logging.error("panoply command: '%s' is not a dataset file" % src)
            return
        ### TODO more copied stuff from copy_with_metadata!
        (dest_dir, dest_file) = os.path.split(dest)
        u.ensure_path(dest_dir)
        jar = 'PanoplyCL.jar'
        size = None
        if 'size' in args:
            size = int(args['size'])
        size_factor = _panoply_size_to_size_factor(size)
        template_file = panoply_templates + '/' + action + '.pclt'
        try:
            template = open(template_file).read()
        except Exception as exc:
            logging.error(
                "panoply command: error '%s' opening template file '%s'" %
                (str(exc), template_file))
            raise
        symbols = {
            'dataset': src,
            'output_file': dest,
            'size_factor': size_factor
        }
        script = u.debracket(template, interpret, symbols=symbols)
        script_path = Config.main.output + '/tmp'
        u.ensure_path(script_path)
        script_file = script_path + '/' + action + '.pcl'
        open(script_file, 'w').write(script)
        working_dir = Config.main.get('tools', 'panoply_workdir')
        if not os.path.isdir(working_dir):
            err = "panoply: invalid panoply_workdir '%s'" % working_dir
            logging.error(err)
            raise Exception(err)
        call_args = ['java', '-jar', jar, script_file]
        (returncode, stdout, stderr) = u.run_command(call_args, working_dir)
        logging.debug("returncode: %s\nstdout: %s\nstderr: %s" % (returncode, stdout, stderr))
        if returncode == 0:
            # we know the file that was created, so make its metadata now
            newfi = {}
            newfi['name'] = dest_file
            newfi['path'] = dest_dir
            newfi['full'] = dest
            newfi['rules_run'] = False
            tmp = u.local_metadata(newfi['path'], newfi['name'])
            newfi['size'] = tmp['size']
            newfi['modified'] = tmp['modified']
            return {'new_finfo': newfi}
        else:
            logging.error("panoply failed with rc '%i', stderr = '%s'" % (returncode, stderr))
    except Exception as exc:
        # py2: logging.error("panoply exception '%s'" % exc.message)
        logging.error("panoply exception '%s'" % str(exc))
def make_hybrid_forecast(webmaker, dest_key, work_item, list_args):
    try:
        Config.log("dest_key = '%s'" % dest_key, tag='HYBRID_FORECAST_START_' + dest_key)
        (dest_path, dest_name) = os.path.split(dest_key)
        finfos = work_item['roles']['default']
        if not len(finfos):
            msg = "logic error, no finfos on default list for dest_key '%s'" % dest_key
            raise Exception(msg)
        archive_root = webmaker.config.get('local', 'archive', return_none=True)
        if not archive_root:
            logging.error("make_hybrid_forecast: fail, no archive_root configured")
            return 'todo error1'
        local_store_start = u.datestr_to_daynum(webmaker.config.get('local', 'local_store_start'))
        history_days = webmaker.config.get_int('tools', 'hybrid_forecast_history_days', default=20)
        history_frame0s = []
        extract_frames = webmaker.config.plugin.get('extract_frames')
        combine_frames = webmaker.config.plugin.get('combine_frames')
        get_frame_count = webmaker.config.plugin.get('get_frame_count')
        # we need the current image for forecast frames; it should be the first finfo
        current_image = finfos[0]['full']
        if not current_image or not os.path.isfile(current_image):
            msg = "make_hybrid_forecast can't find expected current_image '%s'" % current_image
            logging.error(msg)
            return 'todo error2'
        frame_ct = get_frame_count(current_image)
        if frame_ct < 2:
            msg = "make_hybrid_forecast: current_image '%s' not multi-frame, taking no action" % current_image
            logging.warning(msg)
            return ''
        logging.info("got current_image file '%s'" % current_image)
        (current_path, current_name) = os.path.split(current_image)
        # look for frame0 files first in output, then archive
        roots = [webmaker.config.output + '/static', archive_root + '/output/static']
        # find history files going back from the date of the current file
        if 'item_args' in work_item and 'date_string' in work_item['item_args']:
            date_str = work_item['item_args']['date_string']
        else:
            msg = "make_hybrid_forecast can't find date_string"
            logging.error(msg)
            return 'todo error3'
        testdate = u.datestr_to_daynum(date_str) - 1
        while True:
            date_str = u.daynum_to_datestr(testdate)
            # TODO what if it isn't a gif? also a hard-coded naming convention
            fname = u.find_in_paths(
                current_name + '_frame_00000.gif',
                'esrl-daily-forecasts/' + date_str + '/frame0',
                roots,
                options={'full': True}
            )
            if fname:
                # logging.info("got frame0 file '%s'" % fname)
                Config.log(fname, tag='HYBRID_FORECAST_FRAME_GOT0')
                history_frame0s.insert(0, fname)
            else:
                Config.log("frames not contiguous by date, missing '%s'" % date_str,
                           'HYBRID_FORECAST_FRAME_ERR')
            if len(history_frame0s) >= history_days:
                break
            testdate -= 1
            if testdate < local_store_start:
                msg = "make_hybrid_forecast only found %i history frames" % len(history_frame0s)
                logging.warning(msg)
                break
        hist_frame_ct = len(history_frame0s)
        if not hist_frame_ct:
            msg = "key '%s': no history frames, can't proceed" % dest_key
            Config.log(msg, tag='HYBRID_FORECAST_ERROR')
            webmaker.config.add_to_final_summary(msg)
            return 'todo error4'
        # extract frames from the current image
        work_path = webmaker.config.output + '/tmp/' + dest_name
        u.ensure_path(work_path)
        extract_frames(webmaker.interpret, finfos[0], 'out_dir:' + work_path)
        # make a list of frame files for the new image (TODO review path of tmp file)
        flist = work_path + '/' + dest_name + '.frames'
        content = "\n".join(history_frame0s)
        # add forecast frames
        for frame_num in range(1, frame_ct - 1):
            fname = work_path + '/' + current_name + '_frame_' + '{:05d}'.format(frame_num) + '.gif'
            content += '\n' + fname
        open(flist, 'w').write(content)
        # make the output animation
        if 'delay' in list_args:
            delay = list_args['delay']
        else:
            delay = webmaker.config.get('tools', 'default_frame_delay', return_none=True)
        combine_frames(flist, webmaker.output_root + '/' + dest_path, dest_name, delay=delay)
        # TODO use utility method??
        display_name = dest_name
        if 'display_name' in finfos[0]:
            display_name = finfos[0]['display_name']
        item_args = work_item['item_args']
        if 'display_name' in item_args:
            display_name = u.parse_subargs(
                item_args['display_name'], webmaker.interpret, webmaker.config, finfos[0]
            )
        # TODO REVIEW: we do this for panoply, should we here also?
        info = {
            'name': dest_name,
            'path': webmaker.output_root + '/' + dest_path,
            'rel_path': dest_path,
            'full': webmaker.output_root + '/' + dest_key,
            'key': dest_key,
            'display_name': display_name
        }
        tmp = u.local_metadata(info['path'], info['name'])
        info['size'] = tmp['size']
        info['modified'] = tmp['modified']
        webmaker.track_file(info)
        work_item['roles']['output'] = info
        os.remove(flist)
        Config.log("key '%s'" % dest_key, tag='HYBRID_FORECAST_SUCCESS')
        # TODO review - returning an empty string because this is invoked from html
        return ""
    except Exception as exc:
        Config.log(str(exc), tag='HYBRID_FORECAST_EXCEPTION')
        raise