def sync_tree_info(self, options=None):
    if self._synced_tree_info:
        logging.debug("skipping sync_tree_info, already done")
        return
    if options is None:
        options = self._default_sync_options
    # default to the fast path unless the caller explicitly asks for a refresh
    do_full_refresh = False
    if 'refresh_dest_meta' in options:
        do_full_refresh = options['refresh_dest_meta']
    if do_full_refresh:
        start = u.timestamp_now()
        logging.info("starting s3 metadata sync")
        if not options:
            options = {}
        self.get_tree_info()
        self.write_tree_info()
        elapsed = u.timestamp_now() - start
        logging.info("S3.sync_tree_info finished in %f seconds" % elapsed)
        self._synced_tree_info = True
    else:
        # read local S3 tree info (much faster)
        start = u.timestamp_now()
        if not options:
            options = {}
        self.read_tree_info()
        elapsed = u.timestamp_now() - start
        logging.info(
            "S3 tree metadata loaded from local file in %f seconds" % elapsed)
        self._synced_tree_info = True
def test_age_file():
    test = '/home/floeuser/test_age_file'
    dest = test + '/' + 'aged'
    u.ensure_path(dest)
    src = test + '/' + 'foo.bar'
    with open(src, 'w') as fh:
        fh.write('foo.bar')
    for i in range(1, 510):
        start = u.timestamp_now()
        u.age_file(src, dest)
        print("iter %i took %f seconds" % (i, u.timestamp_now() - start))
def file_dest_speed_test(dest_mgr, size):
    name = "tempfile.text"
    with open(name, "wb") as fh:
        fh.write(b"\0" * size)
    start = u.timestamp_now()
    dest_mgr.upload_finfo(".", name, name)
    elapsed = u.timestamp_now() - start
    rate = size / elapsed
    logging.debug("FileDest object uploaded, key = '%s', %i bytes, %f sec, %f bytes/sec" %
                  (name, size, elapsed, rate))
def speed_test(self, size):
    name = "tempfile.text"
    with open(name, "wb") as fh:
        # py2: fh.write("\0" * size)
        fh.write(b"\0" * size)
    start = u.timestamp_now()
    self.upload(".", name, name)
    elapsed = u.timestamp_now() - start
    rate = size / elapsed
    logging.debug(
        "S3 object uploaded, key = '%s', %i bytes, %f sec, %f bytes/sec" %
        (name, size, elapsed, rate))
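# A minimal driver for the two speed tests above - a hedged sketch only. It assumes an
# already-constructed S3-like object exposing speed_test() and a FileDest-like manager
# exposing upload_finfo(); both argument names ('s3', 'dest_mgr') are placeholders, not
# part of this module.
def run_speed_tests(s3, dest_mgr):
    # exercise a few sizes so per-object overhead and throughput can both be seen
    for size in (1024, 1024 * 1024, 16 * 1024 * 1024):
        file_dest_speed_test(dest_mgr, size)
        s3.speed_test(size)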
def process(self, do_clear_info=True):
    logging.info("starting tree processing")
    start = u.timestamp_now()
    u.ensure_path(self.config.output)
    self._root_files_processed = 0
    if self.clear_first and do_clear_info:
        u.clear_folder(self.config.output)
    # do this at start in case last run didn't clean up properly
    self.remove_unpacked_files()
    if do_clear_info:
        self.file_info.clear()
    self._pass = 0  # pass number
    self._files_processed = 0
    # make one pass over the input files. if you need to know whether this is
    # the input pass, check for self._pass == 0.
    self._walk_files(self.config.input)
    if self.config.signalled():
        logging.info("signal set, leaving tp.process")
        return False
    # then make passes over the output files until no new files are encountered
    work_done = self._files_processed > 0
    Config.log('tp._files_processed = %i' % self._files_processed, tag='WORK_DONE_PASS_0')
    # do NOT look at _root_files_processed after pass 0 - we want to fully
    # process any files created during pass 0
    while self._pass < self.PASSES:
        self._files_processed = 0
        self._pass += 1
        self._walk_files(self.config.output)
        if self.config.signalled():
            logging.info("signal set, leaving tp.process after pass %i" % self._pass)
            work_done = False
            break
        Config.log('tp._files_processed = %i' % self._files_processed,
                   tag='WORK_DONE_PASS_%i' % self._pass)
        if self._files_processed > 0:
            work_done = True
        else:
            break
    if self._pass >= self.PASSES:
        raise Exception("completed %i passes and still not done. failing" % self.PASSES)
    self.update_input_mgr_metadata()
    elapsed = u.timestamp_now() - start
    Config.log("tp completed in %i passes, %f seconds, work_done %s" %
               (self._pass, elapsed, work_done), tag='WORK_DONE')
    return work_done
def update_products(self):
    logging.info("starting FTP update from %s" % self._label)
    start = u.timestamp_now()
    self._download_count = 0
    self._ftp = ftplib.FTP(self._site, self._login, self._password)
    self._ftp.cwd(self._folder)

    # catchup mode means get the file names and metadata for the FTP folder,
    # and write it to _ftp_dirlist.txt, but don't copy any files. then, on the
    # next normal run, there will be no work to do unless you modify _ftp_dirlist.txt.
    # this allows forcing only specific files to be downloaded.
    if self.config.do('ftp_catchup'):
        cur_ftp_files = []
        self._local_files.clear()
        self._ftp.retrlines('LIST', cur_ftp_files.append)
        for entry in cur_ftp_files:
            finfo = u.ftp_metadata(entry)
            if finfo['isdir']:
                continue
            finfo['modified'] = self.ftp_modified(finfo['name'])
            finfo['path'] = self._dest_root
            finfo['full'] = self._dest_root + '/' + finfo['name']
            self._local_files[finfo['name']] = finfo
        self.write_metadata()
        return 0

    # remove or modify persisted file metadata if full path matches passed regex
    if self.config.do('ftp_remove') or self.config.do('im_rerun'):
        regex = re.compile(self.config.special_mode_args[1])

        def test(fi):
            return re.search(regex, fi['full'])

        self.read_metadata(test=test, action=self.config.special_mode_args[0])
        self.write_metadata()
        return 0

    # build metadata file from what's in input dir
    if self.config.do('im_meta_from_local'):
        self.metadata_from_local(clear_first=True)
        return 0

    # normal operation:
    self.read_metadata()
    self.metadata_from_local(clear_first=False)
    cur_ftp_files = []
    self._ftp.retrlines('LIST', cur_ftp_files.append)
    for entry in cur_ftp_files:
        if self.config.signalled():
            logging.info("signal set, leaving ftp.update_products")
            break
        finfo = u.ftp_metadata(entry)
        file_name = finfo['name']
        if finfo['isdir']:
            continue

        # test include/exclude rules
        found = False
        for reg in self._re_include:
            if re.search(reg, file_name):
                found = True
                break
        if not found:
            Config.log("skipping file '%s'" % file_name, tag='FTP_INCLUDE_FILES')
            continue
        found = False
        for reg in self._re_exclude:
            if re.search(reg, file_name):
                found = True
                break
        if found:
            Config.log("skipping file '%s'" % file_name, tag='FTP_EXCLUDE_FILES')
            continue

        finfo['modified'] = self.ftp_modified(finfo['name'])
        local_full = self._dest_root + '/' + file_name
        grabit = True
        if file_name in self._local_files:
            local_finfo = self._local_files[file_name]
            # download if ftp version is newer than our last download
            localmod = local_finfo['modified']
            remotemod = finfo['modified']
            grabit = remotemod > localmod
        if grabit:
            logging.info("grabbing new/changed file %s" % file_name)
            fh = open(local_full, "wb")
            self._ftp.retrbinary('RETR ' + file_name, fh.write)
            fh.close()
            finfo['path'] = self._dest_root
            finfo['full'] = local_full
            self._local_files[file_name] = finfo
            self._download_count += 1
            msg = "FTP downloaded file %i of limit %i" % (
                self._download_count, self._download_limit)
            logging.info(msg)
            if self._download_limit > 0 and self._download_count >= self._download_limit:
                msg = "downloaded limit of %i files, ending download phase" % self._download_count
                self.config.add_to_final_summary(msg)
                logging.warning(msg)
                break

    self._ftp.quit()
    self._ftp = None
    self.write_metadata()
    elapsed = u.timestamp_now() - start
    logging.info(
        "FTP.update_products finished in %f seconds, downloaded %i files" %
        (elapsed, self._download_count))
    return self._download_count
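# update_products assumes u.ftp_metadata turns one FTP 'LIST' line into a dict with at
# least 'name' and 'isdir' keys. The sketch below shows what such a parser could look
# like for Unix-style listings; it is an assumption for illustration only - the real
# u.ftp_metadata may handle more listing formats and return more fields.
def ftp_metadata_sketch(list_line):
    # a Unix LIST line looks like:
    # drwxr-xr-x   2 owner group     4096 Jan 01 12:00 name-with possible spaces
    parts = list_line.split(None, 8)  # perms, links, owner, group, size, month, day, time/year, name
    return {
        'isdir': list_line.startswith('d'),  # first char of the permission string
        'size': int(parts[4]) if len(parts) > 4 and parts[4].isdigit() else 0,
        'name': parts[-1],
    }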
def sync_to_upstream_dest_mgr(self, upstream, refresh_me, refresh_upstream, tmp_folder, fixer=None):
    msg = "refresh_me = %s, refresh_upstream = %s, upstream root = '%s', my root = '%s', tmp_folder = %s" % \
        (refresh_me, refresh_upstream, upstream._file_dest_root, self._file_dest_root, tmp_folder)
    Config.log(msg, tag='FILE_DEST_SYNC_TO_UPSTREAM_STARTING')

    do_refresh = {'refresh_dest_meta': True}
    dont_refresh = {'refresh_dest_meta': False}
    smart_refresh = {'refresh_dest_meta': True, 'skip_refresh_if_tree_unchanged': True}

    if refresh_me == 'full':
        self.sync_tree_info(options=do_refresh)
    elif refresh_me == 'smart':
        self.sync_tree_info(options=smart_refresh)
    else:
        self.sync_tree_info(options=dont_refresh)

    if refresh_upstream == 'full':
        upstream.sync_tree_info(options=do_refresh)
    elif refresh_upstream == 'smart':
        upstream.sync_tree_info(options=smart_refresh)
    else:
        upstream.sync_tree_info(options=dont_refresh)

    u.clear_folder(tmp_folder)
    start = u.timestamp_now()
    for key, finfo in upstream.tree_info_items():
        src = os.path.join(upstream._file_dest_root, key)
        if not os.path.exists(src):
            msg = "file '%s' does not exist" % src
            Config.log(msg, tag='FILE_DEST_SYNC_TO_UPSTREAM_METADATA_ERROR')
            continue
        if self.config.is_template_type(finfo['name']) and fixer:
            # copy and fix up
            dest = os.path.join(tmp_folder, key)
            u.ensure_path_for_file(dest)
            shutil.copyfile(src, dest)
            fixer(dest)
            # make new metadata and copy to self
            newfi = copy.deepcopy(finfo)
            path, file = os.path.split(dest)
            local_meta = u.local_metadata(path, file)
            newfi['size'] = local_meta['size']
            newfi['modified'] = local_meta['modified']
            newfi['md5'] = u.md5(dest)
            self.tree_info[key] = newfi
            src = dest
            dest = os.path.join(self._file_dest_root, key)
            u.ensure_path_for_file(dest)
            shutil.copyfile(src, dest)
            # we could remove fixed-up file now, but clear_folder at end probably faster
        else:
            # file not subject to fixup. just copy if missing/older/diff size
            copyit = False
            if key in self.tree_info:
                # compare metadata and see whether to copy
                myfi = self.tree_info[key]
                if myfi['md5'] != finfo['md5'] or \
                        myfi['modified'] < finfo['modified'] or \
                        myfi['size'] != finfo['size']:
                    copyit = True
            else:
                copyit = True
            if copyit:
                # REVIEW - deepcopy probably safe here because we're copying from
                # one dest mgr to another
                self.tree_info[key] = copy.deepcopy(finfo)
                dest = os.path.join(self._file_dest_root, key)
                u.ensure_path_for_file(dest)
                shutil.copyfile(src, dest)

    # delete from me if not in upstream
    to_delete = {}
    for key, finfo in self.tree_info_items():
        if key not in upstream.tree_info:
            to_delete[key] = os.path.join(self._file_dest_root, finfo['key'])
    for key, full in to_delete.items():
        os.remove(full)
        del self.tree_info[key]

    self.write_tree_info()
    # this is a space-saving move, but should be small, and might
    # be handy to have files around for debug. could be a config option.
    # u.clear_folder(tmp_folder)
    elapsed = u.timestamp_now() - start
    msg = "done, elapsed %f seconds" % elapsed
    Config.log(msg, tag='FILE_DEST_SYNC_TO_UPSTREAM_FINISHED')
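# sync_to_upstream_dest_mgr accepts an optional 'fixer' callable that edits a copied
# template file in place before it is published to this dest mgr. A minimal sketch of
# that contract follows; the placeholder token and replacement value are hypothetical -
# real fixers are application-specific.
def example_fixer(path):
    with open(path, 'r', encoding='utf8') as fh:
        text = fh.read()
    # hypothetical placeholder substitution; the real fixer decides what to rewrite
    text = text.replace('{{UPSTREAM_URL}}', 'https://example.invalid/')
    with open(path, 'w', encoding='utf8') as fh:
        fh.write(text)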
def sync_tree_info(self, options=None):
    if self._synced_tree_info:
        Config.log(self.config_section, tag='FILE_DEST_META_ALREADY_SYNCED')
        return
    start = u.timestamp_now()
    logging.info("starting FileDest metadata sync")
    if options is None:
        options = self._default_sync_options
    # need to read persisted file, as that's the only place non-file-system metadata can live
    self.read_tree_info()
    # determine whether to force a full refresh, only do it if the tree has changed,
    # or simply trust the persisted metadata:
    do_full_refresh = False
    if 'refresh_dest_meta' in options:
        do_full_refresh = options['refresh_dest_meta']
    # computing all those md5s takes a long time, so optionally skip it if the most
    # recent modification time for _file_dest_root is unchanged since we last did it
    if 'skip_refresh_if_tree_unchanged' in options:
        last_mod = u.dir_last_modified(self._file_dest_root)
        expected_last_mod = self._tree_last_modified
        do_full_refresh = last_mod != expected_last_mod
        msg = "do_full_refresh = '%s', last_mod = '%f', expected_last_mod = '%f'" % (
            do_full_refresh, last_mod, expected_last_mod)
        Config.log(msg, tag='FILE_DEST_META_TREE_UNCHANGED_TEST')

    if do_full_refresh:
        # physically walk the tree as it might not match persisted data
        for dir_name, subdirs, files in os.walk(self._file_dest_root):
            for file_name in files:
                full_src = dir_name + '/' + file_name
                setit = False
                rel_path = u.make_rel_path(self._file_dest_root, dir_name,
                                           strict=False, no_leading_slash=True)
                key = u.make_key(rel_path, file_name)
                local_meta = u.local_metadata(dir_name, file_name)
                local_meta['md5'] = u.md5(full_src)
                if key in self.tree_info:
                    saved_meta = self.tree_info[key]
                    if 'md5' not in saved_meta:
                        saved_meta['md5'] = 'ERROR! md5 MISSING FROM tree_info!'
                    if local_meta['md5'] == saved_meta['md5']:
                        # sanity check
                        if local_meta['size'] != saved_meta['size']:
                            msg = "key '%s', saved: size %i, read: size %i" % (
                                key, saved_meta['size'], local_meta['size'])
                            Config.log(msg, tag='FILE_DEST_META_ERROR_NONFATAL')
                        # otherwise file is perfect, continue
                    else:
                        msg = "key '%s', md5 mismatch. saved: '%s', read: '%s'" % (
                            key, saved_meta['md5'], local_meta['md5'])
                        Config.log(msg, tag='FILE_DEST_META_ERROR_FATAL')
                        setit = True
                else:
                    msg = "key '%s' not found in saved, adding" % key
                    Config.log(msg, tag='FILE_DEST_META_NEW_FILE')
                    setit = True
                if setit:
                    local_meta['key'] = key
                    # important: must never expose 'full' outside this class - it's a private
                    # implementation detail. Same for 'path'. Only 'key' is public
                    del local_meta['full']
                    del local_meta['path']
                    self.tree_info[key] = local_meta
                self.tree_info[key]['_found_file_'] = True

        missing = []
        for key in self.tree_info:
            if '_found_file_' in self.tree_info[key]:
                del self.tree_info[key]['_found_file_']
            else:
                missing.append(key)
                msg = "no file matching key '%s', deleting" % key
                Config.log(msg, tag='FILE_DEST_META_MISSING')
        for key in missing:
            del self.tree_info[key]
        self.write_tree_info()
        act = "completed"
    else:
        # trust the persisted file (faster)
        act = "bypassed"

    elapsed = u.timestamp_now() - start
    msg = "%s confirmation of tree info in %f seconds" % (act, elapsed)
    Config.log(msg, tag='FILE_DEST_SYNC')
    self._synced_tree_info = True
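# The skip_refresh_if_tree_unchanged path above relies on u.dir_last_modified returning a
# single number that changes whenever anything under _file_dest_root changes. A minimal
# sketch of such a helper is below, assuming "most recent file mtime under the root" is a
# good enough signal; the real u.dir_last_modified may differ (it might also consider
# directory mtimes or deletions).
import os

def dir_last_modified_sketch(root):
    """Return the most recent mtime of any file under root (0.0 for an empty tree)."""
    latest = 0.0
    for dir_name, _subdirs, files in os.walk(root):
        for file_name in files:
            latest = max(latest, os.path.getmtime(os.path.join(dir_name, file_name)))
    return latest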
def upload_tree(self, local_root, remote_root='', options=None, local_tree_meta=None):
    logging.info("starting FileDest upload")
    if not options:
        options = {'use_md5': True}
    start = u.timestamp_now()
    self._upload_count = 0
    # refresh and save data for files already on dest
    self.sync_tree_info(options=self._default_sync_options)
    for dir_name, subdirs, files in os.walk(local_root):
        rel_path = u.make_rel_path(local_root, dir_name)
        if not rel_path.startswith('/tmp'):
            for file_name in files:
                local_file = dir_name + '/' + file_name
                key = u.make_key(rel_path, file_name)
                local_md5 = u.md5(local_file)
                if local_tree_meta and local_file in local_tree_meta:
                    local_meta = local_tree_meta[local_file]
                else:
                    local_meta = u.local_metadata(dir_name, file_name)
                size = local_meta['size']
                cached_info = None
                if key in self.tree_info:
                    cached_info = self.tree_info[key]
                do_upload = True
                if 'use_md5' in options:
                    if cached_info and not self.is_pending(key):
                        if 'md5' in self.tree_info[key]:
                            remote_md5 = self.tree_info[key]['md5']
                            do_upload = do_upload and remote_md5 != local_md5
                        else:
                            err = "no md5 value for existing key '%s' (old version?)" % key
                            logging.error(err)
                    else:
                        Config.log("file '%s' is not in FileDest" % key,
                                   tag='DEST_NO_EXISTING_FILE')
                if self._max_upload_size >= 0 and size > self._max_upload_size:
                    logging.debug("file '%s' size (%i) > limit (%i), won't upload" %
                                  (key, size, self._max_upload_size))
                    do_upload = False
                if do_upload:
                    extra_args = {
                        'Metadata': {'md5': local_md5}
                    }
                    logging.debug("FileDest object upload starting, key = '%s', %i bytes" %
                                  (key, size))
                    # per-file timer, kept separate so the overall elapsed time below stays correct
                    upload_start = u.timestamp_now()
                    self._upload(dir_name, file_name, key, extra_args=extra_args)
                    rate = size / (u.timestamp_now() - upload_start)
                    Config.log("key = '%s', %f bytes/sec" % (key, rate),
                               tag='FILE_DEST_UPLOAD_OK')
                    # add metadata to our repos
                    info = {
                        'new': True,
                        'name': file_name,
                        'rel_path': rel_path,
                        'key': key,
                        'size': local_meta['size'],
                        'modified': local_meta['modified'],
                        # 'mod_dt': last_mod,
                        # 'e_tag': obj.e_tag,
                        'md5': local_md5
                    }
                    # transfer meta (e.g. thumbnail info) if exists
                    if local_tree_meta and local_file in local_tree_meta:
                        self.transfer_metadata(local_tree_meta[local_file],
                                               local_root=self.local_root, dest=info)
                    self.tree_info[key] = info
                    self._upload_count += 1
                else:
                    Config.log("key = '%s'" % key, tag='FILE_DEST_UPLOAD_NO_CHANGE')
    self.write_tree_info()
    elapsed = u.timestamp_now() - start
    logging.info("FileDest.upload_tree finished in %f seconds, uploaded %i files" %
                 (elapsed, self._upload_count))
    return self._upload_count
from os import mkdir
from os.path import join, normpath

from util import Timer, timestamp_now

INPUT_PATH = TREEBANK_DATA_PATH  # '../treebank_data/00/ann_0001.parse'
OUTPUT_PATH = '../reports/'      # Must be directory; filename auto-generated


def timestamped_file_path(filename, timestamp):
    return normpath(join(OUTPUT_PATH, timestamp, filename))


###############################################################################

if __name__ == '__main__':
    timer = Timer()
    nowstamp = timestamp_now()
    mkdir(normpath(join(OUTPUT_PATH, nowstamp)))
    csvpath = timestamped_file_path('verbs.csv', nowstamp)
    pd_report_path = timestamped_file_path('pro-drop report.txt', nowstamp)
    npd_report_path = timestamped_file_path('non-pro-drop report.txt', nowstamp)
    with timer:
        ca = CombinedAnalyzer(INPUT_PATH)
        ca.do_analysis()
        ca.print_report_basic()
        with open(pd_report_path, 'w', encoding='utf8') as pdout:
            with open(npd_report_path, 'w', encoding='utf8') as npdout:
                ca.write_report_full(pdout, npdout)
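# The script above uses util.Timer as a context manager ('with timer:'). A minimal sketch
# of a compatible timer is below, assuming it only needs to measure and report wall-clock
# time for the block; the real util.Timer may record or report results differently.
import time

class TimerSketch:
    def __enter__(self):
        self.start = time.time()
        return self

    def __exit__(self, exc_type, exc, tb):
        self.elapsed = time.time() - self.start
        print("elapsed %f seconds" % self.elapsed)
        return False  # never suppress exceptions raised inside the block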