def stamp_hash(self, h5file_name, new_hash):
    '''Open the named HDF5 file, record ``new_hash`` in its ``arg_hash``
    attribute, and return the file reopened in read-only mode.'''
    # Open read/write just long enough to persist the hash to disk.
    writable = h5io.WESTPAH5File(h5file_name, 'r+')
    writable.attrs['arg_hash'] = new_hash
    writable.close()
    # Hand back a read-only handle so callers cannot mutate the stamped file.
    return h5io.WESTPAH5File(h5file_name, 'r')
def open_files(self):
    '''Open the output file for appending and the assignments/kinetics input
    files read-only, checking that the assignments cover the requested
    iteration range.'''
    self.output_file = h5io.WESTPAH5File(self.output_filename, 'a', creating_program=True)
    h5io.stamp_creator_data(self.output_file)
    # Inputs are opened read-only (in-memory 'core' driver deliberately unused).
    self.assignments_file = h5io.WESTPAH5File(self.assignments_filename, 'r')
    self.kinetics_file = h5io.WESTPAH5File(self.kinetics_filename, 'r')
    if not self.iter_range.check_data_iter_range_least(self.assignments_file):
        raise ValueError('assignments data do not span the requested iterations')
def open_files(self):
    '''Create the combined output file and open every per-trial west.h5 file.

    NOTE: ``self.output_file`` initially holds a filename string and is
    replaced here by the open ``WESTPAH5File`` handle.'''
    self.output_file = h5io.WESTPAH5File(self.output_file, 'w', creating_program=True)
    h5io.stamp_creator_data(self.output_file)
    per_trial_files = self.generate_file_list([self.west])
    self.westH5 = per_trial_files[self.west]
def generate_file_list(self, key_list, pad_width=2):
    '''A convenience function which takes in a list of keys that are filenames,
    and returns a dictionary which contains all the individual files loaded
    inside of a dictionary keyed to the filename.

    Parameters
    ----------
    key_list : list of str
        Filenames (relative to each trial directory) to open.
    pad_width : int, optional
        Zero-padding width of the per-trial directory names (default 2,
        matching the previous hard-coded behavior, e.g. ``01``, ``02``, ...).

    Returns
    -------
    dict
        ``{filename: {trial_number: open WESTPAH5File, ...}, ...}``

    Raises
    ------
    self.NoSimulationsException
        If ``self.ntrials`` is 0.
    '''
    if self.ntrials == 0:
        raise self.NoSimulationsException('You must specify the number of simulations.')
    return_dict = {key: {} for key in key_list}
    for trial in range(1, self.ntrials + 1):
        # Directory name padding is now parameterized instead of hard-coded.
        trial_dir = str(trial).zfill(pad_width)
        for key in key_list:
            return_dict[key][trial] = h5io.WESTPAH5File(os.path.join(self.master, trial_dir, key), 'r')
    return return_dict
def _find_matching_segments(west_datafile_name, n_iter, predicate, invert=False):
    '''Find all segments in iteration ``n_iter`` that match (or do not match,
    if ``invert`` is true) the given ``predicate``.  Returns a sequence of
    matching seg_ids.'''
    with h5io.WESTPAH5File(west_datafile_name, 'r') as west_datafile:
        iter_group = west_datafile.get_iter_group(n_iter)
        n_segments = iter_group['seg_index'].shape[0]
        # Normalize whatever the predicate yields to a set of plain ints.
        selected = {int(seg_id) for seg_id in predicate(n_iter, iter_group)}
        if invert:
            selected = set(range(n_segments)) - selected
        id_array = np.fromiter(selected, dtype=seg_id_dtype, count=len(selected))
        return n_iter, sorted(id_array)
def process_args(self, args):
    '''Digest parsed command-line arguments: open the kinetics file and
    record the iteration range, binning, and output options.'''
    self.progress.process_args(args)
    self.kinetics_filename = args.kinetics
    self.istate = args.istate
    self.fstate = args.fstate
    self.kinetics_file = h5io.WESTPAH5File(self.kinetics_filename, 'r')
    self.iter_start = args.iter_start
    # An explicit --iter-stop becomes an exclusive upper bound; otherwise
    # fall back to the stop iteration recorded in the kinetics file.
    if args.iter_stop is not None:
        self.iter_stop = args.iter_stop + 1
    else:
        self.iter_stop = self.kinetics_file.attrs['iter_stop']
    self.binspec = args.bins
    self.output_filename = args.output
    self.ignore_out_of_range = bool(args.ignore_out_of_range)
    self.compress_output = args.compress or False
def go(self):
    '''For each iteration and bin, select up to ``self.count`` segments by
    weight (lowest, highest, or random per ``self.what``) and record their
    seg_ids and weights in the output HDF5 file.'''
    self.data_reader.open('r')
    assignments_file = h5py.File(self.assignments_filename, mode='r')
    output_file = h5io.WESTPAH5File(self.output_filename, mode='w')
    pi = self.progress.indicator
    count = self.count
    timepoint = self.timepoint

    # +1 accounts for the extra "unknown/unassigned" bin index.
    nbins = assignments_file.attrs['nbins'] + 1
    assignments_ds = assignments_file['assignments']

    iter_start, iter_stop = self.iter_range.iter_start, self.iter_range.iter_stop
    iter_count = iter_stop - iter_start
    h5io.check_iter_range_least(assignments_ds, iter_start, iter_stop)
    nsegs = assignments_file['nsegs'][h5io.get_iteration_slice(assignments_file['nsegs'], iter_start, iter_stop)]

    output_file.create_dataset('n_iter', dtype=n_iter_dtype, data=list(range(iter_start, iter_stop)))

    seg_count_ds = output_file.create_dataset('nsegs', dtype=np.uint, shape=(iter_count, nbins))
    matching_segs_ds = output_file.create_dataset(
        'seg_ids',
        shape=(iter_count, nbins, count),
        dtype=seg_id_dtype,
        chunks=h5io.calc_chunksize((iter_count, nbins, count), seg_id_dtype),
        shuffle=True,
        compression=9,
    )
    weights_ds = output_file.create_dataset(
        'weights',
        shape=(iter_count, nbins, count),
        dtype=weight_dtype,
        chunks=h5io.calc_chunksize((iter_count, nbins, count), weight_dtype),
        shuffle=True,
        compression=9,
    )
    what = self.what

    with pi:
        pi.new_operation('Finding matching segments', extent=iter_count)
        for iiter, n_iter in enumerate(range(iter_start, iter_stop)):
            assignments = np.require(
                assignments_ds[h5io.get_iteration_entry(assignments_ds, n_iter) + np.index_exp[:, timepoint]],
                dtype=westpa.binning.index_dtype,
            )
            all_weights = self.data_reader.get_iter_group(n_iter)['seg_index']['weight']

            # the following Cython function just executes this loop:
            # for iseg in xrange(nsegs[iiter]):
            #     segs_by_bin[iseg,assignments[iseg]] = True
            segs_by_bin = assignments_list_to_table(nsegs[iiter], nbins, assignments)
            for ibin in range(nbins):
                segs = np.nonzero(segs_by_bin[:, ibin])[0]
                seg_count_ds[iiter, ibin] = min(len(segs), count)
                if len(segs):
                    weights = all_weights.take(segs)

                    if what == 'lowweight':
                        indices = np.argsort(weights)[:count]
                    elif what == 'highweight':
                        indices = np.argsort(weights)[::-1][:count]
                    else:
                        assert what == 'random'
                        # BUG FIX: truncate the permutation to ``count`` entries.
                        # Previously all len(weights) indices were kept, so when a
                        # bin held more than ``count`` segments the write below
                        # overflowed the (iter_count, nbins, count) datasets.
                        indices = np.random.permutation(len(weights))[:count]

                    # Slice by len(indices) == min(count, len(segs)) so the
                    # destination region always matches the data written
                    # (previously ':len(segs)' relied on h5py slice clamping).
                    matching_segs_ds[iiter, ibin, : len(indices)] = segs.take(indices)
                    weights_ds[iiter, ibin, : len(indices)] = weights.take(indices)
                    del weights
                del segs
            del assignments, segs_by_bin, all_weights
            pi.progress += 1
def go(self):
    # Find all segments matching self.predicate in each iteration (in
    # parallel via the work manager), record their seg_ids and weights in
    # growable output datasets, and optionally trace each match's ancestry
    # backwards through parent_ids.
    self.data_reader.open('r')
    output_file = h5io.WESTPAH5File(self.output_filename, mode='w')
    pi = self.progress.indicator

    iter_start, iter_stop = self.iter_range.iter_start, self.iter_range.iter_stop
    iter_count = iter_stop - iter_start

    output_file.create_dataset('n_iter', dtype=n_iter_dtype, data=list(range(iter_start, iter_stop)))
    # Widest second dimension seen so far; the seg_ids/weights datasets are
    # resized upward as iterations with more matches are encountered.
    current_seg_count = 0
    seg_count_ds = output_file.create_dataset('n_segs', dtype=np.uint, shape=(iter_count,))
    # Start with zero columns; maxshape=(iter_count, None) allows resizing.
    matching_segs_ds = output_file.create_dataset(
        'seg_ids',
        shape=(iter_count, 0),
        maxshape=(iter_count, None),
        dtype=seg_id_dtype,
        chunks=h5io.calc_chunksize((iter_count, 1000000), seg_id_dtype),
        shuffle=True,
        compression=9,
    )
    weights_ds = output_file.create_dataset(
        'weights',
        shape=(iter_count, 0),
        maxshape=(iter_count, None),
        dtype=weight_dtype,
        chunks=h5io.calc_chunksize((iter_count, 1000000), weight_dtype),
        shuffle=True,
        compression=9,
    )

    with pi:
        pi.new_operation('Finding matching segments', extent=iter_count)
        # futures = set()
        # for n_iter in xrange(iter_start,iter_stop):
        #     futures.add(self.work_manager.submit(_find_matching_segments,
        #                                          args=(self.data_reader.we_h5filename,n_iter,self.predicate,self.invert)))

        # for future in self.work_manager.as_completed(futures):
        # Results may arrive out of iteration order; each future carries its
        # own n_iter so rows are written at the correct offset.
        for future in self.work_manager.submit_as_completed(
            (
                (_find_matching_segments, (self.data_reader.we_h5filename, n_iter, self.predicate, self.invert), {})
                for n_iter in range(iter_start, iter_stop)
            ),
            self.max_queue_len,
        ):
            n_iter, matching_ids = future.get_result()
            n_matches = len(matching_ids)

            if n_matches:
                if n_matches > current_seg_count:
                    # Grow the column dimension to fit this iteration's matches.
                    # NOTE(review): current_seg_count is assigned twice here;
                    # the first assignment is redundant (same value).
                    current_seg_count = len(matching_ids)
                    matching_segs_ds.resize((iter_count, n_matches))
                    weights_ds.resize((iter_count, n_matches))
                    current_seg_count = n_matches

                seg_count_ds[n_iter - iter_start] = n_matches
                matching_segs_ds[n_iter - iter_start, :n_matches] = matching_ids
                # Weights are fancy-indexed by sorted seg_id, matching the
                # sorted order emitted by _find_matching_segments.
                weights_ds[n_iter - iter_start, :n_matches] = self.data_reader.get_iter_group(n_iter)['seg_index'][
                    'weight'
                ][sorted(matching_ids)]
            del matching_ids
            pi.progress += 1

        if self.include_ancestors:
            pi.new_operation('Tracing ancestors of matching segments', extent=iter_count)
            # Walk backwards in time; from_previous carries the parent seg_ids
            # of the following (later) iteration's matches into this one.
            from_previous = set()
            current_seg_count = matching_segs_ds.shape[1]
            for n_iter in range(iter_stop - 1, iter_start - 1, -1):
                iiter = n_iter - iter_start
                n_matches = seg_count_ds[iiter]
                matching_ids = set(from_previous)
                if n_matches:
                    matching_ids.update(matching_segs_ds[iiter, : seg_count_ds[iiter]])
                from_previous.clear()

                n_matches = len(matching_ids)
                if n_matches > current_seg_count:
                    matching_segs_ds.resize((iter_count, n_matches))
                    weights_ds.resize((iter_count, n_matches))
                    current_seg_count = n_matches

                if n_matches > 0:
                    seg_count_ds[iiter] = n_matches
                    matching_ids = sorted(matching_ids)
                    matching_segs_ds[iiter, :n_matches] = matching_ids
                    weights_ds[iiter, :n_matches] = self.data_reader.get_iter_group(n_iter)['seg_index']['weight'][
                        sorted(matching_ids)
                    ]
                    parent_ids = self.data_reader.get_iter_group(n_iter)['seg_index']['parent_id'][sorted(matching_ids)]
                    # Negative parent_ids denote initial states, not real
                    # segments, so they are excluded from further tracing.
                    from_previous.update(parent_id for parent_id in parent_ids if parent_id >= 0)  # filter initial states
                    del parent_ids
                del matching_ids
                pi.progress += 1
def analysis_structure(self):
    '''
    Run automatically on startup.  Parses through the configuration file, and loads up all the data files from the different
    analysis schematics.  If they don't exist, it creates them automatically by hooking in to existing analysis routines
    and going from there.

    It does this by calling in the make_parser_and_process function for w_{assign,reweight,direct} using a custom built list
    of args.  The user can specify everything in the configuration file that would have been specified on the command line.

    For instance, were one to call w_direct as follows:

        w_direct --evolution cumulative --step-iter 1 --disable-correl

    the west.cfg would look as follows:

    west:
      analysis:
        w_direct:
          evolution: cumulative
          step_iter: 1
          extra: ['disable-correl']

    Alternatively, if one wishes to use the same options for both w_direct and w_reweight, the key 'w_direct' can be replaced
    with 'kinetics'.
    '''
    # Make sure everything exists.
    try:
        os.mkdir(self.__settings['directory'])
    except Exception:
        # Directory probably already exists; proceed either way.
        pass
    # Now, check to see whether they exist, and then load them.
    self.__analysis_schemes__ = {}
    # We really need to implement some sort of default behavior if an analysis scheme isn't set.
    # Right now, we just crash.  That isn't really graceful.
    for scheme in self.__settings['analysis_schemes']:
        if self.__settings['analysis_schemes'][scheme]['enabled']:
            if self.work_manager.running is False:
                self.work_manager.startup()
            path = os.path.join(os.getcwd(), self.__settings['directory'], scheme)
            # if 'postanalysis' in self.__settings['analysis_schemes'][scheme] and 'postanalysis' in self.__settings['postanalysis']:
            # Should clean this up.  But it uses the default global setting if a by-scheme one isn't set.
            if 'postanalysis' in self.__settings:
                if 'postanalysis' in self.__settings['analysis_schemes'][scheme]:
                    pass
                else:
                    # Per-scheme setting absent: inherit the global default.
                    self.__settings['analysis_schemes'][scheme]['postanalysis'] = self.__settings['postanalysis']
            try:
                os.mkdir(path)
            except Exception:
                pass
            self.__analysis_schemes__[scheme] = {}
            try:
                # Reweighting ('reweight') is only generated when postanalysis
                # is enabled globally or for this scheme.
                if (
                    self.__settings['analysis_schemes'][scheme]['postanalysis'] is True
                    or self.__settings['postanalysis'] is True
                ):
                    analysis_files = ['assign', 'direct', 'reweight']
                else:
                    analysis_files = ['assign', 'direct']
            except Exception:
                analysis_files = ['assign', 'direct']
                self.__settings['analysis_schemes'][scheme]['postanalysis'] = False
            reanalyze_kinetics = False
            assign_hash = None
            for name in analysis_files:
                arg_hash = None
                if self.reanalyze is True:
                    reanalyze_kinetics = True
                    # Forced reanalysis: delete any stale output up front.
                    try:
                        os.remove(os.path.join(path, '{}.h5'.format(name)))
                    except Exception:
                        pass
                else:
                    try:
                        # Try to load the hash.  If we fail to load the hash or the file, we need to reload.
                        # if self.reanalyze == True:
                        #     raise ValueError('Reanalyze set to true.')
                        self.__analysis_schemes__[scheme][name] = h5io.WESTPAH5File(
                            os.path.join(path, '{}.h5'.format(name)), 'r'
                        )
                        arg_hash = self.__analysis_schemes__[scheme][name].attrs['arg_hash']
                        if name == 'assign':
                            assign_hash = arg_hash
                    except Exception:
                        pass
                        # We shouldn't rely on this.
                        # self.reanalyze = True
                if True:
                    if name == 'assign':
                        assign = w_assign.WAssign()

                        w_assign_config = {'output': os.path.join(path, '{}.h5'.format(name))}
                        try:
                            w_assign_config.update(self.__settings['w_assign'])
                        except Exception:
                            pass
                        try:
                            w_assign_config.update(self.__settings['analysis_schemes'][scheme]['w_assign'])
                        except Exception:
                            pass
                        # Build a synthetic argv for w_assign from the config.
                        args = []
                        for key, value in w_assign_config.items():
                            if key != 'extra':
                                args.append(str('--') + str(key).replace('_', '-'))
                                args.append(str(value))
                        # This is for stuff like disabling correlation analysis, etc.
                        if 'extra' in list(w_assign_config.keys()):
                            # We're sorting to ensure that the order doesn't matter.
                            for value in sorted(w_assign_config['extra']):
                                args.append(str('--') + str(value).replace('_', '-'))
                        # We're just calling the built in function.
                        # This is a lot cleaner than what we had in before, and far more workable.
                        args.append('--config-from-file')
                        args.append('--scheme-name')
                        args.append('{}'.format(scheme))
                        # Why are we calling this if we're not sure we're remaking the file?
                        # We need to load up the bin mapper and states and see if they're the same.
                        assign.make_parser_and_process(args=args)
                        import pickle

                        # new_hash = self.hash_args(args=args, path=path, extra=[self.niters, pickle.dumps(assign.binning.mapper), assign.states])
                        # We need to encode it properly to ensure that some OS specific thing doesn't kill us.  Same goes for the args, ultimately.
                        # Mostly, we just need to ensure that we're consistent.
                        new_hash = self.hash_args(
                            args=args,
                            path=path,
                            extra=[
                                int(self.niters),
                                codecs.encode(pickle.dumps(assign.binning.mapper), "base64"),
                                base64.b64encode(str(assign.states).encode()),
                            ],
                        )
                        # Let's check the hash.  If the hash is the same, we don't need to reload.
                        if self.debug_mode is True:
                            print('{:<10}: old hash, new hash -- {}, {}'.format(name, arg_hash, new_hash))
                        if self.ignore_hash is False and (arg_hash != new_hash or self.reanalyze is True):
                            # If the hashes are different, or we need to reanalyze, delete the file.
                            try:
                                os.remove(os.path.join(path, '{}.h5'.format(name)))
                            except Exception:
                                pass
                            print('Reanalyzing file {}.h5 for scheme {}.'.format(name, scheme))
                            # reanalyze_kinetics = True
                            # We want to use the work manager we have here.  Otherwise, just let the tool sort out what it needs, honestly.
                            assign.work_manager = self.work_manager

                            assign.go()
                            assign.data_reader.close()

                            # Stamp w/ hash, then reload as read only.
                            self.__analysis_schemes__[scheme][name] = self.stamp_hash(
                                os.path.join(path, '{}.h5'.format(name)), new_hash
                            )
                        del assign
                        # Update the assignment hash.
                        assign_hash = new_hash

                    # Since these are all contained within one tool, now, we want it to just... load everything.
                    if name == 'direct' or name == 'reweight':
                        if name == 'direct':
                            analysis = w_direct.WDirect()
                        if name == 'reweight':
                            analysis = w_reweight.WReweight()

                        analysis_config = {
                            'assignments': os.path.join(path, '{}.h5'.format('assign')),
                            'output': os.path.join(path, '{}.h5'.format(name)),
                            'kinetics': os.path.join(path, '{}.h5'.format(name)),
                        }

                        # Pull from general analysis options, then general SPECIFIC options for each analysis,
                        # then general options for that analysis scheme, then specific options for the analysis type in the scheme.
                        try:
                            analysis_config.update(self.__settings['kinetics'])
                        except Exception:
                            pass
                        try:
                            analysis_config.update(self.__settings['w_{}'.format(name)])
                        except Exception:
                            pass
                        try:
                            analysis_config.update(self.__settings['analysis_schemes'][scheme]['kinetics'])
                        except Exception:
                            pass
                        try:
                            analysis_config.update(self.__settings['analysis_schemes'][scheme]['w_{}'.format(name)])
                        except Exception:
                            pass

                        # We're pulling in a default set of arguments, then updating them with arguments from the west.cfg file, if appropriate, after setting the appropriate command
                        # Then, we call the magic function 'make_parser_and_process' with the arguments we've pulled in.
                        # The tool has no real idea it's being called outside of its actual function, and we're good to go.
                        args = ['all']
                        for key, value in analysis_config.items():
                            if key != 'extra':
                                args.append(str('--') + str(key).replace('_', '-'))
                                args.append(str(value))
                        # This is for stuff like disabling correlation analysis, etc.
                        if 'extra' in list(analysis_config.keys()):
                            for value in sorted(analysis_config['extra']):
                                args.append(str('--') + str(value).replace('_', '-'))
                        # We want to not display the averages, so...
                        args.append('--disable-averages')
                        # Kinetics hashes additionally fold in the assign hash so
                        # a rebuilt assignment invalidates downstream files.
                        new_hash = self.hash_args(args=args, path=path, extra=[int(self.niters), assign_hash])
                        # if arg_hash != new_hash or self.reanalyze == True or reanalyze_kinetics == True:
                        if self.debug_mode is True:
                            print('{:<10}: old hash, new hash -- {}, {}'.format(name, arg_hash, new_hash))
                        if self.ignore_hash is False and (arg_hash != new_hash or reanalyze_kinetics is True):
                            try:
                                os.remove(os.path.join(path, '{}.h5'.format(name)))
                            except Exception:
                                pass
                            print('Reanalyzing file {}.h5 for scheme {}.'.format(name, scheme))
                            analysis.make_parser_and_process(args=args)
                            # We want to hook into the existing work manager.
                            analysis.work_manager = self.work_manager

                            analysis.go()

                            # Open!
                            self.__analysis_schemes__[scheme][name] = self.stamp_hash(
                                os.path.join(path, '{}.h5'.format(name)), new_hash
                            )
                        del analysis

    # Make sure this doesn't get too far out, here.  We need to keep it alive as long as we're actually analyzing things.
    # self.work_manager.shutdown()
    print("")
    print("Complete!")
from westpa import rc
from westpa.core import h5io

data_manager = rc.get_data_manager()

# Store west.h5 file in RAM for testing
west_file_name = 'west.h5'
# driver='core' with backing_store=False keeps the HDF5 file entirely in
# memory and discards any changes on close.
# NOTE(review): no mode argument is passed, so h5py's default mode applies —
# confirm this opens an existing west.h5 as intended.
west_file = h5io.WESTPAH5File(west_file_name, driver='core', backing_store=False)
data_manager.we_h5file = west_file
# Fall back to version 0 when the root attribute is absent.
data_manager.we_h5file_version = int(west_file['/'].attrs.get('west_file_format_version', 0))
def process_args(self, args):
    '''Open the output file (write) and assignments file (read-only) named in
    the parsed arguments, and choose the transition-matrix sampling frequency.'''
    self.output_file = h5io.WESTPAH5File(args.output, 'w', creating_program=True)
    self.assignments_file = h5io.WESTPAH5File(args.assignments, 'r')

    # Force a build of the transition matrix at the iteration level when the
    # assignments were subsampled; otherwise honor the requested frequency.
    # (Equality against True is kept deliberately: the attribute comes from
    # HDF5 and may not be a plain Python bool.)
    if self.assignments_file.attrs['subsampled'] == True:  # noqa: E712
        self.sampling_frequency = 'iteration'
    else:
        self.sampling_frequency = args.sampling_frequency