class WESTKineticsBase(WESTSubcommand):
    '''
    Common argument processing for w_direct/w_reweight subcommands.
    Mostly limited to handling input and output from w_assign.
    '''

    def __init__(self, parent):
        '''Create the shared components and initialise all state to defaults.'''
        super(WESTKineticsBase, self).__init__(parent)

        self.data_reader = WESTDataReader()
        self.iter_range = IterRangeSelection()
        self.progress = ProgressIndicatorComponent()

        self.output_filename = None
        # This is actually applicable to both.
        # ``assignments_filename`` is the name process_args() stores into, so
        # it is initialised here to exist before process_args() has run.
        self.assignments_filename = None
        # Older spelling that __init__ historically initialised; kept (and kept
        # in sync by process_args()) so that code reading it continues to work.
        self.assignment_filename = None

        self.output_file = None
        self.assignments_file = None

        self.evolution_mode = None

        self.mcbs_alpha = None
        self.mcbs_acalpha = None
        self.mcbs_nsets = None

        # Now we're adding in things that come from the old w_kinetics
        self.do_compression = True

    def add_args(self, parser):
        '''Register progress, data-reader, iteration-range and I/O options on ``parser``.'''
        self.progress.add_args(parser)
        self.data_reader.add_args(parser)

        # Expose the iteration step option in addition to start/stop.
        self.iter_range.include_args['iter_step'] = True
        self.iter_range.add_args(parser)

        iogroup = parser.add_argument_group('input/output options')
        iogroup.add_argument(
            '-a',
            '--assignments',
            default='assign.h5',
            help='''Bin assignments and macrostate definitions are in ASSIGNMENTS
                            (default: %(default)s).''',
        )
        # NOTE(review): ``default_output_file`` is not defined in this class;
        # presumably each concrete subclass supplies it -- confirm.
        iogroup.add_argument(
            '-o',
            '--output',
            dest='output',
            default=self.default_output_file,
            help='''Store results in OUTPUT (default: %(default)s).''',
        )

    def process_args(self, args):
        '''Consume parsed ``args``: resolve the iteration range and record file names.'''
        self.progress.process_args(args)
        self.data_reader.process_args(args)

        with self.data_reader:
            self.iter_range.process_args(args, default_iter_step=None)
            if self.iter_range.iter_step is None:
                # use about 10 blocks by default
                n_iters = self.iter_range.iter_stop - self.iter_range.iter_start
                self.iter_range.iter_step = max(1, n_iters // 10)

        self.output_filename = args.output
        self.assignments_filename = args.assignments
        # Keep the legacy attribute consistent with the canonical one.
        self.assignment_filename = args.assignments
class WFluxanlTool(WESTTool):
    '''Command-line tool that extracts per-target fluxes, averages them, and
    computes confidence intervals, optionally as a function of simulation time.'''

    prog = 'w_fluxanl'
    description = '''\
Extract fluxes into pre-defined target states from WEST data,
average, and construct confidence intervals. Monte Carlo bootstrapping
is used to account for the correlated and possibly non-Gaussian statistical
error in flux measurements.

All non-graphical output (including that to the terminal and HDF5) assumes that
the propagation/resampling period ``tau`` is equal to unity; to obtain results
in familiar units, divide all fluxes and multiply all correlation lengths by
the true value of ``tau``.
'''

    output_format_version = 2

    def __init__(self):
        '''Create the data reader / iteration selector and default all settings.'''
        super(WFluxanlTool, self).__init__()
        self.data_reader = WESTDataReader()
        self.iter_range = IterRangeSelection()
        self.output_h5file = None
        self.output_group = None
        # Maps target label -> HDF5 group holding that target's datasets.
        self.target_groups = {}

        # Maps target label -> per-iteration flux data (filled by calc_store_flux_data).
        self.fluxdata = {}

        self.alpha = None
        self.autocorrel_alpha = None
        self.n_sets = None
        # Both analyses are on unless explicitly disabled on the command line.
        self.mcbs_enable = True
        self.do_correl = True
        self.do_evol = False
        self.evol_step = 1

    def add_args(self, parser):
        '''Register input, output and calculation options on ``parser``.'''
        self.data_reader.add_args(parser)
        self.iter_range.add_args(parser)

        ogroup = parser.add_argument_group('output options')
        ogroup.add_argument(
            '-o',
            '--output',
            default='fluxanl.h5',
            help='Store intermediate data and analysis results to OUTPUT (default: %(default)s).',
        )

        cgroup = parser.add_argument_group('calculation options')
        # store_const leaves the attribute at None unless the flag is given;
        # process_args() maps None to "enabled".
        cgroup.add_argument(
            '--disable-bootstrap',
            '-db',
            dest='bootstrap',
            action='store_const',
            const=False,
            help='''Disable the use of Monte Carlo Block Bootstrapping.''',
        )
        cgroup.add_argument(
            '--disable-correl',
            '-dc',
            dest='correl',
            action='store_const',
            const=False,
            help='''Disable the correlation analysis.''',
        )
        cgroup.add_argument(
            '-a',
            '--alpha',
            type=float,
            default=0.05,
            help='''Calculate a (1-ALPHA) confidence interval on the average flux
                            (default: %(default)s)''',
        )
        cgroup.add_argument(
            '--autocorrel-alpha',
            type=float,
            dest='acalpha',
            metavar='ACALPHA',
            help='''Evaluate autocorrelation of flux to (1-ACALPHA) significance.
                            Note that too small an ACALPHA will result in failure to detect autocorrelation
                            in a noisy flux signal. (Default: same as ALPHA.)''',
        )
        cgroup.add_argument(
            '-N',
            '--nsets',
            type=int,
            help='''Use NSETS samples for bootstrapping (default: chosen based on ALPHA)''',
        )
        cgroup.add_argument(
            '--evol',
            action='store_true',
            dest='do_evol',
            help='''Calculate time evolution of flux confidence intervals (expensive).''',
        )
        cgroup.add_argument(
            '--evol-step',
            type=int,
            default=1,
            metavar='ESTEP',
            help='''Calculate time evolution of flux confidence intervals every ESTEP
                            iterations (default: %(default)s)''',
        )

    def process_args(self, args):
        '''Open the input data, create the output file and record calculation settings.'''
        self.data_reader.process_args(args)
        self.data_reader.open()
        self.iter_range.data_manager = self.data_reader
        self.iter_range.process_args(args)

        # Opened with 'w': any existing file of the same name is truncated.
        self.output_h5file = h5py.File(args.output, 'w')

        self.alpha = args.alpha
        # Disable the bootstrap or the correlation analysis.
        self.mcbs_enable = args.bootstrap if args.bootstrap is not None else True
        self.do_correl = args.correl if args.correl is not None else True
        self.autocorrel_alpha = args.acalpha or self.alpha
        self.n_sets = args.nsets or mclib.get_bssize(self.alpha)

        self.do_evol = args.do_evol
        self.evol_step = args.evol_step or 1

    def _calc_flux_ci(self, fluxes):
        '''Return ``(avg, ci_lb, ci_ub, sterr, correl_len)`` for the mean of ``fluxes``
        using the tool's current alpha / n_sets / correlation settings.'''
        return mclib.mcbs_ci_correl(
            {'dataset': fluxes},
            estimator=(lambda stride, dataset: np.mean(dataset)),
            alpha=self.alpha,
            n_sets=self.n_sets,
            autocorrel_alpha=self.autocorrel_alpha,
            subsample=np.mean,
            do_correl=self.do_correl,
            mcbs_enable=self.mcbs_enable,
        )

    def calc_store_flux_data(self):
        '''Extract fluxes for every target, store them with their autocorrelation,
        and write the overall mean flux and confidence interval per target.'''
        westpa.rc.pstatus(
            'Calculating mean flux and confidence intervals for iterations [{},{})'.format(
                self.iter_range.iter_start, self.iter_range.iter_stop
            )
        )

        fluxdata = extract_fluxes(self.iter_range.iter_start, self.iter_range.iter_stop, self.data_reader)

        # Create a group to store data in
        output_group = h5io.create_hdf5_group(self.output_h5file, 'target_flux', replace=False, creating_program=self.prog)
        self.output_group = output_group
        output_group.attrs['version_code'] = self.output_format_version
        self.iter_range.record_data_iter_range(output_group)

        n_targets = len(fluxdata)
        index = np.empty((n_targets,), dtype=target_index_dtype)
        avg_fluxdata = np.empty((n_targets,), dtype=ci_dtype)

        for itarget, (target_label, target_fluxdata) in enumerate(fluxdata.items()):
            # Create group and index entry
            index[itarget]['target_label'] = str(target_label)
            target_group = output_group.create_group('target_{}'.format(itarget))
            self.target_groups[target_label] = target_group

            # Store per-iteration values
            target_group['n_iter'] = target_fluxdata['n_iter']
            target_group['count'] = target_fluxdata['count']
            target_group['flux'] = target_fluxdata['flux']
            h5io.label_axes(target_group['flux'], ['n_iter'], units=['tau^-1'])

            # Calculate flux autocorrelation: convolving the mean-subtracted
            # signal with its reverse gives the full correlation; the second
            # half holds the non-negative lags, normalised by the lag-0 value.
            fluxes = target_fluxdata['flux']
            mean_flux = fluxes.mean()
            fmm = fluxes - mean_flux
            acorr = fftconvolve(fmm, fmm[::-1])
            acorr = acorr[len(acorr) // 2 :]
            acorr /= acorr[0]
            acorr_ds = target_group.create_dataset('flux_autocorrel', data=acorr)
            h5io.label_axes(acorr_ds, ['lag'], ['tau'])

            # Calculate overall averages and CIs
            avg, lb_ci, ub_ci, sterr, correl_len = self._calc_flux_ci(fluxes)
            avg_fluxdata[itarget] = (
                self.iter_range.iter_start,
                self.iter_range.iter_stop,
                avg,
                lb_ci,
                ub_ci,
                sterr,
                correl_len,
            )
            westpa.rc.pstatus('target {!r}:'.format(target_label))
            westpa.rc.pstatus(' correlation length = {} tau'.format(correl_len))
            westpa.rc.pstatus(' mean flux and CI = {:e} ({:e},{:e}) tau^(-1)'.format(avg, lb_ci, ub_ci))
            index[itarget]['mean_flux'] = avg
            index[itarget]['mean_flux_ci_lb'] = lb_ci
            index[itarget]['mean_flux_ci_ub'] = ub_ci
            index[itarget]['mean_flux_correl_len'] = correl_len

        # Write index and summary
        index_ds = output_group.create_dataset('index', data=index)
        index_ds.attrs['mcbs_alpha'] = self.alpha
        index_ds.attrs['mcbs_autocorrel_alpha'] = self.autocorrel_alpha
        index_ds.attrs['mcbs_n_sets'] = self.n_sets

        self.fluxdata = fluxdata
        self.output_h5file['avg_flux'] = avg_fluxdata

    def calc_evol_flux(self):
        '''Compute cumulative flux averages/CIs over growing windows of ``evol_step``
        iterations and store them as ``flux_evolution`` in each target's group.

        Requires calc_store_flux_data() to have populated ``fluxdata`` and
        ``target_groups``.'''
        westpa.rc.pstatus(
            'Calculating cumulative evolution of flux confidence intervals every {} iteration(s)'.format(self.evol_step)
        )

        for target_label, target_fluxdata in self.fluxdata.items():
            target_group = self.target_groups[target_label]
            all_fluxes = target_fluxdata['flux']

            iter_start = target_group['n_iter'][0]
            # NOTE(review): this is the *last stored* n_iter value, used here as
            # the stop bound. If n_iter holds consecutive iterations, iter_count
            # is one less than the number of flux samples and the final sample
            # can be left out of the last block -- confirm against
            # extract_fluxes() before changing.
            iter_stop = target_group['n_iter'][-1]
            iter_count = iter_stop - iter_start
            n_blocks = iter_count // self.evol_step
            if iter_count % self.evol_step > 0:
                n_blocks += 1

            cis = np.empty((n_blocks,), dtype=ci_dtype)

            for iblock in range(n_blocks):
                # Every window starts at the first iteration (cumulative evolution).
                block_iter_stop = min(iter_start + (iblock + 1) * self.evol_step, iter_stop)
                istop = min((iblock + 1) * self.evol_step, len(all_fluxes))

                avg, ci_lb, ci_ub, sterr, correl_len = self._calc_flux_ci(all_fluxes[:istop])

                cis[iblock]['iter_start'] = iter_start
                cis[iblock]['iter_stop'] = block_iter_stop
                cis[iblock]['expected'] = avg
                cis[iblock]['ci_lbound'] = ci_lb
                cis[iblock]['ci_ubound'] = ci_ub
                cis[iblock]['corr_len'] = correl_len
                cis[iblock]['sterr'] = sterr

            cis_ds = target_group.create_dataset('flux_evolution', data=cis)
            cis_ds.attrs['iter_step'] = self.evol_step
            cis_ds.attrs['mcbs_alpha'] = self.alpha
            cis_ds.attrs['mcbs_autocorrel_alpha'] = self.autocorrel_alpha
            cis_ds.attrs['mcbs_n_sets'] = self.n_sets

    def go(self):
        '''Run the analysis: overall flux statistics, then time evolution if requested.'''
        self.calc_store_flux_data()
        if self.do_evol:
            self.calc_evol_flux()
class WBinTool(WESTTool):
    '''Command-line tool with two subcommands: ``info`` (report bin statistics for an
    iteration) and ``rebin`` (recompute, and optionally commit, an iteration's binning).'''

    prog = 'w_bins'
    description = '''\
Display information and statistics about binning in a WEST simulation, or
modify the binning for the current iteration of a WEST simulation.
-------------------------------------------------------------------------------
'''

    def __init__(self):
        '''Create the data reader and bin-mapping component; no files are opened yet.'''
        super().__init__()
        self.subcommand = None
        self.data_reader = WESTDataReader()
        self.binning = BinMappingComponent()
        self.args = None
        self.n_iter = None

    # Interface for command-line tools
    def add_args(self, parser):
        '''Register data-reader options and the ``info`` / ``rebin`` subparsers.'''
        self.data_reader.add_args(parser)

        subparsers = parser.add_subparsers(help='available commands')

        info_parser = subparsers.add_parser('info', help='Display information about binning.')
        info_parser.add_argument(
            '-n',
            '--n-iter',
            type=int,
            help='''Consider initial points of segment N_ITER (default: current iteration).''',
        )
        info_parser.add_argument(
            '--detail',
            action='store_true',
            help='''Display detailed per-bin information in addition to summary
                                 information.''',
        )
        self.binning.add_args(info_parser)
        info_parser.set_defaults(func=self.cmd_info)

        rebin_parser = subparsers.add_parser('rebin', help='Rebuild current iteration with new binning.')
        rebin_parser.add_argument(
            '--confirm',
            action='store_true',
            help='''Commit the revised iteration to HDF5; without this option, the effects of the
                                  new binning are only calculated and printed.''',
        )
        rebin_parser.add_argument(
            '--detail',
            action='store_true',
            help='''Display detailed per-bin information in addition to summary
                                  information.''',
        )
        rebin_parser.add_argument(
            '-n',
            '--n-iter',
            type=int,
            help='''Consider initial points of segment N_ITER (default: current iteration).''',
        )
        self.binning.add_args(rebin_parser, suppress=['--bins-from-file'])
        self.binning.add_target_count_args(rebin_parser)
        rebin_parser.set_defaults(func=self.cmd_rebin)

    def process_args(self, args):
        '''Open the WEST data file read/write and configure binning for the chosen subcommand.

        Exits with status 2 if no subcommand was given on the command line.'''
        # argparse subcommands are optional by default in Python 3, so ``func``
        # is absent when the user supplies none; fail clearly instead of with
        # an AttributeError further down.
        func = getattr(args, 'func', None)
        if func is None:
            sys.stderr.write('no command specified; expected one of: info, rebin\n')
            sys.exit(2)

        self.data_reader.process_args(args)
        self.data_reader.open(mode='r+')
        self.n_iter = getattr(args, 'n_iter', None) or self.data_reader.current_iteration

        # we cannot read bin information during rebins
        # '==' is required here: each attribute access creates a new bound-method
        # object, so an identity ('is') comparison is always False.
        if func == self.cmd_rebin:
            self.binning.target_counts_required = True
        else:
            self.binning.set_we_h5file_info(self.n_iter, self.data_reader)

        self.binning.process_args(args)

        self.args = args
        self.subcommand = func

    def go(self):
        '''Dispatch to the subcommand handler selected in process_args().'''
        self.subcommand()

    def cmd_info(self):
        '''Assign the iteration's initial progress coordinates to bins and print statistics.'''
        mapper = self.binning.mapper

        # Get target states and their assignments
        target_states = self.data_reader.get_target_states(self.n_iter)
        n_target_states = len(target_states)

        iter_group = self.data_reader.get_iter_group(self.n_iter)

        # bin initial pcoords for iteration n_iter (time index 0 of every segment)
        initial_pcoords = iter_group['pcoord'][:, 0, :]
        assignments = mapper.assign(initial_pcoords)
        del initial_pcoords

        print('Bin information for iteration {:d}'.format(self.n_iter))

        # Get bin counts and weights
        weights = iter_group['seg_index']['weight']

        write_bin_info(mapper, assignments, weights, n_target_states, detailed=self.args.detail)

    def cmd_rebin(self):
        '''Re-run binning/resampling for iteration ``n_iter`` with the new mapper, print
        the result, and write it back to the HDF5 file only if ``--confirm`` was given.

        Exits with status 1 for iteration 1, which cannot be rebinned.'''
        mapper = self.binning.mapper
        assert mapper is not None
        if self.n_iter == 1:
            sys.stderr.write('rebin is not supported for the first iteration; reinitialize with w_init instead\n')
            sys.exit(1)
        n_target_states = len(self.data_reader.get_target_states(self.n_iter))
        we_driver = westpa.rc.get_we_driver()
        data_manager = self.data_reader.data_manager

        segments = data_manager.get_segments(self.n_iter, load_pcoords=True)
        last_iter_segments = data_manager.get_segments(self.n_iter - 1, load_pcoords=False)

        # Bin on this iteration's initial points
        # We don't have to worry about recycling because we are binning on
        # initial points rather than final points, so recycling has already
        # occurred for this iteration.
        # We do need initial states, in case we merge a newly-created walker out of existence
        avail_initial_states = data_manager.get_unused_initial_states(n_iter=self.n_iter)
        used_initial_states = data_manager.get_segment_initial_states(segments)
        we_driver.new_iteration(
            initial_states=avail_initial_states,
            bin_mapper=mapper,
            bin_target_counts=self.binning.bin_target_counts,
        )
        we_driver.used_initial_states = {state.state_id: state for state in used_initial_states}
        we_driver.assign(segments, initializing=True)
        we_driver.rebin_current(parent_segments=last_iter_segments)

        weights = np.array([segment.weight for segment in we_driver.next_iter_segments])
        assignments = np.fromiter(we_driver.next_iter_assignments, dtype=int, count=len(weights))
        write_bin_info(mapper, assignments, weights, n_target_states, detailed=self.args.detail)

        if self.args.confirm:
            data_manager.prepare_iteration(self.n_iter, list(we_driver.next_iter_segments))

            # manually update endpoint statuses only; sorting by seg_id puts the
            # values in the row order assumed for the stored segment index
            endpoint_types = sorted([(segment.seg_id, segment.endpoint_type) for segment in last_iter_segments])
            last_iter_group = data_manager.get_iter_group(self.n_iter - 1)
            last_iter_index = last_iter_group['seg_index'][...]
            last_iter_index['endpoint_type'] = [pair[1] for pair in endpoint_types]
            last_iter_group['seg_index'][...] = last_iter_index

            data_manager.save_iter_binning(
                self.n_iter, self.binning.mapper_hash, self.binning.mapper_pickle, we_driver.bin_target_counts
            )
            data_manager.update_initial_states(we_driver.all_initial_states)
            data_manager.flush_backing()
class WTraceTool(WESTTool):
    '''Command-line tool that traces trajectories back from given N_ITER:SEG_ID
    endpoints and writes a text summary plus HDF5 datasets for each trace.'''

    prog = 'w_trace'
    description = '''\
Trace individual WEST trajectories and emit (or calculate) quantities along the
trajectory.

Trajectories are specified as N_ITER:SEG_ID pairs. Each segment is traced back
to its initial point, and then various quantities (notably n_iter and seg_id)
are printed in order from initial point up until the given segment in the given
iteration.

Output is stored in several files, all named according to the pattern given by
the --output-pattern parameter. The default output pattern is "traj_%d_%d",
where the printf-style format codes are replaced by the iteration number and
segment ID of the terminal segment of the trajectory being traced.

Individual datasets can be selected for writing using the -d/--dataset option
(which may be specified more than once). The simplest form is ``-d dsname``,
which causes data from dataset ``dsname`` along the trace to be stored to
HDF5. The dataset is assumed to be stored on a per-iteration basis, with
the first dimension corresponding to seg_id and the second dimension
corresponding to time within the segment. Further options are specified
as comma-separated key=value pairs after the data set name, as in

    -d dsname,alias=newname,index=idsname,file=otherfile.h5,slice=[100,...]

The following options for datasets are supported:

    alias=newname
        When writing this data to HDF5 or text files, use ``newname``
        instead of ``dsname`` to identify the dataset. This is mostly of
        use in conjunction with the ``slice`` option in order, e.g., to
        retrieve two different slices of a dataset and store then with
        different names for future use.

    index=idsname
        The dataset is not stored on a per-iteration basis for all
        segments, but instead is stored as a single dataset whose
        first dimension indexes n_iter/seg_id pairs. The index to
        these n_iter/seg_id pairs is ``idsname``.

    file=otherfile.h5
        Instead of reading data from the main WEST HDF5 file (usually
        ``west.h5``), read data from ``otherfile.h5``.

    slice=[100,...]
        Retrieve only the given slice from the dataset. This can be
        used to pick a subset of interest to minimize I/O.

-------------------------------------------------------------------------------
'''

    # printf-style formats for progress-coordinate values, keyed by the numpy
    # dtype string with its byte-order character stripped.
    pcoord_formats = {
        'u8': '%20d',
        'i8': '%20d',
        'u4': '%10d',
        'i4': '%11d',
        'u2': '%5d',
        'i2': '%6d',
        'f4': '%14.7g',
        'f8': '%023.15g',
    }

    def __init__(self):
        '''Create the data reader and default all output settings.'''
        super(WTraceTool, self).__init__()

        self.data_reader = WESTDataReader()
        self.output_file = None
        self.output_pattern = None
        self.endpoints = None
        # List of dicts produced by parse_dataset_string(), one per -d option.
        self.datasets = []

    # Interface for command-line tools
    def add_args(self, parser):
        '''Register dataset, endpoint and output options on ``parser``.'''
        self.data_reader.add_args(parser)

        parser.add_argument(
            '-d',
            '--dataset',
            dest='datasets',
            # A metavar containing brackets breaks argparse
            # (see http://bugs.python.org/issue11874), so the extended syntax
            # is described in the help text instead.
            metavar='DSNAME',
            action='append',
            help='''Include the dataset named DSNAME in trace output. An extended form like
                            DSNAME[,alias=ALIAS][,index=INDEX][,file=FILE][,slice=SLICE] will
                            obtain the dataset from the given FILE instead of the main WEST HDF5 file,
                            slice it by SLICE, call it ALIAS in output, and/or access per-segment data by a n_iter,seg_id
                            INDEX instead of a seg_id indexed dataset in the group for n_iter.''',
        )
        parser.add_argument(
            'endpoints',
            metavar='N_ITER:SEG_ID',
            nargs='+',
            help='''Trace trajectory ending (or at least alive at) N_ITER:SEG_ID.''',
        )

        ogroup = parser.add_argument_group('output options')
        ogroup.add_argument(
            '--output-pattern',
            default='traj_%d_%d',
            help='''Write per-trajectory data to output files/HDF5 groups whose names begin
                            with OUTPUT_PATTERN, which must contain two printf-style format flags which
                            will be replaced with the iteration number and segment ID of the terminal segment
                            of the trajectory being traced.
                            (Default: %(default)s.)''',
        )
        ogroup.add_argument(
            '-o',
            '--output',
            default='trajs.h5',
            help='Store intermediate data and analysis results to OUTPUT (default: %(default)s).',
        )

    def process_args(self, args):
        '''Parse endpoints and dataset specifications and open the output HDF5 file.'''
        self.data_reader.process_args(args)
        self.endpoints = [list(map(int, endpoint.split(':'))) for endpoint in args.endpoints]
        self.output_pattern = args.output_pattern

        for dsstr in args.datasets or []:
            self.datasets.append(self.parse_dataset_string(dsstr))

        # Explicit append mode: go() both creates new groups and reuses groups
        # from earlier runs. Without a mode, h5py >= 3 opens read-only.
        self.output_file = h5py.File(args.output, 'a')

    def parse_dataset_string(self, dsstr):
        '''Parse ``DSNAME[,alias=..][,index=..][,file=..][,slice=..]`` into a dict.

        The result always has ``'dsname'`` and may have ``'alias'``, ``'file'``,
        ``'index'`` (strings) and ``'slice'`` (an index-expression tuple).

        Raises ValueError for an unknown option or a field without ``=``, and
        SyntaxError for a slice that is not a valid index expression.'''
        dsinfo = {}

        # Split on commas, except those inside a [...] slice expression.
        r = re.compile(r',(?=[^\]]*(?:\[|$))')
        fields = r.split(dsstr)

        dsinfo['dsname'] = fields[0]

        for field in (field.strip() for field in fields[1:]):
            # Split on the first '=' only, so values may themselves contain '='.
            k, v = field.split('=', 1)
            k = k.lower()
            if k in ('alias', 'file', 'index'):
                dsinfo[k] = v
            elif k == 'slice':
                try:
                    # SECURITY NOTE: eval() runs arbitrary Python taken from the
                    # command line; only acceptable because the invoking user
                    # already controls this process. Never feed it untrusted text.
                    dsinfo['slice'] = eval('np.index_exp' + v)
                except SyntaxError:
                    raise SyntaxError('invalid index expression {!r}'.format(v))
            else:
                raise ValueError('invalid dataset option {!r}'.format(k))

        return dsinfo

    def go(self):
        '''Trace every requested endpoint, writing ``<name>_trace.txt`` and an HDF5
        group per trajectory, plus any datasets requested with -d/--dataset.'''
        self.data_reader.open('r')

        # Create a new 'trajectories' group if this is the first trace
        try:
            trajs_group = h5io.create_hdf5_group(self.output_file, 'trajectories', replace=False, creating_program=self.prog)
        except ValueError:
            trajs_group = self.output_file['trajectories']

        # Auxiliary HDF5 files are cached across endpoints and closed at the end.
        aux_h5files = {}
        try:
            for n_iter, seg_id in self.endpoints:
                trajname = self.output_pattern % (n_iter, seg_id)
                # Reuse the group if this endpoint was traced before; the writes
                # below delete and replace any existing datasets.
                trajgroup = trajs_group.require_group(trajname)

                trace = Trace.from_data_manager(n_iter, seg_id, self.data_reader.data_manager)

                with open(trajname + '_trace.txt', 'wt') as trace_output:
                    self.emit_trace_text(trace, trace_output)

                self.emit_trace_h5(trace, trajgroup)

                for dsinfo in self.datasets:
                    dsname = dsinfo['dsname']
                    filename = dsinfo.get('file')
                    if filename:
                        try:
                            aux_h5file = aux_h5files[filename]
                        except KeyError:
                            aux_h5file = aux_h5files[filename] = h5py.File(filename, 'r')
                    else:
                        aux_h5file = None

                    slice_ = dsinfo.get('slice')
                    alias = dsinfo.get('alias', dsname)
                    index = dsinfo.get('index')

                    data, weights = trace.trace_timepoint_dataset(
                        dsname, auxfile=aux_h5file, slice_=slice_, index_ds=index
                    )

                    # Save data to HDF5
                    try:
                        del trajgroup[alias]
                    except KeyError:
                        pass
                    trajgroup[alias] = data

                    # All weight vectors will be the same length, so only store in HDF5 once
                    if not ('weights' in trajgroup and trajgroup['weights'].shape == weights.shape):
                        try:
                            del trajgroup['weights']
                        except KeyError:
                            pass
                        trajgroup['weights'] = weights
        finally:
            for aux_h5file in aux_h5files.values():
                aux_h5file.close()

    def emit_trace_h5(self, trace, output_group):
        '''Write the trace's basis state (if any), initial state and segment summary
        into ``output_group``, replacing datasets of the same name.'''
        for dsname in ('basis_state', 'initial_state', 'segments'):
            try:
                del output_group[dsname]
            except KeyError:
                pass

        if trace.basis_state:
            output_group['basis_state'] = trace.basis_state.as_numpy_record()
        output_group['initial_state'] = trace.initial_state.as_numpy_record()
        output_group['segments'] = trace.summary

    def emit_trace_text(self, trace, output_file):
        '''Dump summary information about each segment in the given trace
        to the given output_file, which must be opened for writing in text
        mode. Output columns are separated by at least one space.'''

        # Nothing is written for an empty trace.
        if not trace:
            return

        pcoord_ndim = trace[0]['final_pcoord'].shape[0]
        lastseg = trace[-1]
        # Column widths: at least 6, widened to fit the largest value printed.
        len_n_iter = max(6, len(str(lastseg['n_iter'])))
        len_seg_id = max(6, max(len(str(seg_id)) for seg_id in trace['seg_id']))
        seg_pattern = (
            ' '.join(
                [
                    '{n_iter:{len_n_iter}d}',
                    '{seg_id:{len_seg_id}d}',
                    '{weight:22.17e}',
                    '{walltime:10.6g}',
                    '{cputime:10.6g}',
                    '{pcoord_str:s}',
                ]
            )
            + '\n'
        )

        output_file.write(
            '''\
# Trace of trajectory ending in n_iter:seg_id {n_iter:d}:{seg_id:d} (endpoint type {endpoint_type_text:s})
# column 0: iteration (0 => initial state)
# column 1: seg_id (or initial state ID)
# column 2: weight
# column 3: wallclock time (s)
# column 4: CPU time (s)
'''.format(
                n_iter=int(lastseg['n_iter']),
                seg_id=int(lastseg['seg_id']),
                endpoint_type_text=Segment.endpoint_type_names[trace.endpoint_type],
            )
        )

        if pcoord_ndim == 1:
            output_file.write(
                '''\
# column 5: final progress coordinate value
'''
            )
        else:
            fpcbegin = 5
            fpcend = fpcbegin + pcoord_ndim - 1
            output_file.write(
                '''\
# columns {fpcbegin:d} -- {fpcend:d}: final progress coordinate value
'''.format(
                    fpcbegin=fpcbegin, fpcend=fpcend
                )
            )

        pcoord_formats = self.pcoord_formats

        # Output row for initial state (reported as iteration 0, zero weight/time)
        initial_state = trace.initial_state
        pcoord_str = ' '.join(
            pcoord_formats.get(pcfield.dtype.str[1:], '%s') % pcfield for pcfield in initial_state.pcoord
        )
        output_file.write(
            seg_pattern.format(
                n_iter=0,
                seg_id=initial_state.state_id,
                weight=0.0,
                walltime=0,
                cputime=0,
                pcoord_str=pcoord_str,
                len_n_iter=len_n_iter,
                len_seg_id=len_seg_id,
            )
        )

        # Output rows for segments
        for segment in trace:
            pcoord_str = ' '.join(
                pcoord_formats.get(pcfield.dtype.str[1:], '%s') % pcfield for pcfield in segment['final_pcoord']
            )
            output_file.write(
                seg_pattern.format(
                    n_iter=int(segment['n_iter']),
                    seg_id=int(segment['seg_id']),
                    weight=float(segment['weight']),
                    walltime=float(segment['walltime']),
                    cputime=float(segment['cputime']),
                    pcoord_str=pcoord_str,
                    len_n_iter=len_n_iter,
                    len_seg_id=len_seg_id,
                )
            )
class WPDist(WESTParallelTool): prog = 'w_pdist' description = '''\ Calculate time-resolved, multi-dimensional probability distributions of WE datasets. ----------------------------------------------------------------------------- Source data ----------------------------------------------------------------------------- Source data is provided either by a user-specified function (--construct-dataset) or a list of "data set specifications" (--dsspecs). If neither is provided, the progress coordinate dataset ''pcoord'' is used. To use a custom function to extract or calculate data whose probability distribution will be calculated, specify the function in standard Python MODULE.FUNCTION syntax as the argument to --construct-dataset. This function will be called as function(n_iter,iter_group), where n_iter is the iteration whose data are being considered and iter_group is the corresponding group in the main WEST HDF5 file (west.h5). The function must return data which can be indexed as [segment][timepoint][dimension]. To use a list of data set specifications, specify --dsspecs and then list the desired datasets one-by-one (space-separated in most shells). These data set specifications are formatted as NAME[,file=FILENAME,slice=SLICE], which will use the dataset called NAME in the HDF5 file FILENAME (defaulting to the main WEST HDF5 file west.h5), and slice it with the Python slice expression SLICE (as in [0:2] to select the first two elements of the first axis of the dataset). The ``slice`` option is most useful for selecting one column (or more) from a multi-column dataset, such as arises when using a progress coordinate of multiple dimensions. ----------------------------------------------------------------------------- Histogram binning ----------------------------------------------------------------------------- By default, histograms are constructed with 100 bins in each dimension. 
This can be overridden by specifying -b/--bins, which accepts a number of different kinds of arguments: a single integer N N uniformly spaced bins will be used in each dimension. a sequence of integers N1,N2,... (comma-separated) N1 uniformly spaced bins will be used for the first dimension, N2 for the second, and so on. a list of lists [[B11, B12, B13, ...], [B21, B22, B23, ...], ...] The bin boundaries B11, B12, B13, ... will be used for the first dimension, B21, B22, B23, ... for the second dimension, and so on. These bin boundaries need not be uniformly spaced. These expressions will be evaluated with Python's ``eval`` construct, with ``np`` available for use [e.g. to specify bins using np.arange()]. The first two forms (integer, list of integers) will trigger a scan of all data in each dimension in order to determine the minimum and maximum values, which may be very expensive for large datasets. This can be avoided by explicitly providing bin boundaries using the list-of-lists form. Note that these bins are *NOT* at all related to the bins used to drive WE sampling. ----------------------------------------------------------------------------- Output format ----------------------------------------------------------------------------- The output file produced (specified by -o/--output, defaulting to "pdist.h5") may be fed to plothist to generate plots (or appropriately processed text or HDF5 files) from this data. In short, the following datasets are created: ``histograms`` Normalized histograms. The first axis corresponds to iteration, and remaining axes correspond to dimensions of the input dataset. ``/binbounds_0`` Vector of bin boundaries for the first (index 0) dimension. Additional datasets similarly named (/binbounds_1, /binbounds_2, ...) are created for additional dimensions. ``/midpoints_0`` Vector of bin midpoints for the first (index 0) dimension. Additional datasets similarly named are created for additional dimensions. 
``n_iter`` Vector of iteration numbers corresponding to the stored histograms (i.e. the first axis of the ``histograms`` dataset). ----------------------------------------------------------------------------- Subsequent processing ----------------------------------------------------------------------------- The output generated by this program (-o/--output, default "pdist.h5") may be plotted by the ``plothist`` program. See ``plothist --help`` for more information. ----------------------------------------------------------------------------- Parallelization ----------------------------------------------------------------------------- This tool supports parallelized binning, including reading of input data. Parallel processing is the default. For simple cases (reading pre-computed input data, modest numbers of segments), serial processing (--serial) may be more efficient. ----------------------------------------------------------------------------- Command-line options ----------------------------------------------------------------------------- ''' def __init__(self): super().__init__() # Parallel processing by default (this is not actually necessary, but it is # informative!) 
self.wm_env.default_work_manager = self.wm_env.default_parallel_work_manager # These are used throughout self.progress = ProgressIndicatorComponent() self.data_reader = WESTDataReader() self.input_dssynth = WESTDSSynthesizer(default_dsname='pcoord') self.input_wdssynth = WESTWDSSynthesizer(default_dsname='seg_index') self.iter_range = IterRangeSelection(self.data_reader) self.iter_range.include_args['iter_step'] = False self.binspec = None self.output_filename = None self.output_file = None self.dsspec = None self.wt_dsspec = None # dsspec for weights # These are used during histogram generation only self.iter_start = None self.iter_stop = None self.ndim = None self.ntimepoints = None self.dset_dtype = None self.binbounds = None # bin boundaries for each dimension self.midpoints = None # bin midpoints for each dimension self.data_range = None # data range for each dimension, as the pairs (min,max) self.ignore_out_of_range = False self.compress_output = False def add_args(self, parser): self.data_reader.add_args(parser) self.iter_range.add_args(parser) parser.add_argument( '-b', '--bins', dest='bins', metavar='BINEXPR', default='100', help='''Use BINEXPR for bins. This may be an integer, which will be used for each dimension of the progress coordinate; a list of integers (formatted as [n1,n2,...]) which will use n1 bins for the first dimension, n2 for the second dimension, and so on; or a list of lists of boundaries (formatted as [[a1, a2, ...], [b1, b2, ...], ... ]), which will use [a1, a2, ...] as bin boundaries for the first dimension, [b1, b2, ...] as bin boundaries for the second dimension, and so on. (Default: 100 bins in each dimension.)''', ) parser.add_argument( '-o', '--output', dest='output', default='pdist.h5', help='''Store results in OUTPUT (default: %(default)s).''' ) parser.add_argument( '-C', '--compress', action='store_true', help='''Compress histograms. 
May make storage of higher-dimensional histograms more tractable, at the (possible extreme) expense of increased analysis time. (Default: no compression.)''', ) parser.add_argument( '--loose', dest='ignore_out_of_range', action='store_true', help='''Ignore values that do not fall within bins. (Risky, as this can make buggy bin boundaries appear as reasonable data. Only use if you are sure of your bin boundary specification.)''', ) igroup = parser.add_argument_group('input dataset options').add_mutually_exclusive_group(required=False) igroup.add_argument( '--construct-dataset', help='''Use the given function (as in module.function) to extract source data. This function will be called once per iteration as function(n_iter, iter_group) to construct data for one iteration. Data returned must be indexable as [seg_id][timepoint][dimension]''', ) igroup.add_argument( '--dsspecs', nargs='+', metavar='DSSPEC', help='''Construct probability distribution from one or more DSSPECs.''' ) wgroup = parser.add_argument_group('input weight dataset options').add_mutually_exclusive_group(required=False) wgroup.add_argument( '--construct-wdataset', help='''Use the given function (as in module.function) to extract weight data. This function will be called once per iteration as function(n_iter, iter_group) to construct data for one iteration. 
Data returned must be indexable as [seg_id]''', ) self.progress.add_args(parser) def process_args(self, args): self.progress.process_args(args) self.data_reader.process_args(args) self.input_dssynth.h5filename = self.data_reader.we_h5filename self.input_dssynth.process_args(args) self.dsspec = self.input_dssynth.dsspec # Carrying an open HDF5 file across a fork() seems to corrupt the entire HDF5 library # Open the WEST HDF5 file just long enough to process our iteration range, then close # and reopen in go() [which executes after the fork] with self.data_reader: self.iter_range.process_args(args) # Reading potential custom weights self.input_wdssynth.h5filename = self.data_reader.we_h5filename self.input_wdssynth.process_args(args) self.wt_dsspec = self.input_wdssynth.dsspec self.binspec = args.bins self.output_filename = args.output self.ignore_out_of_range = bool(args.ignore_out_of_range) self.compress_output = args.compress or False def go(self): self.data_reader.open('r') pi = self.progress.indicator pi.operation = 'Initializing' with pi: self.output_file = h5py.File(self.output_filename, 'w') h5io.stamp_creator_data(self.output_file) self.iter_start = self.iter_range.iter_start self.iter_stop = self.iter_range.iter_stop # Construct bin boundaries self.construct_bins(self.parse_binspec(self.binspec)) for idim, (binbounds, midpoints) in enumerate(zip(self.binbounds, self.midpoints)): self.output_file['binbounds_{}'.format(idim)] = binbounds self.output_file['midpoints_{}'.format(idim)] = midpoints # construct histogram self.construct_histogram() # Record iteration range iter_range = self.iter_range.iter_range() self.output_file['n_iter'] = iter_range self.iter_range.record_data_iter_range(self.output_file['histograms']) self.output_file.close() @staticmethod def parse_binspec(binspec): namespace = {'numpy': np, 'np': np, 'inf': float('inf')} try: binspec_compiled = eval(binspec, namespace) except Exception as e: raise ValueError('invalid bin specification: 
{!r}'.format(e)) else: if log.isEnabledFor(logging.DEBUG): log.debug('bin specs: {!r}'.format(binspec_compiled)) return binspec_compiled def construct_bins(self, bins): ''' Construct bins according to ``bins``, which may be: 1) A scalar integer (for that number of bins in each dimension) 2) A sequence of integers (specifying number of bins for each dimension) 3) A sequence of sequences of bin boundaries (specifying boundaries for each dimension) Sets ``self.binbounds`` to a list of arrays of bin boundaries appropriate for passing to fasthist.histnd, along with ``self.midpoints`` to the midpoints of the bins. ''' if not isiterable(bins): self._construct_bins_from_scalar(bins) elif not isiterable(bins[0]): self._construct_bins_from_int_seq(bins) else: self._construct_bins_from_bound_seqs(bins) if log.isEnabledFor(logging.DEBUG): log.debug('binbounds: {!r}'.format(self.binbounds)) def scan_data_shape(self): if self.ndim is None: dset = self.dsspec.get_iter_data(self.iter_start) self.ntimepoints = dset.shape[1] self.ndim = dset.shape[2] self.dset_dtype = dset.dtype def scan_data_range(self): '''Scan input data for range in each dimension. 
The number of dimensions is determined from the shape of the progress coordinate as of self.iter_start.''' self.progress.indicator.new_operation('Scanning for data range', self.iter_stop - self.iter_start) self.scan_data_shape() dset_dtype = self.dset_dtype ndim = self.ndim dsspec = self.dsspec try: minval = np.finfo(dset_dtype).min maxval = np.finfo(dset_dtype).max except ValueError: minval = np.iinfo(dset_dtype).min maxval = np.iinfo(dset_dtype).max data_range = self.data_range = [(maxval, minval) for _i in range(self.ndim)] # futures = [] # for n_iter in xrange(self.iter_start, self.iter_stop): # _remote_min_max(ndim, dset_dtype, n_iter, dsspec) # futures.append(self.work_manager.submit(_remote_min_max, args=(ndim, dset_dtype, n_iter, dsspec))) # for future in self.work_manager.as_completed(futures): for future in self.work_manager.submit_as_completed( ((_remote_min_max, (ndim, dset_dtype, n_iter, dsspec), {}) for n_iter in range(self.iter_start, self.iter_stop)), self.max_queue_len, ): bounds = future.get_result(discard=True) for idim in range(ndim): current_min, current_max = data_range[idim] current_min = min(current_min, bounds[idim][0]) current_max = max(current_max, bounds[idim][1]) data_range[idim] = (current_min, current_max) self.progress.indicator.progress += 1 def _construct_bins_from_scalar(self, bins): if self.data_range is None: self.scan_data_range() self.binbounds = [] self.midpoints = [] for idim in range(self.ndim): lb, ub = self.data_range[idim] # Advance just beyond the upper bound of the range, so that we catch # the maximum in the histogram ub *= 1.01 boundset = np.linspace(lb, ub, bins + 1) midpoints = (boundset[:-1] + boundset[1:]) / 2.0 self.binbounds.append(boundset) self.midpoints.append(midpoints) def _construct_bins_from_int_seq(self, bins): if self.data_range is None: self.scan_data_range() self.binbounds = [] self.midpoints = [] for idim in range(self.ndim): lb, ub = self.data_range[idim] # Advance just beyond the upper bound of 
the range, so that we catch # the maximum in the histogram ub *= 1.01 boundset = np.linspace(lb, ub, bins[idim] + 1) midpoints = (boundset[:-1] + boundset[1:]) / 2.0 self.binbounds.append(boundset) self.midpoints.append(midpoints) def _construct_bins_from_bound_seqs(self, bins): self.binbounds = [] self.midpoints = [] for boundset in bins: boundset = np.asarray(boundset) if (np.diff(boundset) <= 0).any(): raise ValueError('boundary set {!r} is not strictly monotonically increasing'.format(boundset)) self.binbounds.append(boundset) self.midpoints.append((boundset[:-1] + boundset[1:]) / 2.0) def construct_histogram(self): '''Construct a histogram using bins previously constructed with ``construct_bins()``. The time series of histogram values is stored in ``histograms``. Each histogram in the time series is normalized.''' self.scan_data_shape() iter_count = self.iter_stop - self.iter_start histograms_ds = self.output_file.create_dataset( 'histograms', dtype=np.float64, shape=((iter_count,) + tuple(len(bounds) - 1 for bounds in self.binbounds)), compression=9 if self.compress_output else None, ) binbounds = [np.require(boundset, self.dset_dtype, 'C') for boundset in self.binbounds] self.progress.indicator.new_operation('Constructing histograms', self.iter_stop - self.iter_start) task_gen = ( ( _remote_bin_iter, (iiter, n_iter, self.dsspec, self.wt_dsspec, 1 if iiter > 0 else 0, binbounds, self.ignore_out_of_range), {}, ) for (iiter, n_iter) in enumerate(range(self.iter_start, self.iter_stop)) ) # futures = set() # for iiter, n_iter in enumerate(xrange(self.iter_start, self.iter_stop)): # initpoint = 1 if iiter > 0 else 0 # futures.add(self.work_manager.submit(_remote_bin_iter, # args=(iiter, n_iter, self.dsspec, self.wt_dsspec, initpoint, binbounds))) # for future in self.work_manager.as_completed(futures): # future = self.work_manager.wait_any(futures) # for future in self.work_manager.submit_as_completed(task_gen, self.queue_size): log.debug('max queue length: 
{!r}'.format(self.max_queue_len)) for future in self.work_manager.submit_as_completed(task_gen, self.max_queue_len): iiter, n_iter, iter_hist = future.get_result(discard=True) self.progress.indicator.progress += 1 # store histogram histograms_ds[iiter] = iter_hist del iter_hist, future
class WDumpSegs(WESTTool):
    '''Command-line tool that writes the segments of one iteration as text.'''

    prog = 'w_dumpsegs'
    description = '''\
Dump segment data as text. This is very inefficient, so this tool should be used
as a last resort (use hdfview/h5ls to look at data, and access HDF5 directly for
significant analysis tasks).
'''

    def __init__(self):
        super().__init__()
        self.data_reader = WESTDataReader()
        self.n_iter = None
        self.output_file = None
        self.print_pcoords = False

    def add_args(self, parser):
        '''Register the command-line options of this tool on ``parser``.'''
        self.data_reader.add_args(parser)

        parser.add_argument(
            '-p',
            '--print-pcoords',
            dest='print_pcoords',
            action='store_true',
            help='print initial and final progress coordinates for each segment',
        )
        parser.add_argument(
            '-i', '--iteration', dest='n_iter', type=int, help='Use data from iteration N_ITER (default: last complete iteration)'
        )
        parser.add_argument(
            '-o', '--output', dest='output_file', help='Store output in OUTPUT_FILE (default: write to standard output).'
        )

    def process_args(self, args):
        '''Open the data file and resolve the iteration and output destination.'''
        self.data_reader.process_args(args)
        self.data_reader.open()
        # Default to the iteration before the current one
        self.n_iter = args.n_iter or self.data_reader.current_iteration - 1
        self.output_file = open(args.output_file, 'wt') if args.output_file else sys.stdout
        self.print_pcoords = args.print_pcoords

    def go(self):
        '''Write one report line per segment (plus pcoords if requested).

        The output file is closed afterwards, or flushed when it is standard
        output, even if writing fails part-way.'''
        try:
            self._write_report()
        finally:
            if self.output_file is sys.stdout:
                self.output_file.flush()
            else:
                self.output_file.close()

    def _write_report(self):
        '''Format the segments of ``self.n_iter`` and write them to ``self.output_file``.'''
        segments = list(self.data_reader.get_segments(self.n_iter))
        if not segments:
            # The column widths below take max() over the segments, which
            # raises on an empty sequence; nothing to report anyway.
            return

        # Column widths, so every field lines up across all report lines
        max_seg_id_len = len(str(max(segment.seg_id for segment in segments)))
        max_status_name_len = max(map(len, Segment.status_names.values()))
        max_endpoint_type_len = max(map(len, Segment.endpoint_type_names.values()))
        max_n_parents_len = len(str(max(len(segment.wtg_parent_ids) for segment in segments)))

        report_line = (
            '{segment.n_iter:d} {segment.seg_id:{max_seg_id_len}d} {segment.weight:20.14g}'
            + ' {status_name:{max_status_name_len}s} ({segment.status})'
            + ' {segment.walltime:<12.6g} {segment.cputime:<12.6g}'
            + ' {endpoint_type_name:{max_endpoint_type_len}s} ({segment.endpoint_type})'
            + ' {n_parents:{max_n_parents_len}d} {segment.parent_id:{max_seg_id_len}d} {parents_str}'
            + '\n'
        )
        pcoord_lines = ' pcoord[0] = {init_pcoord}\n pcoord[-1] = {final_pcoord}' + '\n'

        for segment in segments:
            parents_str = '[' + ', '.join(map(str, sorted(segment.wtg_parent_ids))) + ']'
            init_pcoord_str = '[' + ', '.join('{pcval:<12.6g}'.format(pcval=float(pce)) for pce in segment.pcoord[0]) + ']'
            final_pcoord_str = '[' + ', '.join('{pcval:<12.6g}'.format(pcval=float(pce)) for pce in segment.pcoord[-1]) + ']'
            self.output_file.write(
                report_line.format(
                    segment=segment,
                    status_name=segment.status_names[segment.status],
                    endpoint_type_name=segment.endpoint_type_names[segment.endpoint_type],
                    parents_str=parents_str,
                    n_parents=len(segment.wtg_parent_ids),
                    max_seg_id_len=max_seg_id_len,
                    max_status_name_len=max_status_name_len,
                    max_endpoint_type_len=max_endpoint_type_len,
                    max_n_parents_len=max_n_parents_len,
                )
            )
            if self.print_pcoords:
                self.output_file.write(pcoord_lines.format(init_pcoord=init_pcoord_str, final_pcoord=final_pcoord_str))
class WNTopTool(WESTTool):
    '''Command-line tool that selects COUNT walkers per bin for each iteration.'''

    prog = 'w_ntop'
    description = '''\
Select walkers from bins . An assignment file mapping walkers to
bins at each timepoint is required (see``w_assign --help`` for further
information on generating this file). By default, high-weight walkers are
selected (hence the name ``w_ntop``: select the N top-weighted walkers from
each bin); however, minimum weight walkers and randomly-selected walkers
may be selected instead.


-----------------------------------------------------------------------------
Output format
-----------------------------------------------------------------------------

The output file (-o/--output, by default "ntop.h5") contains the following
datasets:

  ``/n_iter`` [iteration]
    *(Integer)* Iteration numbers for each entry in other datasets.

  ``/nsegs`` [iteration][bin]
    *(Integer)* Number of segments in each bin/state in the given iteration.
    This will generally be the same as the number requested with
    ``-n/--count`` but may be smaller if the requested number of walkers
    does not exist.

  ``/seg_ids`` [iteration][bin][segment]
    *(Integer)* Matching segments in each iteration for each bin.
    For a given iteration and bin, only the first ``nsegs[iteration][bin]``
    entries are valid. For example, the full list of matching seg_ids in
    bin 0 in the first stored iteration is ``seg_ids[0][0][:nsegs[0][0]]``.

  ``/weights`` [iteration][bin][segment]
    *(Floating-point)* Weights for each matching segment in ``/seg_ids``.


-----------------------------------------------------------------------------
Command-line arguments
-----------------------------------------------------------------------------
'''

    def __init__(self):
        super(WNTopTool, self).__init__()

        self.data_reader = WESTDataReader()
        self.iter_range = IterRangeSelection()
        self.progress = ProgressIndicatorComponent()
        self.output_file = None
        self.assignments_filename = None
        self.output_filename = None
        self.what = None
        self.timepoint = None
        self.count = None

    def add_args(self, parser):
        '''Register the command-line options of this tool on ``parser``.'''
        self.data_reader.add_args(parser)
        self.iter_range.add_args(parser)

        igroup = parser.add_argument_group('input options')
        igroup.add_argument(
            '-a',
            '--assignments',
            default='assign.h5',
            help='''Use assignments from the given ASSIGNMENTS file (default: %(default)s).''',
        )

        sgroup = parser.add_argument_group('selection options')
        sgroup.add_argument(
            '-n',
            '--count',
            type=int,
            default=1,
            help='''Select COUNT walkers from each iteration for each bin (default: %(default)s).''',
        )
        sgroup.add_argument(
            '-t',
            '--timepoint',
            type=int,
            default=-1,
            help='''Base selection on the given TIMEPOINT within each iteration. Default (-1)
            corresponds to the last timepoint.''',
        )
        cgroup = parser.add_mutually_exclusive_group()
        cgroup.add_argument(
            '--highweight',
            dest='select_what',
            action='store_const',
            const='highweight',
            help='''Select COUNT highest-weight walkers from each bin.''',
        )
        cgroup.add_argument(
            '--lowweight',
            dest='select_what',
            action='store_const',
            const='lowweight',
            help='''Select COUNT lowest-weight walkers from each bin.''',
        )
        cgroup.add_argument(
            '--random',
            dest='select_what',
            action='store_const',
            const='random',
            help='''Select COUNT walkers randomly from each bin.''',
        )
        parser.set_defaults(select_what='highweight')

        ogroup = parser.add_argument_group('output options')
        ogroup.add_argument('-o', '--output', default='ntop.h5', help='''Write output to OUTPUT (default: %(default)s).''')
        self.progress.add_args(parser)

    def process_args(self, args):
        '''Copy parsed command-line values from ``args`` onto this tool.'''
        self.progress.process_args(args)
        self.data_reader.process_args(args)
        with self.data_reader:
            self.iter_range.process_args(args)
        self.what = args.select_what
        self.output_filename = args.output
        self.assignments_filename = args.assignments
        self.count = args.count
        self.timepoint = args.timepoint

    def go(self):
        '''Select up to ``self.count`` walkers per bin and iteration and store them.

        Writes the ``n_iter``, ``nsegs``, ``seg_ids`` and ``weights`` datasets
        to the output file. Both HDF5 files opened here are closed on exit,
        including when an error interrupts the run.'''
        self.data_reader.open('r')
        pi = self.progress.indicator
        count = self.count
        timepoint = self.timepoint
        what = self.what

        with h5py.File(self.assignments_filename, mode='r') as assignments_file, h5io.WESTPAH5File(
            self.output_filename, mode='w'
        ) as output_file:
            nbins = assignments_file.attrs['nbins'] + 1
            assignments_ds = assignments_file['assignments']

            iter_start, iter_stop = self.iter_range.iter_start, self.iter_range.iter_stop
            iter_count = iter_stop - iter_start
            h5io.check_iter_range_least(assignments_ds, iter_start, iter_stop)
            nsegs = assignments_file['nsegs'][h5io.get_iteration_slice(assignments_file['nsegs'], iter_start, iter_stop)]

            output_file.create_dataset('n_iter', dtype=n_iter_dtype, data=list(range(iter_start, iter_stop)))

            seg_count_ds = output_file.create_dataset('nsegs', dtype=np.uint, shape=(iter_count, nbins))
            matching_segs_ds = output_file.create_dataset(
                'seg_ids',
                shape=(iter_count, nbins, count),
                dtype=seg_id_dtype,
                chunks=h5io.calc_chunksize((iter_count, nbins, count), seg_id_dtype),
                shuffle=True,
                compression=9,
            )
            weights_ds = output_file.create_dataset(
                'weights',
                shape=(iter_count, nbins, count),
                dtype=weight_dtype,
                chunks=h5io.calc_chunksize((iter_count, nbins, count), weight_dtype),
                shuffle=True,
                compression=9,
            )

            with pi:
                pi.new_operation('Finding matching segments', extent=iter_count)
                for iiter, n_iter in enumerate(range(iter_start, iter_stop)):
                    assignments = np.require(
                        assignments_ds[h5io.get_iteration_entry(assignments_ds, n_iter) + np.index_exp[:, timepoint]],
                        dtype=westpa.binning.index_dtype,
                    )
                    all_weights = self.data_reader.get_iter_group(n_iter)['seg_index']['weight']

                    # the following Cython function just executes this loop:
                    # for iseg in xrange(nsegs[iiter]):
                    #    segs_by_bin[iseg,assignments[iseg]] = True
                    segs_by_bin = assignments_list_to_table(nsegs[iiter], nbins, assignments)
                    for ibin in range(nbins):
                        segs = np.nonzero(segs_by_bin[:, ibin])[0]

                        seg_count_ds[iiter, ibin] = min(len(segs), count)

                        if len(segs):
                            weights = all_weights.take(segs)

                            if what == 'lowweight':
                                indices = np.argsort(weights)[:count]
                            elif what == 'highweight':
                                indices = np.argsort(weights)[::-1][:count]
                            else:
                                assert what == 'random'
                                # Truncate to COUNT like the other modes; the output
                                # datasets hold only ``count`` entries per bin.
                                indices = np.random.permutation(len(weights))[:count]

                            # Size the destination by what was actually selected.
                            n_selected = len(indices)
                            matching_segs_ds[iiter, ibin, :n_selected] = segs.take(indices)
                            weights_ds[iiter, ibin, :n_selected] = weights.take(indices)
                            del segs, weights

                    del assignments, segs_by_bin, all_weights
                    pi.progress += 1
class WAssign(WESTParallelTool):
    '''Command-line tool that assigns walkers to bins and, optionally, macrostates.'''

    prog = 'w_assign'
    description = '''\
Assign walkers to bins, producing a file (by default named "assign.h5")
which can be used in subsequent analysis.

For consistency in subsequent analysis operations, the entire dataset
must be assigned, even if only a subset of the data will be used. This
ensures that analyses that rely on tracing trajectories always know the
originating bin of each trajectory.


-----------------------------------------------------------------------------
Source data
-----------------------------------------------------------------------------

Source data is provided either by a user-specified function
(--construct-dataset) or a list of "data set specifications" (--dsspecs).
If neither is provided, the progress coordinate dataset ''pcoord'' is used.

To use a custom function to extract or calculate data whose probability
distribution will be calculated, specify the function in standard Python
MODULE.FUNCTION syntax as the argument to --construct-dataset. This function
will be called as function(n_iter,iter_group), where n_iter is the iteration
whose data are being considered and iter_group is the corresponding group
in the main WEST HDF5 file (west.h5). The function must return data which can
be indexed as [segment][timepoint][dimension].

To use a list of data set specifications, specify --dsspecs and then list the
desired datasets one-by-one (space-separated in most shells). These data set
specifications are formatted as NAME[,file=FILENAME,slice=SLICE], which will
use the dataset called NAME in the HDF5 file FILENAME (defaulting to the main
WEST HDF5 file west.h5), and slice it with the Python slice expression SLICE
(as in [0:2] to select the first two elements of the first axis of the
dataset). The ``slice`` option is most useful for selecting one column (or
more) from a multi-column dataset, such as arises when using a progress
coordinate of multiple dimensions.


-----------------------------------------------------------------------------
Specifying macrostates
-----------------------------------------------------------------------------

Optionally, kinetic macrostates may be defined in terms of sets of bins.
Each trajectory will be labeled with the kinetic macrostate it was most
recently in at each timepoint, for use in subsequent kinetic analysis.
This is required for all kinetics analysis (w_kintrace and w_kinmat).

There are three ways to specify macrostates:

  1. States corresponding to single bins may be identified on the command
     line using the --states option, which takes multiple arguments, one for
     each state (separated by spaces in most shells). Each state is specified
     as a coordinate tuple, with an optional label prepended, as in
     ``bound:1.0`` or ``unbound:(2.5,2.5)``. Unlabeled states are named
     ``stateN``, where N is the (zero-based) position in the list of states
     supplied to --states.

  2. States corresponding to multiple bins may use a YAML input file specified
     with --states-from-file. This file defines a list of states, each with a
     name and a list of coordinate tuples; bins containing these coordinates
     will be mapped to the containing state. For instance, the following
     file::

        ---
        states:
          - label: unbound
            coords:
              - [9.0, 1.0]
              - [9.0, 2.0]
          - label: bound
            coords:
              - [0.1, 0.0]

     produces two macrostates: the first state is called "unbound" and
     consists of bins containing the (2-dimensional) progress coordinate
     values (9.0, 1.0) and (9.0, 2.0); the second state is called "bound"
     and consists of the single bin containing the point (0.1, 0.0).

  3. Arbitrary state definitions may be supplied by a user-defined function,
     specified as --states-from-function=MODULE.FUNCTION. This function is
     called with the bin mapper as an argument (``function(mapper)``) and must
     return a list of dictionaries, one per state. Each dictionary must contain
     a vector of coordinate tuples with key "coords"; the bins into which each
     of these tuples falls define the state. An optional name for the state
     (with key "label") may also be provided.


-----------------------------------------------------------------------------
Output format
-----------------------------------------------------------------------------

The output file (-o/--output, by default "assign.h5") contains the following
attributes datasets:

  ``nbins`` attribute
    *(Integer)* Number of valid bins. Bin assignments range from 0 to
    *nbins*-1, inclusive.

  ``nstates`` attribute
    *(Integer)* Number of valid macrostates (may be zero if no such states are
    specified). Trajectory ensemble assignments range from 0 to *nstates*-1,
    inclusive, when states are defined.

  ``/assignments`` [iteration][segment][timepoint]
    *(Integer)* Per-segment and -timepoint assignments (bin indices).

  ``/npts`` [iteration]
    *(Integer)* Number of timepoints in each iteration.

  ``/nsegs`` [iteration]
    *(Integer)* Number of segments in each iteration.

  ``/labeled_populations`` [iterations][state][bin]
    *(Floating-point)* Per-iteration and -timepoint bin populations, labeled
    by most recently visited macrostate. The last state entry (*nstates*)
    corresponds to trajectories initiated outside of a defined macrostate.

  ``/bin_labels`` [bin]
    *(String)* Text labels of bins.

When macrostate assignments are given, the following additional datasets are
present:

  ``/trajlabels`` [iteration][segment][timepoint]
    *(Integer)* Per-segment and -timepoint trajectory labels, indicating the
    macrostate which each trajectory last visited.

  ``/state_labels`` [state]
    *(String)* Labels of states.

  ``/state_map`` [bin]
    *(Integer)* Mapping of bin index to the macrostate containing that bin.
    An entry will contain *nstates* if that bin does not fall into a
    macrostate.

Datasets indexed by state and bin contain one more entry than the number of
valid states or bins. For *N* bins, axes indexed by bin are of size *N+1*, and
entry *N* (0-based indexing) corresponds to a walker outside of the defined bin
space (which will cause most mappers to raise an error). More importantly, for
*M* states (including the case *M=0* where no states are specified), axes
indexed by state are of size *M+1* and entry *M* refers to trajectories
initiated in a region not corresponding to a defined macrostate.

Thus, ``labeled_populations[:,:,:].sum(axis=1)[:,:-1]`` gives overall per-bin
populations, for all defined bins and
``labeled_populations[:,:,:].sum(axis=2)[:,:-1]`` gives overall
per-trajectory-ensemble populations for all defined states.


-----------------------------------------------------------------------------
Parallelization
-----------------------------------------------------------------------------

This tool supports parallelized binning, including reading/calculating input
data.


-----------------------------------------------------------------------------
Command-line options
-----------------------------------------------------------------------------
'''

    def __init__(self):
        super().__init__()

        # Parallel processing by default (this is not actually necessary, but it is
        # informative!)
        self.wm_env.default_work_manager = self.wm_env.default_parallel_work_manager

        self.data_reader = WESTDataReader()
        self.dssynth = WESTDSSynthesizer(default_dsname='pcoord')
        self.binning = BinMappingComponent()
        self.progress = ProgressIndicatorComponent()
        self.output_file = None
        self.output_filename = None
        self.states = []
        self.subsample = False

    def add_args(self, parser):
        '''Register the command-line options of this tool on ``parser``.'''
        self.data_reader.add_args(parser)
        self.binning.add_args(parser)
        self.dssynth.add_args(parser)

        sgroup = parser.add_argument_group('macrostate definitions').add_mutually_exclusive_group()
        sgroup.add_argument(
            '--states',
            nargs='+',
            metavar='STATEDEF',
            help='''Single-bin kinetic macrostate, specified by a coordinate tuple (e.g. '1.0' or '[1.0,1.0]'),
            optionally labeled (e.g. 'bound:[1.0,1.0]'). States corresponding to multiple bins
            must be specified with --states-from-file.''',
        )
        sgroup.add_argument(
            '--states-from-file',
            metavar='STATEFILE',
            help='''Load kinetic macrostates from the YAML file STATEFILE. See description
            above for the appropriate structure.''',
        )
        sgroup.add_argument(
            '--states-from-function',
            metavar='STATEFUNC',
            help='''Load kinetic macrostates from the function STATEFUNC, specified as
            module_name.func_name. This function is called with the bin mapper as an argument,
            and must return a list of dictionaries {'label': state_label, 'coords': 2d_array_like}
            one for each macrostate; the 'coords' entry must contain enough rows to identify all bins
            in the macrostate.''',
        )

        agroup = parser.add_argument_group('other options')
        agroup.add_argument(
            '-o', '--output', dest='output', default='assign.h5', help='''Store results in OUTPUT (default: %(default)s).'''
        )
        agroup.add_argument(
            '--subsample',
            dest='subsample',
            action='store_const',
            const=True,
            help='''Determines whether or not the data should be subsampled.
            This is rather useful for analysing steady state simulations.''',
        )
        agroup.add_argument(
            '--config-from-file',
            dest='config_from_file',
            action='store_true',
            help='''Load bins/macrostates from a scheme specified in west.cfg.''',
        )
        agroup.add_argument('--scheme-name', dest='scheme', help='''Name of scheme specified in west.cfg.''')

    def process_args(self, args):
        '''Resolve bins, macrostates, output file and subsampling from ``args``.

        Raises ``ValueError`` if ``--config-from-file`` is given without a
        scheme, or if exactly one macrostate is defined.'''
        self.progress.process_args(args)
        self.data_reader.process_args(args)
        # Necessary to open the file to get the current iteration
        # if we want to use the mapper in the file
        self.data_reader.open(mode='r+')
        self.n_iter = self.data_reader.current_iteration
        # If we decide to use this option for iteration selection:
        # getattr(args,'bins_from_h5file',None) or self.data_reader.current_iteration

        with self.data_reader:
            self.dssynth.h5filename = self.data_reader.we_h5filename
            self.dssynth.process_args(args)
            if args.config_from_file is False:
                self.binning.set_we_h5file_info(self.n_iter, self.data_reader)
                self.binning.process_args(args)

        self.output_filename = args.output

        if args.config_from_file:
            if not args.scheme:
                raise ValueError('A scheme must be specified.')
            else:
                self.load_config_from_west(args.scheme)
        elif args.states:
            self.parse_cmdline_states(args.states)
        elif args.states_from_file:
            self.load_state_file(args.states_from_file)
        elif args.states_from_function:
            self.load_states_from_function(get_object(args.states_from_function, path=['.']))

        if self.states and len(self.states) < 2:
            raise ValueError('zero, two, or more macrostates are required')

        log.debug('state list: {!r}'.format(self.states))

        # ``args.subsample`` is None unless --subsample was given. Only then may it
        # override the value already held (False, or the one load_config_from_west
        # read from the configuration).
        if args.subsample is not None:
            self.subsample = args.subsample

    def parse_cmdline_states(self, state_strings):
        '''Build ``self.states`` from ``[LABEL:]COORDS`` strings given on the command line.'''
        states = []
        for istring, state_string in enumerate(state_strings):
            try:
                (label, coord_str) = state_string.split(':')
            except ValueError:
                # No (single) label separator: the whole string is the coordinate
                label = 'state{}'.format(istring)
                coord_str = state_string
            coord = parse_pcoord_value(coord_str)
            states.append({'label': label, 'coords': coord})
        self.states = states

    def load_config_from_west(self, scheme):
        '''Load states, bins, subsampling and the output path for analysis scheme ``scheme``.

        Everything is read from ``westpa.rc.config['west']['analysis']``. The
        output file becomes ``<cwd>/<directory>/<scheme>/assign.h5``.'''
        try:
            config = westpa.rc.config['west']['analysis']
        except Exception as err:
            raise ValueError('There is no configuration file specified.') from err
        ystates = config['analysis_schemes'][scheme]['states']
        self.states_from_dict(ystates)
        # 'subsample' is optional in the configuration; keep the current value if absent.
        try:
            self.subsample = config['subsample']
        except Exception:
            pass
        from westpa.core._rc import bins_from_yaml_dict

        self.binning.mapper = bins_from_yaml_dict(config['analysis_schemes'][scheme]['bins'][0])
        path = os.path.join(os.getcwd(), config['directory'], scheme)
        # Create the analysis directory and the scheme directory in one step. Two
        # successive os.mkdir calls skipped the scheme directory whenever the
        # analysis directory already existed.
        try:
            os.makedirs(path, exist_ok=True)
        except OSError as err:
            log.warning('could not create output directory {!r}: {}'.format(path, err))

        self.output_filename = os.path.join(path, 'assign.h5')

    def load_state_file(self, state_filename):
        '''Load macrostate definitions from the YAML file ``state_filename``.'''
        import yaml

        # safe_load: yaml.load needs an explicit Loader on current PyYAML and can build
        # arbitrary objects. The ``with`` block closes the file handle.
        with open(state_filename, 'rt') as state_file:
            ydict = yaml.safe_load(state_file)
        ystates = ydict['states']
        self.states_from_dict(ystates)

    def states_from_dict(self, ystates):
        '''Build ``self.states`` from a list of ``{'label': ..., 'coords': ...}`` mappings.

        Raises ``ValueError`` for a flat coordinate list or more than two dimensions.'''
        states = []
        for istate, ystate in enumerate(ystates):
            state = {}
            state['label'] = ystate.get('label', 'state{}'.format(istate))
            # coords can be:
            #  - a scalar, in which case it is one bin, 1-D
            #  - a single list, which is rejected as ambiguous
            #  - a list of lists, which is a list of coordinate tuples
            coords = np.array(ystate['coords'])
            if coords.ndim == 0:
                coords.shape = (1, 1)
            elif coords.ndim == 1:
                raise ValueError(
                    'list {!r} is ambiguous (list of 1-d coordinates, or single multi-d coordinate?)'.format(ystate['coords'])
                )
            elif coords.ndim > 2:
                raise ValueError('coordinates must be 2-D')
            state['coords'] = coords
            states.append(state)
        self.states = states

    def load_states_from_function(self, statefunc):
        '''Build ``self.states`` from the dictionaries returned by ``statefunc(mapper)``.

        Raises ``ValueError`` if a returned state has no ``'coords'`` entry.'''
        states = statefunc(self.binning.mapper)
        for istate, state in enumerate(states):
            state.setdefault('label', 'state{}'.format(istate))
            try:
                state['coords'] = np.array(state['coords'])
            except KeyError:
                raise ValueError('state function {!r} returned a state {!r} without coordinates'.format(statefunc, state))
        self.states = states
        log.debug('loaded states: {!r}'.format(self.states))

    def assign_iteration(self, n_iter, nstates, nbins, state_map, last_labels):
        '''Method to encapsulate the segment slicing (into n_worker slices) and parallel job submission
        Submits job(s), waits on completion, splices them back together
        Returns: assignments, trajlabels, pops, statelabels for this iteration'''

        iter_group = self.data_reader.get_iter_group(n_iter)
        nsegs, npts = iter_group['pcoord'].shape[:2]
        n_workers = self.work_manager.n_workers or 1
        assignments = np.empty((nsegs, npts), dtype=index_dtype)
        trajlabels = np.empty((nsegs, npts), dtype=index_dtype)
        statelabels = np.empty((nsegs, npts), dtype=index_dtype)
        pops = np.zeros((nstates + 1, nbins + 1), dtype=weight_dtype)

        # Ceiling division of the segments over the workers. Never 0, because
        # range() below rejects a zero step (an iteration without segments).
        blocksize = max(1, -(-nsegs // n_workers))

        def task_gen():
            if __debug__:
                checkset = set()
            for lb in range(0, nsegs, blocksize):
                ub = min(nsegs, lb + blocksize)
                if __debug__:
                    checkset.update(set(range(lb, ub)))
                args = ()
                kwargs = dict(
                    n_iter=n_iter,
                    lb=lb,
                    ub=ub,
                    mapper=self.binning.mapper,
                    nstates=nstates,
                    state_map=state_map,
                    last_labels=last_labels,
                    parent_id_dsspec=self.data_reader.parent_id_dsspec,
                    weight_dsspec=self.data_reader.weight_dsspec,
                    pcoord_dsspec=self.dssynth.dsspec,
                    subsample=self.subsample,
                )
                yield (_assign_label_pop, args, kwargs)

            if __debug__:
                assert checkset == set(range(nsegs)), 'segments missing: {}'.format(set(range(nsegs)) - checkset)

        for future in self.work_manager.submit_as_completed(task_gen(), queue_size=self.max_queue_len):
            assign_slice, traj_slice, slice_pops, lb, ub, state_slice = future.get_result(discard=True)
            assignments[lb:ub, :] = assign_slice
            trajlabels[lb:ub, :] = traj_slice
            statelabels[lb:ub, :] = state_slice
            pops += slice_pops
            del assign_slice, traj_slice, slice_pops, state_slice

        return (assignments, trajlabels, pops, statelabels)

    def go(self):
        '''Assign every iteration from 1 up to (excluding) the current one and write the output file.

        ``trajlabels``, ``statelabels``, ``state_map`` and ``state_labels`` are
        written only when macrostates are defined.'''
        assert self.data_reader.parent_id_dsspec._h5file is None
        assert self.data_reader.weight_dsspec._h5file is None
        if hasattr(self.dssynth.dsspec, '_h5file'):
            assert self.dssynth.dsspec._h5file is None
        pi = self.progress.indicator
        pi.operation = 'Initializing'
        with pi, self.data_reader, WESTPAH5File(self.output_filename, 'w', creating_program=True) as self.output_file:
            assign = self.binning.mapper.assign

            # We always assign the entire simulation, so that no trajectory appears to start
            # in a transition region that doesn't get initialized in one.
            iter_start = 1
            iter_stop = self.data_reader.current_iteration

            h5io.stamp_iter_range(self.output_file, iter_start, iter_stop)

            nbins = self.binning.mapper.nbins
            self.output_file.attrs['nbins'] = nbins

            state_map = np.empty((self.binning.mapper.nbins + 1,), index_dtype)
            # Without macrostates nstates is 0, so 0 is the "unknown state" id here too.
            state_map[:] = 0  # state_id == nstates => unknown state

            # Recursive mappers produce a generator rather than a list of labels
            # so consume the entire generator into a list.
            # np.bytes_ is the same type as the np.string_ alias that NumPy 2.0 removed.
            labels = [np.bytes_(label) for label in self.binning.mapper.labels]

            self.output_file.create_dataset('bin_labels', data=labels, compression=9)

            if self.states:
                nstates = len(self.states)
                state_map[:] = nstates  # state_id == nstates => unknown state
                state_labels = [np.bytes_(state['label']) for state in self.states]

                for istate, sdict in enumerate(self.states):
                    assert state_labels[istate] == np.bytes_(sdict['label'])  # sanity check
                    state_assignments = assign(sdict['coords'])
                    for assignment in state_assignments:
                        state_map[assignment] = istate
                self.output_file.create_dataset('state_map', data=state_map, compression=9, shuffle=True)
                self.output_file['state_labels'] = state_labels  # + ['(unknown)']
            else:
                nstates = 0
            self.output_file.attrs['nstates'] = nstates
            # Stamp if this has been subsampled.
            self.output_file.attrs['subsampled'] = self.subsample

            iter_count = iter_stop - iter_start
            nsegs = np.empty((iter_count,), seg_id_dtype)
            npts = np.empty((iter_count,), seg_id_dtype)

            # scan for largest number of segments and largest number of points
            pi.new_operation('Scanning for segment and point counts', iter_stop - iter_start)
            for iiter, n_iter in enumerate(range(iter_start, iter_stop)):
                iter_group = self.data_reader.get_iter_group(n_iter)
                nsegs[iiter], npts[iiter] = iter_group['pcoord'].shape[0:2]
                pi.progress += 1
                del iter_group

            pi.new_operation('Preparing output')

            # create datasets
            self.output_file.create_dataset('nsegs', data=nsegs, shuffle=True, compression=9)
            self.output_file.create_dataset('npts', data=npts, shuffle=True, compression=9)

            max_nsegs = nsegs.max()
            max_npts = npts.max()

            assignments_shape = (iter_count, max_nsegs, max_npts)
            assignments_dtype = np.min_scalar_type(nbins)
            assignments_ds = self.output_file.create_dataset(
                'assignments',
                dtype=assignments_dtype,
                shape=assignments_shape,
                compression=4,
                shuffle=True,
                chunks=h5io.calc_chunksize(assignments_shape, assignments_dtype),
                fillvalue=nbins,
            )
            if self.states:
                trajlabel_dtype = np.min_scalar_type(nstates)
                trajlabels_ds = self.output_file.create_dataset(
                    'trajlabels',
                    dtype=trajlabel_dtype,
                    shape=assignments_shape,
                    compression=4,
                    shuffle=True,
                    chunks=h5io.calc_chunksize(assignments_shape, trajlabel_dtype),
                    fillvalue=nstates,
                )
                statelabels_ds = self.output_file.create_dataset(
                    'statelabels',
                    dtype=trajlabel_dtype,
                    shape=assignments_shape,
                    compression=4,
                    shuffle=True,
                    chunks=h5io.calc_chunksize(assignments_shape, trajlabel_dtype),
                    fillvalue=nstates,
                )

            pops_shape = (iter_count, nstates + 1, nbins + 1)
            pops_ds = self.output_file.create_dataset(
                'labeled_populations',
                dtype=weight_dtype,
                shape=pops_shape,
                compression=4,
                shuffle=True,
                chunks=h5io.calc_chunksize(pops_shape, weight_dtype),
            )
            h5io.label_axes(pops_ds, [np.bytes_(i) for i in ['iteration', 'state', 'bin']])

            pi.new_operation('Assigning to bins', iter_stop - iter_start)
            last_labels = None  # mapping of seg_id to last macrostate inhabited
            for iiter, n_iter in enumerate(range(iter_start, iter_stop)):
                # get iteration info in this block

                if iiter == 0:
                    last_labels = np.empty((nsegs[iiter],), index_dtype)
                    last_labels[:] = nstates  # unknown state

                # Slices this iteration into n_workers groups of segments, submits them to wm, splices results back together
                assignments, trajlabels, pops, statelabels = self.assign_iteration(n_iter, nstates, nbins, state_map, last_labels)

                # Do stuff with this iteration's results

                last_labels = trajlabels[:, -1].copy()
                assignments_ds[iiter, 0 : nsegs[iiter], 0 : npts[iiter]] = assignments
                pops_ds[iiter] = pops
                if self.states:
                    trajlabels_ds[iiter, 0 : nsegs[iiter], 0 : npts[iiter]] = trajlabels
                    statelabels_ds[iiter, 0 : nsegs[iiter], 0 : npts[iiter]] = statelabels

                pi.progress += 1
                del assignments, trajlabels, pops, statelabels

            # 'statelabels' exists only when macrostates were defined; looking it up
            # unconditionally made state-less runs fail at the very end.
            stamped = ['assignments', 'npts', 'nsegs', 'labeled_populations']
            if self.states:
                stamped.append('statelabels')
            for dsname in stamped:
                h5io.stamp_iter_range(self.output_file[dsname], iter_start, iter_stop)
class WCrawl(WESTParallelTool):
    '''Command-line tool that runs a user-supplied callable on every iteration
    in the selected range and hands each result to a crawler object.'''

    prog = 'w_crawl'
    description = '''\
Crawl a weighted ensemble dataset, executing a function for each iteration.
This can be used for postprocessing of trajectories, cleanup of datasets,
or anything else that can be expressed as "do X for iteration N, then do
something with the result". Tasks are parallelized by iteration, and
no guarantees are made about evaluation order.


-----------------------------------------------------------------------------
Command-line options
-----------------------------------------------------------------------------

'''

    def __init__(self):
        super().__init__()

        # Components shared by argument handling and the main loop.
        self.progress = ProgressIndicatorComponent()
        self.data_reader = WESTDataReader()
        self.iter_range = IterRangeSelection(self.data_reader)

        # Both are filled in by process_args().
        self.crawler = None
        self.task_callable = None

    def add_args(self, parser):
        '''Register data-reader, iteration-range, task and progress options on ``parser``.'''
        self.data_reader.add_args(parser)
        self.iter_range.add_args(parser)

        task_options = parser.add_argument_group('task options')
        task_options.add_argument(
            '-c',
            '--crawler-instance',
            help='''Use CRAWLER_INSTANCE (specified as module.instance) as an instance of
                    WESTPACrawler to coordinate the calculation. Required only if initialization,
                    finalization, or task result processing is required.''',
        )
        task_options.add_argument(
            'task_callable',
            help='''Run TASK_CALLABLE (specified as module.function) on each iteration.
                    Required.''',
        )
        self.progress.add_args(parser)

    def process_args(self, args):
        '''Resolve the parsed arguments into the task callable and the crawler.'''
        self.progress.process_args(args)
        self.data_reader.process_args(args)
        with self.data_reader:
            self.iter_range.process_args(args)

        self.task_callable = get_object(args.task_callable, path=['.'])
        if args.crawler_instance is None:
            # No user crawler given: fall back to a plain WESTPACrawler.
            self.crawler = WESTPACrawler()
        else:
            self.crawler = get_object(args.crawler_instance, path=['.'])

    def go(self):
        '''Dispatch one task per iteration and feed each result to the crawler.

        ``finalize()`` is called on the crawler even when dispatching or
        result processing raises.'''
        first_iter = self.iter_range.iter_start
        stop_iter = self.iter_range.iter_stop
        n_iters = stop_iter - first_iter
        self.data_reader.open('r')
        indicator = self.progress.indicator

        def iteration_tasks():
            # One (callable, args, kwargs) triple per iteration.
            for n_iter in range(first_iter, stop_iter):
                yield (_remote_task, (n_iter, self.task_callable), {})

        with indicator:
            indicator.operation = 'Initializing'
            self.crawler.initialize(first_iter, stop_iter)
            try:
                indicator.new_operation('Dispatching tasks & processing results', n_iters)
                completed = self.work_manager.submit_as_completed(iteration_tasks(), self.max_queue_len)
                for future in completed:
                    n_iter, result = future.get_result(discard=True)
                    if self.crawler is not None:
                        self.crawler.process_iter_result(n_iter, result)
                    indicator.progress += 1
            finally:
                indicator.new_operation('Finalizing')
                self.crawler.finalize()
class WSelectTool(WESTParallelTool):
    '''Command-line tool that stores the segments matching a user-supplied
    predicate (optionally with their ancestors) and their weights.'''

    prog = 'w_select'
    description = '''\
Select dynamics segments matching various criteria. This requires a
user-provided predicate function. By default, only matching segments are
stored. If the -a/--include-ancestors option is given, then matching segments
and their ancestors will be stored.


-----------------------------------------------------------------------------
Predicate function
-----------------------------------------------------------------------------

Segments are selected based on a predicate function, which must be callable
as ``predicate(n_iter, iter_group)`` and return a collection of segment IDs
matching the predicate in that iteration.

The predicate may be inverted by specifying the -v/--invert command-line
argument.


-----------------------------------------------------------------------------
Output format
-----------------------------------------------------------------------------

The output file (-o/--output, by default "select.h5") contains the following
datasets:

  ``/n_iter`` [iteration]
    *(Integer)* Iteration numbers for each entry in other datasets.

  ``/n_segs`` [iteration]
    *(Integer)* Number of segment IDs matching the predicate (or inverted
    predicate, if -v/--invert is specified) in the given iteration.

  ``/seg_ids`` [iteration][segment]
    *(Integer)* Matching segments in each iteration. For a stored iteration
    ``i``, only the first ``n_segs[i]`` entries are valid. For example,
    the full list of matching seg_ids in the first stored iteration is
    ``seg_ids[0][:n_segs[0]]``.

  ``/weights`` [iteration][segment]
    *(Floating-point)* Weights for each matching segment in ``/seg_ids``.


-----------------------------------------------------------------------------
Command-line arguments
-----------------------------------------------------------------------------
'''

    def __init__(self):
        super().__init__()

        self.data_reader = WESTDataReader()
        self.iter_range = IterRangeSelection()
        self.progress = ProgressIndicatorComponent()
        self.output_file = None
        self.output_filename = None
        self.predicate = None
        self.invert = False
        self.include_ancestors = False

    def add_args(self, parser):
        '''Register data-reader, iteration-range, selection, output and progress options.'''
        self.data_reader.add_args(parser)
        self.iter_range.add_args(parser)

        sgroup = parser.add_argument_group('selection options')
        sgroup.add_argument(
            '-p',
            '--predicate-function',
            metavar='MODULE.FUNCTION',
            help='''Use the given predicate function to match segments. This function
                    should take an iteration number and the HDF5 group corresponding to that
                    iteration and return a sequence of seg_ids matching the predicate, as in
                    ``match_predicate(n_iter, iter_group)``.''',
        )
        sgroup.add_argument('-v', '--invert', dest='invert', action='store_true', help='''Invert the match predicate.''')
        sgroup.add_argument(
            '-a', '--include-ancestors', action='store_true', help='''Include ancestors of matched segments in output.'''
        )

        ogroup = parser.add_argument_group('output options')
        ogroup.add_argument('-o', '--output', default='select.h5', help='''Write output to OUTPUT (default: %(default)s).''')

        self.progress.add_args(parser)

    def process_args(self, args):
        '''Resolve parsed arguments.

        Raises TypeError if the object named by --predicate-function is not
        callable.'''
        self.progress.process_args(args)
        self.data_reader.process_args(args)
        with self.data_reader:
            self.iter_range.process_args(args)

        predicate = get_object(args.predicate_function, path=['.'])
        if not callable(predicate):
            raise TypeError('predicate object {!r} is not callable'.format(predicate))
        self.predicate = predicate
        self.invert = bool(args.invert)
        self.include_ancestors = bool(args.include_ancestors)
        self.output_filename = args.output

    def go(self):
        '''Find matching segments per iteration and write them to the output file.

        For every iteration in the selected range the predicate is evaluated
        through the work manager; matching seg_ids and their weights are
        stored row by row.  With ``include_ancestors`` set, a second backward
        pass adds the parents of every stored segment to the previous
        iteration's row.'''
        self.data_reader.open('r')
        pi = self.progress.indicator

        iter_start, iter_stop = self.iter_range.iter_start, self.iter_range.iter_stop
        iter_count = iter_stop - iter_start

        # The context manager guarantees the output file is closed (and so
        # flushed) even if a task raises part-way through.
        with h5io.WESTPAH5File(self.output_filename, mode='w') as output_file:
            output_file.create_dataset('n_iter', dtype=n_iter_dtype, data=list(range(iter_start, iter_stop)))
            current_seg_count = 0
            seg_count_ds = output_file.create_dataset('n_segs', dtype=np.uint, shape=(iter_count,))
            matching_segs_ds = output_file.create_dataset(
                'seg_ids',
                shape=(iter_count, 0),
                maxshape=(iter_count, None),
                dtype=seg_id_dtype,
                chunks=h5io.calc_chunksize((iter_count, 1000000), seg_id_dtype),
                shuffle=True,
                compression=9,
            )
            weights_ds = output_file.create_dataset(
                'weights',
                shape=(iter_count, 0),
                maxshape=(iter_count, None),
                dtype=weight_dtype,
                chunks=h5io.calc_chunksize((iter_count, 1000000), weight_dtype),
                shuffle=True,
                compression=9,
            )

            with pi:
                pi.new_operation('Finding matching segments', extent=iter_count)
                tasks = (
                    (_find_matching_segments, (self.data_reader.we_h5filename, n_iter, self.predicate, self.invert), {})
                    for n_iter in range(iter_start, iter_stop)
                )
                for future in self.work_manager.submit_as_completed(tasks, self.max_queue_len):
                    n_iter, matching_ids = future.get_result()
                    # Sort once and use the same order for ids and weights, so
                    # seg_ids[i, k] and weights[i, k] always refer to the same
                    # segment regardless of the order the ids arrived in.
                    matching_ids = sorted(matching_ids)
                    n_matches = len(matching_ids)

                    if n_matches:
                        iiter = n_iter - iter_start
                        if n_matches > current_seg_count:
                            # Grow the ragged second axis to fit the widest row so far.
                            matching_segs_ds.resize((iter_count, n_matches))
                            weights_ds.resize((iter_count, n_matches))
                            current_seg_count = n_matches

                        seg_count_ds[iiter] = n_matches
                        matching_segs_ds[iiter, :n_matches] = matching_ids
                        seg_index = self.data_reader.get_iter_group(n_iter)['seg_index']
                        weights_ds[iiter, :n_matches] = seg_index['weight'][matching_ids]
                    del matching_ids
                    pi.progress += 1

                if self.include_ancestors:
                    pi.new_operation('Tracing ancestors of matching segments', extent=iter_count)
                    # Parents collected from the iteration processed just before
                    # (i.e. the chronologically next one).
                    from_previous = set()
                    current_seg_count = matching_segs_ds.shape[1]
                    for n_iter in range(iter_stop - 1, iter_start - 1, -1):
                        iiter = n_iter - iter_start
                        n_matches = seg_count_ds[iiter]
                        matching_ids = set(from_previous)
                        if n_matches:
                            matching_ids.update(matching_segs_ds[iiter, :n_matches])
                        from_previous.clear()

                        n_matches = len(matching_ids)
                        if n_matches > current_seg_count:
                            matching_segs_ds.resize((iter_count, n_matches))
                            weights_ds.resize((iter_count, n_matches))
                            current_seg_count = n_matches

                        if n_matches > 0:
                            seg_count_ds[iiter] = n_matches
                            matching_ids = sorted(matching_ids)
                            matching_segs_ds[iiter, :n_matches] = matching_ids
                            seg_index = self.data_reader.get_iter_group(n_iter)['seg_index']
                            weights_ds[iiter, :n_matches] = seg_index['weight'][matching_ids]
                            parent_ids = seg_index['parent_id'][matching_ids]
                            from_previous.update(parent_id for parent_id in parent_ids if parent_id >= 0)  # filter initial states
                            del parent_ids
                        del matching_ids
                        pi.progress += 1
class WIPI(WESTParallelTool): ''' Welcome to w_ipa (WESTPA Interactive Python Analysis)! From here, you can run traces, look at weights, progress coordinates, etc. This is considered a 'stateful' tool; that is, the data you are pulling is always pulled from the current analysis scheme and iteration. By default, the first analysis scheme in west.cfg is used, and you are set at iteration 1. ALL PROPERTIES ARE ACCESSED VIA w or west To see the current iteration, try: w.iteration OR west.iteration to set it, simply plug in a new value. w.iteration = 100 To change/list the current analysis schemes: w.list_schemes w.scheme = OUTPUT FROM w.list_schemes To see the states and bins defined in the current analysis scheme: w.states w.bin_labels All information about the current iteration is available in an object called 'current': w.current walkers, summary, states, seg_id, weights, parents, kinavg, pcoord, bins, populations, and auxdata, if it exists. In addition, the function w.trace(seg_id) will run a trace over a seg_id in the current iteration and return a dictionary containing all pertinent information about that seg_id's history. It's best to store this, as the trace can be expensive. Run help on any function or property for more information! Happy analyzing! ''' def __init__(self): super().__init__() self.data_reader = WESTDataReader() self.wm_env.default_work_manager = self.wm_env.default_parallel_work_manager self.progress = ProgressIndicatorComponent() self._iter = 1 self.config_required = True self.version = "1.0B" # Set to matplotlib if you want that. But why would you? # Well, whatever, we'll just set it to that for now. 
self.interface = 'matplotlib' self._scheme = None global iteration def add_args(self, parser): self.progress.add_args(parser) self.data_reader.add_args(parser) rgroup = parser.add_argument_group('runtime options') rgroup.add_argument( '--analysis-only', '-ao', dest='analysis_mode', action='store_true', help= '''Use this flag to run the analysis and return to the terminal.''', ) rgroup.add_argument( '--reanalyze', '-ra', dest='reanalyze', action='store_true', help= '''Use this flag to delete the existing files and reanalyze.''', ) rgroup.add_argument('--ignore-hash', '-ih', dest='ignore_hash', action='store_true', help='''Ignore hash and don't regenerate files.''') rgroup.add_argument( '--debug', '-d', dest='debug_mode', action='store_true', help='''Debug output largely intended for development.''') rgroup.add_argument('--terminal', '-t', dest='plotting', action='store_true', help='''Plot output in terminal.''') # There is almost certainly a better way to handle this, but we'll sort that later. 
import argparse rgroup.add_argument('--f', '-f', dest='extra', default='blah', help=argparse.SUPPRESS) parser.set_defaults(compression=True) def process_args(self, args): self.progress.process_args(args) self.data_reader.process_args(args) with self.data_reader: self.niters = self.data_reader.current_iteration - 1 self.__config = westpa.rc.config self.__settings = self.__config['west']['analysis'] for ischeme, scheme in enumerate(self.__settings['analysis_schemes']): if (self.__settings['analysis_schemes'][scheme]['enabled'] is True or self.__settings['analysis_schemes'][scheme]['enabled'] is None): self.scheme = scheme self.data_args = args self.analysis_mode = args.analysis_mode self.reanalyze = args.reanalyze self.ignore_hash = args.ignore_hash self.debug_mode = args.debug_mode if args.plotting: self.interface = 'text' def hash_args(self, args, extra=None, path=None): '''Create unique hash stamp to determine if arguments/file is different from before.''' '''Combine with iteration to know whether or not file needs updating.''' # Why are we not loading this functionality into the individual tools? # While it may certainly be useful to store arguments (and we may well do that), # it's rather complex and nasty to deal with pickling and hashing arguments through # the various namespaces. # In addition, it's unlikely that the functionality is desired at the individual tool level, # since we'll always just rewrite a file when we call the function. # return hashlib.md5(pickle.dumps([args, extra])).hexdigest() # We don't care about the path, so we'll remove it. # Probably a better way to do this, but who cares. 
cargs = list(args) for iarg, arg in enumerate(cargs): if path in arg: cargs[iarg] = arg.replace(path, '').replace('/', '') if arg == '--disable-averages': cargs.remove('--disable-averages') to_hash = cargs + [extra] # print(args) # print(to_hash) # print(str(to_hash).encode('base64')) if self.debug_mode: for iarg, arg in enumerate(to_hash): if not isinstance(arg, list): print('arg {num:02d} -- {arg:<20}'.format(num=iarg, arg=arg)) else: for il, l in enumerate(arg): print('arg {num:02d} -- {arg:<20}'.format(num=il + iarg, arg=l)) # print('args: {}'.format(to_hash)) # This SHOULD produce the same output, maybe? That would be nice, anyway. # But we'll need to test it more. return hashlib.md5(base64.b64encode(str(to_hash).encode())).hexdigest() def stamp_hash(self, h5file_name, new_hash): '''Loads a file, stamps it, and returns the opened file in read only''' h5file = h5io.WESTPAH5File(h5file_name, 'r+') h5file.attrs['arg_hash'] = new_hash h5file.close() h5file = h5io.WESTPAH5File(h5file_name, 'r') return h5file def analysis_structure(self): ''' Run automatically on startup. Parses through the configuration file, and loads up all the data files from the different analysis schematics. If they don't exist, it creates them automatically by hooking in to existing analysis routines and going from there. It does this by calling in the make_parser_and_process function for w_{assign,reweight,direct} using a custom built list of args. The user can specify everything in the configuration file that would have been specified on the command line. For instance, were one to call w_direct as follows: w_direct --evolution cumulative --step-iter 1 --disable-correl the west.cfg would look as follows: west: analysis: w_direct: evolution: cumulative step_iter: 1 extra: ['disable-correl'] Alternatively, if one wishes to use the same options for both w_direct and w_reweight, the key 'w_direct' can be replaced with 'kinetics'. ''' # Make sure everything exists. 
try: os.mkdir(self.__settings['directory']) except Exception: pass # Now, check to see whether they exist, and then load them. self.__analysis_schemes__ = {} # We really need to implement some sort of default behavior if an analysis scheme isn't set. # Right now, we just crash. That isn't really graceful. for scheme in self.__settings['analysis_schemes']: if self.__settings['analysis_schemes'][scheme]['enabled']: if self.work_manager.running is False: self.work_manager.startup() path = os.path.join(os.getcwd(), self.__settings['directory'], scheme) # if 'postanalysis' in self.__settings['analysis_schemes'][scheme] and 'postanalysis' in self.__settings['postanalysis']: # Should clean this up. But it uses the default global setting if a by-scheme one isn't set. if 'postanalysis' in self.__settings: if 'postanalysis' in self.__settings['analysis_schemes'][ scheme]: pass else: self.__settings['analysis_schemes'][scheme][ 'postanalysis'] = self.__settings['postanalysis'] try: os.mkdir(path) except Exception: pass self.__analysis_schemes__[scheme] = {} try: if (self.__settings['analysis_schemes'][scheme] ['postanalysis'] is True or self.__settings['postanalysis'] is True): analysis_files = ['assign', 'direct', 'reweight'] else: analysis_files = ['assign', 'direct'] except Exception: analysis_files = ['assign', 'direct'] self.__settings['analysis_schemes'][scheme][ 'postanalysis'] = False reanalyze_kinetics = False assign_hash = None for name in analysis_files: arg_hash = None if self.reanalyze is True: reanalyze_kinetics = True try: os.remove(os.path.join(path, '{}.h5'.format(name))) except Exception: pass else: try: # Try to load the hash. If we fail to load the hash or the file, we need to reload. 
# if self.reanalyze == True: # raise ValueError('Reanalyze set to true.') self.__analysis_schemes__[scheme][ name] = h5io.WESTPAH5File( os.path.join(path, '{}.h5'.format(name)), 'r') arg_hash = self.__analysis_schemes__[scheme][ name].attrs['arg_hash'] if name == 'assign': assign_hash = arg_hash except Exception: pass # We shouldn't rely on this. # self.reanalyze = True if True: if name == 'assign': assign = w_assign.WAssign() w_assign_config = { 'output': os.path.join(path, '{}.h5'.format(name)) } try: w_assign_config.update( self.__settings['w_assign']) except Exception: pass try: w_assign_config.update( self.__settings['analysis_schemes'][scheme] ['w_assign']) except Exception: pass args = [] for key, value in w_assign_config.items(): if key != 'extra': args.append( str('--') + str(key).replace('_', '-')) args.append(str(value)) # This is for stuff like disabling correlation analysis, etc. if 'extra' in list(w_assign_config.keys()): # We're sorting to ensure that the order doesn't matter. for value in sorted(w_assign_config['extra']): args.append( str('--') + str(value).replace('_', '-')) # We're just calling the built in function. # This is a lot cleaner than what we had in before, and far more workable. args.append('--config-from-file') args.append('--scheme-name') args.append('{}'.format(scheme)) # Why are we calling this if we're not sure we're remaking the file? # We need to load up the bin mapper and states and see if they're the same. assign.make_parser_and_process(args=args) import pickle # new_hash = self.hash_args(args=args, path=path, extra=[self.niters, pickle.dumps(assign.binning.mapper), assign.states]) # We need to encode it properly to ensure that some OS specific thing doesn't kill us. Same goes for the args, ultimately. # Mostly, we just need to ensure that we're consistent. 
new_hash = self.hash_args( args=args, path=path, extra=[ int(self.niters), codecs.encode( pickle.dumps(assign.binning.mapper), "base64"), base64.b64encode( str(assign.states).encode()), ], ) # Let's check the hash. If the hash is the same, we don't need to reload. if self.debug_mode is True: print('{:<10}: old hash, new hash -- {}, {}'. format(name, arg_hash, new_hash)) if self.ignore_hash is False and ( arg_hash != new_hash or self.reanalyze is True): # If the hashes are different, or we need to reanalyze, delete the file. try: os.remove( os.path.join(path, '{}.h5'.format(name))) except Exception: pass print('Reanalyzing file {}.h5 for scheme {}.'. format(name, scheme)) # reanalyze_kinetics = True # We want to use the work manager we have here. Otherwise, just let the tool sort out what it needs, honestly. assign.work_manager = self.work_manager assign.go() assign.data_reader.close() # Stamp w/ hash, then reload as read only. self.__analysis_schemes__[scheme][ name] = self.stamp_hash( os.path.join(path, '{}.h5'.format(name)), new_hash) del assign # Update the assignment hash. assign_hash = new_hash # Since these are all contained within one tool, now, we want it to just... load everything. if name == 'direct' or name == 'reweight': if name == 'direct': analysis = w_direct.WDirect() if name == 'reweight': analysis = w_reweight.WReweight() analysis_config = { 'assignments': os.path.join(path, '{}.h5'.format('assign')), 'output': os.path.join(path, '{}.h5'.format(name)), 'kinetics': os.path.join(path, '{}.h5'.format(name)), } # Pull from general analysis options, then general SPECIFIC options for each analysis, # then general options for that analysis scheme, then specific options for the analysis type in the scheme. 
try: analysis_config.update( self.__settings['kinetics']) except Exception: pass try: analysis_config.update( self.__settings['w_{}'.format(name)]) except Exception: pass try: analysis_config.update( self.__settings['analysis_schemes'][scheme] ['kinetics']) except Exception: pass try: analysis_config.update( self.__settings['analysis_schemes'][scheme] ['w_{}'.format(name)]) except Exception: pass # We're pulling in a default set of arguments, then updating them with arguments from the west.cfg file, if appropriate, after setting the appropriate command # Then, we call the magic function 'make_parser_and_process' with the arguments we've pulled in. # The tool has no real idea it's being called outside of its actual function, and we're good to go. args = ['all'] for key, value in analysis_config.items(): if key != 'extra': args.append( str('--') + str(key).replace('_', '-')) args.append(str(value)) # This is for stuff like disabling correlation analysis, etc. if 'extra' in list(analysis_config.keys()): for value in sorted(analysis_config['extra']): args.append( str('--') + str(value).replace('_', '-')) # We want to not display the averages, so... args.append('--disable-averages') new_hash = self.hash_args( args=args, path=path, extra=[int(self.niters), assign_hash]) # if arg_hash != new_hash or self.reanalyze == True or reanalyze_kinetics == True: if self.debug_mode is True: print('{:<10}: old hash, new hash -- {}, {}'. format(name, arg_hash, new_hash)) if self.ignore_hash is False and ( arg_hash != new_hash or reanalyze_kinetics is True): try: os.remove( os.path.join(path, '{}.h5'.format(name))) except Exception: pass print('Reanalyzing file {}.h5 for scheme {}.'. format(name, scheme)) analysis.make_parser_and_process(args=args) # We want to hook into the existing work manager. analysis.work_manager = self.work_manager analysis.go() # Open! 
self.__analysis_schemes__[scheme][ name] = self.stamp_hash( os.path.join(path, '{}.h5'.format(name)), new_hash) del analysis # Make sure this doesn't get too far out, here. We need to keep it alive as long as we're actually analyzing things. # self.work_manager.shutdown() print("") print("Complete!") @property def assign(self): return self.__analysis_schemes__[str(self.scheme)]['assign'] @property def direct(self): """ The output from w_kinavg.py from the current scheme. """ return self.__analysis_schemes__[str(self.scheme)]['direct'] @property def state_labels(self): print("State labels and definitions!") for istate, state in enumerate(self.assign['state_labels']): print('{}: {}'.format(istate, state)) print('{}: {}'.format(istate + 1, 'Unknown')) @property def bin_labels(self): print("Bin definitions! ") for istate, state in enumerate(self.assign['bin_labels']): print('{}: {}'.format(istate, state)) @property def west(self): return self.data_reader.data_manager.we_h5file @property def reweight(self): if self.__settings['analysis_schemes'][str( self.scheme)]['postanalysis'] is True: return self.__analysis_schemes__[str(self.scheme)]['reweight'] else: value = "This sort of analysis has not been enabled." current = { 'bin_prob_evolution': value, 'color_prob_evolution': value, 'conditional_flux_evolution': value, 'rate_evolution': value, 'state_labels': value, 'state_prob_evolution': value, } current.update({'bin_populations': value, 'iterations': value}) return current @property def scheme(self): ''' Returns and sets what scheme is currently in use. To see what schemes are available, run: w.list_schemes ''' # Let's do this a few different ways. # We want to return things about the DIFFERENT schemes, if possible. if self._scheme is None: self._scheme = WIPIScheme(scheme=self.__analysis_schemes__, name=self._schemename, parent=self, settings=self.__settings) # This just ensures that when we call it, it's clean. 
self._scheme.name = None return self._scheme @scheme.setter def scheme(self, scheme): self._future = None self._current = None self._past = None if scheme in self.__settings['analysis_schemes']: pass else: for ischeme, schemename in enumerate( self.__settings['analysis_schemes']): if ischeme == scheme: scheme = schemename if (self.__settings['analysis_schemes'][scheme]['enabled'] is True or self.__settings['analysis_schemes'][scheme]['enabled'] is None): self._schemename = scheme else: print("Scheme cannot be changed to scheme: {}; it is not enabled!". format(scheme)) @property def list_schemes(self): ''' Lists what schemes are configured in west.cfg file. Schemes should be structured as follows, in west.cfg: west: system: analysis: directory: analysis analysis_schemes: scheme.1: enabled: True states: - label: unbound coords: [[7.0]] - label: bound coords: [[2.7]] bins: - type: RectilinearBinMapper boundaries: [[0.0, 2.80, 7, 10000]] ''' # print("The following schemes are available:") # print("") # for ischeme, scheme in enumerate(self.__settings['analysis_schemes']): # print('{}. Scheme: {}'.format(ischeme, scheme)) # print("") # print("Set via name, or via the index listed.") # print("") # print("Current scheme: {}".format(self.scheme)) self._scheme.list_schemes @property def iteration(self): ''' Returns/sets the current iteration. ''' # print("The current iteration is {}".format(self._iter)) return self._iter @iteration.setter def iteration(self, value): print("Setting iteration to iter {}.".format(value)) if value <= 0: print("Iteration must begin at 1.") value = 1 if value > self.niters: print("Cannot go beyond {} iterations!".format(self.niters)) print("Setting to {}".format(self.niters)) value = self.niters # We want to trigger a rebuild on our current/past/future bits. # The scheme should automatically reset to the proper iteration, but # future needs to be manually triggered. 
self._iter = value self._future = None return self._iter @property def current(self): ''' The current iteration. See help for __get_data_for_iteration__ ''' return self.scheme[self.scheme.scheme].current @property def past(self): ''' The previous iteration. See help for __get_data_for_iteration__ ''' return self.scheme[self.scheme.scheme].past def trace(self, seg_id): ''' Runs a trace on a seg_id within the current iteration, all the way back to the beginning, returning a dictionary containing all interesting information: seg_id, pcoord, states, bins, weights, iteration, auxdata (optional) sorted in chronological order. Call with a seg_id. ''' if seg_id >= self.current.walkers: print("Walker seg_id # {} is beyond the max count of {} walkers.". format(seg_id, self.current.walkers)) return 1 pi = self.progress.indicator with pi: pi.new_operation( 'Tracing scheme:iter:seg_id {}:{}:{}'.format( self.scheme, self.iteration, seg_id), self.iteration) current = { 'seg_id': [], 'pcoord': [], 'states': [], 'weights': [], 'iteration': [], 'bins': [] } keys = [] try: current['auxdata'] = {} for key in list(self.current['auxdata'].keys()): current['auxdata'][key] = [] key = [] except Exception: pass for iter in reversed(list(range(1, self.iteration + 1))): iter_group = self.data_reader.get_iter_group(iter) current['pcoord'].append(iter_group['pcoord'][seg_id, :, :]) current['states'].append(self.assign['trajlabels'][iter - 1, seg_id, :]) current['bins'].append(self.assign['assignments'][iter - 1, seg_id, :]) current['seg_id'].append(seg_id) current['weights'].append( iter_group['seg_index']['weight'][seg_id]) current['iteration'].append(iter) try: for key in keys: current['auxdata'][key].append( iter_group['auxdata'][key][seg_id]) except Exception: pass seg_id = iter_group['seg_index']['parent_id'][seg_id] if seg_id < 0: # Necessary for steady state simulations. This means they started in that iteration. 
break pi.progress += 1 current['seg_id'] = list(reversed(current['seg_id'])) current['iteration'] = list(reversed(current['iteration'])) current['states'] = np.concatenate( np.array(list(reversed(current['states'])))) current['bins'] = np.concatenate( np.array(list(reversed(current['bins'])))) current['weights'] = np.array(list(reversed(current['weights']))) current['pcoord'] = np.concatenate( np.array(list(reversed(current['pcoord'])))) try: for key in keys(): current['auxdata'][key] = np.concatenate( np.array(list(reversed(current['auxdata'][key])))) except Exception: pass current['state_labels'] = self.assign['state_labels'] for i in ['pcoord', 'states', 'bins', 'weights']: current[i] = WIPIDataset(raw=current[i], key=i) if i == 'weights': current[i].plotter = Plotter(np.log10(current[i].raw), str('log10 of ' + str(i)), iteration=current[i].raw.shape[0], interface=self.interface) else: current[i].plotter = Plotter(current[i].raw, i, iteration=current[i].raw.shape[0], interface=self.interface) current[i].plot = current[i].plotter.plot return WIPIDataset(raw=current, key=seg_id) @property def future(self, value=None): ''' Similar to current/past, but keyed differently and returns different datasets. See help for Future. ''' if self._future is None: self._future = self.Future(raw=self.__get_children__(), key=None) self._future.iteration = self.iteration + 1 return self._future class Future(WIPIDataset): # This isn't a real fancy one. def __getitem__(self, value): if isinstance(value, str): print(list(self.__dict__.keys())) try: return self.__dict__['raw'][value] except Exception: print('{} is not a valid data structure.'.format(value)) elif isinstance(value, int) or isinstance(value, np.int64): # Otherwise, we assume they're trying to index for a seg_id. 
# if value < self.parent.walkers: current = {} current['pcoord'] = self.__dict__['raw']['pcoord'][value] current['states'] = self.__dict__['raw']['states'][value] current['bins'] = self.__dict__['raw']['bins'][value] current['parents'] = self.__dict__['raw']['parents'][value] current['seg_id'] = self.__dict__['raw']['seg_id'][value] current['weights'] = self.__dict__['raw']['weights'][value] try: current['auxdata'] = {} for key in list(self.__dict__['raw']['auxdata'].keys()): current['auxdata'][key] = self.__dict__['raw'][ 'auxdata'][key][value] except Exception: pass current = WIPIDataset( current, 'Segment {} in Iter {}'.format(value, self.iteration)) return current def __get_children__(self): ''' Returns all information about the children of a given walker in the current iteration. Used to generate and create the future object, if necessary. ''' if self.iteration == self.niters: print( "Currently at iteration {}, which is the max. There are no children!" .format(self.iteration)) return 0 iter_data = __get_data_for_iteration__(value=self.iteration + 1, parent=self) future = { 'weights': [], 'pcoord': [], 'parents': [], 'summary': iter_data['summary'], 'seg_id': [], 'walkers': iter_data['walkers'], 'states': [], 'bins': [], } for seg_id in range(0, self.current.walkers): children = np.where(iter_data['parents'] == seg_id)[0] if len(children) == 0: error = "No children for seg_id {}.".format(seg_id) future['weights'].append(error) future['pcoord'].append(error) future['parents'].append(error) future['seg_id'].append(error) future['states'].append(error) future['bins'].append(error) else: # Now, we're gonna put them in the thing. 
value = self.iteration + 1 future['weights'].append(iter_data['weights'][children]) future['pcoord'].append( iter_data['pcoord'][...][children, :, :]) try: aux_data = iter_data['auxdata'][...][children, :, :] try: future['aux_data'].append(aux_data) except Exception: future['aux_data'] = aux_data except Exception: pass future['parents'].append(iter_data['parents'][children]) future['seg_id'].append(iter_data['seg_id'][children]) future['states'].append(self.assign['trajlabels'][value - 1, children, :]) future['bins'].append(self.assign['assignments'][value - 1, children, :]) return future def go(self): ''' Function automatically called by main() when launched via the command line interface. Generally, call main, not this function. ''' w = self print("") print("Welcome to w_ipa (WESTPA Interactive Python Analysis) v. {}!". format(w.version)) print( "Run w.introduction for a more thorough introduction, or w.help to see a list of options." ) print("Running analysis & loading files.") self.data_reader.open() self.analysis_structure() # Seems to be consistent with other tools, such as w_assign. For setting the iterations. self.data_reader.open() self.niters = self.data_reader.current_iteration - 1 self.iteration = self.niters try: print('Your current scheme, system and iteration are : {}, {}, {}'. format(w.scheme, os.getcwd(), w.iteration)) except Exception: pass @property def introduction(self): ''' Just spits out an introduction, in case someone doesn't call help. ''' help_string = ''' Call as a dictionary item or a .attribute: w.past, w.current, w.future: {current} Raw schemes can be accessed as follows: w.scheme.{scheme_keys} and contain mostly the same datasets associated with w. 
The following give raw access to the h5 files associated with the current scheme w.west w.assign w.direct w.reweight OTHER: {w} '''.format( current=self.__format_keys__(self.current.__dir__(), split=' ', offset=12), scheme_keys=self.__format_keys__(list(self._scheme.raw.keys())), w=self.__format_keys__(self.__dir__(), offset=8, max_length=0, split='', prepend='w.'), ) print(help_string) # Just a little function to be used with the introduction. def __format_keys__(self, keys, split='/', offset=0, max_length=80, prepend=''): rtn = '' run_length = 0 for key in keys: rtn += prepend + str(key) + split run_length += len(str(key)) if run_length >= max_length: run_length = offset rtn += '\n' + ' ' * offset if rtn[-1] == split: return rtn[:-1] else: return rtn @property def help(self): ''' Just a minor function to call help on itself. Only in here to really help someone get help.''' help(self) def _repr_pretty_(self, p, cycle): self.introduction return " " def __dir__(self): return_list = ['past', 'current', 'future'] # For the moment, don't expose direct, reweight, or assign, as these are scheme dependent files. # They do exist, and always link to the current scheme, however. return_list += [ 'iteration', 'niters', 'scheme', 'list_schemes', 'bin_labels', 'state_labels', 'west', 'trace' ] return sorted(set(return_list))