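# The classes below reference several module-level names that are not part of
# this excerpt: the worker functions _remote_min_max, _remote_bin_iter and
# _assign_label_pop, and the helpers isiterable and parse_pcoord_value, which
# are defined elsewhere in the WESTPA code base. The imports sketched here
# cover the names used directly in this excerpt; the exact module paths are an
# assumption based on a recent WESTPA layout and may need adjusting for other
# versions.
import logging

import h5py
import numpy

import westpa
from westpa.core import h5io
from westpa.core.binning import index_dtype
from westpa.core.data_manager import seg_id_dtype, weight_dtype
from westpa.core.extloader import get_object
from westpa.core.h5io import SingleIterDSSpec, WESTPAH5File
from westpa.tools import (
    BinMappingComponent,
    IterRangeSelection,
    ProgressIndicatorComponent,
    WESTDataReader,
    WESTDSSynthesizer,
    WESTParallelTool,
)

log = logging.getLogger(__name__)
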
class WPDist(WESTParallelTool):
    prog = 'w_pdist'
    description = '''\
Calculate time-resolved, multi-dimensional probability distributions of WE
datasets.

-----------------------------------------------------------------------------
Source data
-----------------------------------------------------------------------------

Source data is provided either by a user-specified function
(--construct-dataset) or a list of "data set specifications" (--dsspecs).
If neither is provided, the progress coordinate dataset ''pcoord'' is used.

To use a custom function to extract or calculate data whose probability
distribution will be calculated, specify the function in standard Python
MODULE.FUNCTION syntax as the argument to --construct-dataset. This function
will be called as function(n_iter,iter_group), where n_iter is the iteration
whose data are being considered and iter_group is the corresponding group in
the main WEST HDF5 file (west.h5). The function must return data which can
be indexed as [segment][timepoint][dimension].

To use a list of data set specifications, specify --dsspecs and then list the
desired datasets one-by-one (space-separated in most shells). These data set
specifications are formatted as NAME[,file=FILENAME,slice=SLICE], which will
use the dataset called NAME in the HDF5 file FILENAME (defaulting to the main
WEST HDF5 file west.h5), and slice it with the Python slice expression SLICE
(as in [0:2] to select the first two elements of the first axis of the
dataset). The ``slice`` option is most useful for selecting one column (or
more) from a multi-column dataset, such as arises when using a progress
coordinate of multiple dimensions.

-----------------------------------------------------------------------------
Histogram binning
-----------------------------------------------------------------------------

By default, histograms are constructed with 100 bins in each dimension. This
can be overridden by specifying -b/--bins, which accepts a number of different
kinds of arguments:

  a single integer N
    N uniformly spaced bins will be used in each dimension.

  a sequence of integers N1,N2,... (comma-separated)
    N1 uniformly spaced bins will be used for the first dimension, N2 for the
    second, and so on.

  a list of lists [[B11, B12, B13, ...], [B21, B22, B23, ...], ...]
    The bin boundaries B11, B12, B13, ... will be used for the first
    dimension, B21, B22, B23, ... for the second dimension, and so on. These
    bin boundaries need not be uniformly spaced.

These expressions will be evaluated with Python's ``eval`` construct, with
``numpy`` available for use [e.g. to specify bins using numpy.arange()].

The first two forms (integer, list of integers) will trigger a scan of all
data in each dimension in order to determine the minimum and maximum values,
which may be very expensive for large datasets. This can be avoided by
explicitly providing bin boundaries using the list-of-lists form.

Note that these bins are *NOT* at all related to the bins used to drive WE
sampling.

-----------------------------------------------------------------------------
Output format
-----------------------------------------------------------------------------

The output file produced (specified by -o/--output, defaulting to "pdist.h5")
may be fed to plothist to generate plots (or appropriately processed text or
HDF5 files) from this data. In short, the following datasets are created:

  ``histograms``
    Normalized histograms. The first axis corresponds to iteration, and
    remaining axes correspond to dimensions of the input dataset.
  ``/binbounds_0``
    Vector of bin boundaries for the first (index 0) dimension. Additional
    datasets similarly named (/binbounds_1, /binbounds_2, ...) are created
    for additional dimensions.

  ``/midpoints_0``
    Vector of bin midpoints for the first (index 0) dimension. Additional
    datasets similarly named are created for additional dimensions.

  ``n_iter``
    Vector of iteration numbers corresponding to the stored histograms (i.e.
    the first axis of the ``histograms`` dataset).

-----------------------------------------------------------------------------
Subsequent processing
-----------------------------------------------------------------------------

The output generated by this program (-o/--output, default "pdist.h5") may be
plotted by the ``plothist`` program. See ``plothist --help`` for more
information.

-----------------------------------------------------------------------------
Parallelization
-----------------------------------------------------------------------------

This tool supports parallelized binning, including reading of input data.
Parallel processing is the default. For simple cases (reading pre-computed
input data, modest numbers of segments), serial processing (--serial) may be
more efficient.

-----------------------------------------------------------------------------
Command-line options
-----------------------------------------------------------------------------
'''

    def __init__(self):
        super(WPDist, self).__init__()

        # Parallel processing by default (this is not actually necessary, but it is
        # informative!)
        self.wm_env.default_work_manager = self.wm_env.default_parallel_work_manager

        # These are used throughout
        self.progress = ProgressIndicatorComponent()
        self.data_reader = WESTDataReader()
        self.input_dssynth = WESTDSSynthesizer(default_dsname='pcoord')
        self.iter_range = IterRangeSelection(self.data_reader)
        self.iter_range.include_args['iter_step'] = False
        self.binspec = None
        self.output_filename = None
        self.output_file = None

        self.dsspec = None
        self.wt_dsspec = None  # dsspec for weights

        # These are used during histogram generation only
        self.iter_start = None
        self.iter_stop = None
        self.ndim = None
        self.ntimepoints = None
        self.dset_dtype = None
        self.binbounds = None  # bin boundaries for each dimension
        self.midpoints = None  # bin midpoints for each dimension
        self.data_range = None  # data range for each dimension, as the pairs (min,max)
        self.ignore_out_of_range = False
        self.compress_output = False

    def add_args(self, parser):
        self.data_reader.add_args(parser)
        self.iter_range.add_args(parser)

        parser.add_argument(
            '-b',
            '--bins',
            dest='bins',
            metavar='BINEXPR',
            default='100',
            help='''Use BINEXPR for bins. This may be an integer, which will be used for each
            dimension of the progress coordinate; a list of integers (formatted as [n1,n2,...])
            which will use n1 bins for the first dimension, n2 for the second dimension, and so
            on; or a list of lists of boundaries (formatted as [[a1, a2, ...], [b1, b2, ...], ... ]),
            which will use [a1, a2, ...] as bin boundaries for the first dimension, [b1, b2, ...]
            as bin boundaries for the second dimension, and so on.
            (Default: 100 bins in each dimension.)''',
        )

        parser.add_argument(
            '-o', '--output', dest='output', default='pdist.h5', help='''Store results in OUTPUT (default: %(default)s).'''
        )
        parser.add_argument(
            '-C',
            '--compress',
            action='store_true',
            help='''Compress histograms. May make storage of higher-dimensional histograms
            more tractable, at the (possible extreme) expense of increased analysis time.
            (Default: no compression.)''',
        )
        parser.add_argument(
            '--loose',
            dest='ignore_out_of_range',
            action='store_true',
            help='''Ignore values that do not fall within bins. (Risky, as this can make buggy
            bin boundaries appear as reasonable data. Only use if you are sure of your bin
            boundary specification.)''',
        )

        igroup = parser.add_argument_group('input dataset options').add_mutually_exclusive_group(required=False)
        igroup.add_argument(
            '--construct-dataset',
            help='''Use the given function (as in module.function) to extract source data.
            This function will be called once per iteration as function(n_iter, iter_group)
            to construct data for one iteration. Data returned must be indexable as
            [seg_id][timepoint][dimension]''',
        )
        igroup.add_argument(
            '--dsspecs', nargs='+', metavar='DSSPEC', help='''Construct probability distribution from one or more DSSPECs.'''
        )

        self.progress.add_args(parser)

    def process_args(self, args):
        self.progress.process_args(args)
        self.data_reader.process_args(args)
        self.input_dssynth.h5filename = self.data_reader.we_h5filename
        self.input_dssynth.process_args(args)
        self.dsspec = self.input_dssynth.dsspec

        # Carrying an open HDF5 file across a fork() seems to corrupt the entire HDF5 library
        # Open the WEST HDF5 file just long enough to process our iteration range, then close
        # and reopen in go() [which executes after the fork]
        with self.data_reader:
            self.iter_range.process_args(args)

        self.wt_dsspec = SingleIterDSSpec(self.data_reader.we_h5filename, 'seg_index', slice=numpy.index_exp['weight'])

        self.binspec = args.bins
        self.output_filename = args.output
        self.ignore_out_of_range = bool(args.ignore_out_of_range)
        self.compress_output = args.compress or False

    def go(self):
        self.data_reader.open('r')
        pi = self.progress.indicator
        pi.operation = 'Initializing'
        with pi:
            self.output_file = h5py.File(self.output_filename, 'w')
            h5io.stamp_creator_data(self.output_file)

            self.iter_start = self.iter_range.iter_start
            self.iter_stop = self.iter_range.iter_stop

            # Construct bin boundaries
            self.construct_bins(self.parse_binspec(self.binspec))
            for idim, (binbounds, midpoints) in enumerate(zip(self.binbounds, self.midpoints)):
                self.output_file['binbounds_{}'.format(idim)] = binbounds
                self.output_file['midpoints_{}'.format(idim)] = midpoints

            # construct histogram
            self.construct_histogram()

            # Record iteration range
            iter_range = self.iter_range.iter_range()
            self.output_file['n_iter'] = iter_range
            self.iter_range.record_data_iter_range(self.output_file['histograms'])

            self.output_file.close()

    @staticmethod
    def parse_binspec(binspec):
        namespace = {'numpy': numpy, 'inf': float('inf')}

        try:
            binspec_compiled = eval(binspec, namespace)
        except Exception as e:
            raise ValueError('invalid bin specification: {!r}'.format(e))
        else:
            if log.isEnabledFor(logging.DEBUG):
                log.debug('bin specs: {!r}'.format(binspec_compiled))
        return binspec_compiled

    def construct_bins(self, bins):
        '''
        Construct bins according to ``bins``, which may be:

          1) A scalar integer (for that number of bins in each dimension)
          2) A sequence of integers (specifying number of bins for each dimension)
          3) A sequence of sequences of bin boundaries (specifying boundaries for each dimension)

        Sets ``self.binbounds`` to a list of arrays of bin boundaries appropriate for passing
        to fasthist.histnd, along with ``self.midpoints`` to the midpoints of the bins.
        '''

        if not isiterable(bins):
            self._construct_bins_from_scalar(bins)
        elif not isiterable(bins[0]):
            self._construct_bins_from_int_seq(bins)
        else:
            self._construct_bins_from_bound_seqs(bins)

        if log.isEnabledFor(logging.DEBUG):
            log.debug('binbounds: {!r}'.format(self.binbounds))

    def scan_data_shape(self):
        if self.ndim is None:
            dset = self.dsspec.get_iter_data(self.iter_start)
            self.ntimepoints = dset.shape[1]
            self.ndim = dset.shape[2]
            self.dset_dtype = dset.dtype

    def scan_data_range(self):
        '''Scan input data for range in each dimension. The number of dimensions is
        determined from the shape of the progress coordinate as of self.iter_start.'''

        self.progress.indicator.new_operation('Scanning for data range', self.iter_stop - self.iter_start)
        self.scan_data_shape()

        dset_dtype = self.dset_dtype
        ndim = self.ndim
        dsspec = self.dsspec

        try:
            minval = numpy.finfo(dset_dtype).min
            maxval = numpy.finfo(dset_dtype).max
        except ValueError:
            minval = numpy.iinfo(dset_dtype).min
            maxval = numpy.iinfo(dset_dtype).max

        data_range = self.data_range = [(maxval, minval) for _i in range(self.ndim)]

        for future in self.work_manager.submit_as_completed(
            ((_remote_min_max, (ndim, dset_dtype, n_iter, dsspec), {}) for n_iter in range(self.iter_start, self.iter_stop)),
            self.max_queue_len,
        ):
            bounds = future.get_result(discard=True)
            for idim in range(ndim):
                current_min, current_max = data_range[idim]
                current_min = min(current_min, bounds[idim][0])
                current_max = max(current_max, bounds[idim][1])
                data_range[idim] = (current_min, current_max)
            self.progress.indicator.progress += 1

    def _construct_bins_from_scalar(self, bins):
        if self.data_range is None:
            self.scan_data_range()

        self.binbounds = []
        self.midpoints = []
        for idim in range(self.ndim):
            lb, ub = self.data_range[idim]
            # Advance just beyond the upper bound of the range, so that we catch
            # the maximum in the histogram
            ub *= 1.01

            boundset = numpy.linspace(lb, ub, bins + 1)
            midpoints = (boundset[:-1] + boundset[1:]) / 2.0
            self.binbounds.append(boundset)
            self.midpoints.append(midpoints)

    def _construct_bins_from_int_seq(self, bins):
        if self.data_range is None:
            self.scan_data_range()

        self.binbounds = []
        self.midpoints = []
        for idim in range(self.ndim):
            lb, ub = self.data_range[idim]
            # Advance just beyond the upper bound of the range, so that we catch
            # the maximum in the histogram
            ub *= 1.01

            boundset = numpy.linspace(lb, ub, bins[idim] + 1)
            midpoints = (boundset[:-1] + boundset[1:]) / 2.0
            self.binbounds.append(boundset)
            self.midpoints.append(midpoints)

    def _construct_bins_from_bound_seqs(self, bins):
        self.binbounds = []
        self.midpoints = []
        for boundset in bins:
            boundset = numpy.asarray(boundset)
            if (numpy.diff(boundset) <= 0).any():
                raise ValueError('boundary set {!r} is not strictly monotonically increasing'.format(boundset))
            self.binbounds.append(boundset)
            self.midpoints.append((boundset[:-1] + boundset[1:]) / 2.0)

    def construct_histogram(self):
        '''Construct a histogram using bins previously constructed with ``construct_bins()``.
        The time series of histogram values is stored in ``histograms``.
        Each histogram in the time series is normalized.'''

        self.scan_data_shape()

        iter_count = self.iter_stop - self.iter_start
        histograms_ds = self.output_file.create_dataset(
            'histograms',
            dtype=numpy.float64,
            shape=((iter_count,) + tuple(len(bounds) - 1 for bounds in self.binbounds)),
            compression=9 if self.compress_output else None,
        )
        binbounds = [numpy.require(boundset, self.dset_dtype, 'C') for boundset in self.binbounds]

        self.progress.indicator.new_operation('Constructing histograms', self.iter_stop - self.iter_start)
        task_gen = (
            (
                _remote_bin_iter,
                (iiter, n_iter, self.dsspec, self.wt_dsspec, 1 if iiter > 0 else 0, binbounds, self.ignore_out_of_range),
                {},
            )
            for (iiter, n_iter) in enumerate(range(self.iter_start, self.iter_stop))
        )

        log.debug('max queue length: {!r}'.format(self.max_queue_len))
        for future in self.work_manager.submit_as_completed(task_gen, self.max_queue_len):
            iiter, n_iter, iter_hist = future.get_result(discard=True)
            self.progress.indicator.progress += 1

            # store histogram
            histograms_ds[iiter] = iter_hist
            del iter_hist, future
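
# The sketch below is illustrative only (not part of the tool): it mirrors how
# parse_binspec() and the _construct_bins_from_bound_seqs() helper above turn a
# -b/--bins expression into bin boundaries and midpoints. The function name and
# the example boundary values are placeholders. Typical command lines, based on
# the options documented in the class docstring, might look like:
#
#   w_pdist                                   # 100 uniform bins per dimension
#   w_pdist -b '[50,25]'                      # 50 bins in dim 0, 25 in dim 1
#   w_pdist -b '[numpy.arange(0.0, 10.1, 0.1)]' -o pdist.h5   # explicit boundaries
#   w_pdist --construct-dataset my_module.my_dataset_function # custom dataset
def _demo_binspec(binspec_string='[[0.0, 1.0, 2.0, 5.0], [0, 1, 2]]'):
    # Evaluate the expression the same way parse_binspec() does, with numpy
    # available in the evaluation namespace.
    namespace = {'numpy': numpy, 'inf': float('inf')}
    bins = eval(binspec_string, namespace)

    # Boundary-sequence form: each inner sequence gives the bin edges for one
    # dimension; midpoints are the centers of adjacent edges.
    binbounds, midpoints = [], []
    for boundset in bins:
        boundset = numpy.asarray(boundset, dtype=float)
        binbounds.append(boundset)
        midpoints.append((boundset[:-1] + boundset[1:]) / 2.0)
    return binbounds, midpoints
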
class WAssign(WESTParallelTool):
    prog = 'w_assign'
    description = '''\
Assign walkers to bins, producing a file (by default named "assign.h5")
which can be used in subsequent analysis.

For consistency in subsequent analysis operations, the entire dataset
must be assigned, even if only a subset of the data will be used. This
ensures that analyses that rely on tracing trajectories always know the
originating bin of each trajectory.

-----------------------------------------------------------------------------
Source data
-----------------------------------------------------------------------------

Source data is provided either by a user-specified function
(--construct-dataset) or a list of "data set specifications" (--dsspecs).
If neither is provided, the progress coordinate dataset ''pcoord'' is used.

To use a custom function to extract or calculate data whose probability
distribution will be calculated, specify the function in standard Python
MODULE.FUNCTION syntax as the argument to --construct-dataset. This function
will be called as function(n_iter,iter_group), where n_iter is the iteration
whose data are being considered and iter_group is the corresponding group in
the main WEST HDF5 file (west.h5). The function must return data which can
be indexed as [segment][timepoint][dimension].

To use a list of data set specifications, specify --dsspecs and then list the
desired datasets one-by-one (space-separated in most shells). These data set
specifications are formatted as NAME[,file=FILENAME,slice=SLICE], which will
use the dataset called NAME in the HDF5 file FILENAME (defaulting to the main
WEST HDF5 file west.h5), and slice it with the Python slice expression SLICE
(as in [0:2] to select the first two elements of the first axis of the
dataset). The ``slice`` option is most useful for selecting one column (or
more) from a multi-column dataset, such as arises when using a progress
coordinate of multiple dimensions.

-----------------------------------------------------------------------------
Specifying macrostates
-----------------------------------------------------------------------------

Optionally, kinetic macrostates may be defined in terms of sets of bins. Each
trajectory will be labeled with the kinetic macrostate it was most recently
in at each timepoint, for use in subsequent kinetic analysis. This is required
for all kinetics analysis (w_kintrace and w_kinmat).

There are three ways to specify macrostates:

  1. States corresponding to single bins may be identified on the command
     line using the --states option, which takes multiple arguments, one for
     each state (separated by spaces in most shells). Each state is specified
     as a coordinate tuple, with an optional label prepended, as in
     ``bound:1.0`` or ``unbound:(2.5,2.5)``. Unlabeled states are named
     ``stateN``, where N is the (zero-based) position in the list of states
     supplied to --states.

  2. States corresponding to multiple bins may use a YAML input file specified
     with --states-from-file. This file defines a list of states, each with a
     name and a list of coordinate tuples; bins containing these coordinates
     will be mapped to the containing state.
     For instance, the following file::

        ---
        states:
          - label: unbound
            coords:
              - [9.0, 1.0]
              - [9.0, 2.0]
          - label: bound
            coords:
              - [0.1, 0.0]

     produces two macrostates: the first state is called "unbound" and
     consists of bins containing the (2-dimensional) progress coordinate
     values (9.0, 1.0) and (9.0, 2.0); the second state is called "bound"
     and consists of the single bin containing the point (0.1, 0.0).

  3. Arbitrary state definitions may be supplied by a user-defined function,
     specified as --states-from-function=MODULE.FUNCTION. This function is
     called with the bin mapper as an argument (``function(mapper)``) and
     must return a list of dictionaries, one per state. Each dictionary must
     contain a vector of coordinate tuples with key "coords"; the bins into
     which each of these tuples falls define the state. An optional name for
     the state (with key "label") may also be provided.

-----------------------------------------------------------------------------
Output format
-----------------------------------------------------------------------------

The output file (-o/--output, by default "assign.h5") contains the following
attributes and datasets:

  ``nbins`` attribute
    *(Integer)* Number of valid bins. Bin assignments range from 0 to
    *nbins*-1, inclusive.

  ``nstates`` attribute
    *(Integer)* Number of valid macrostates (may be zero if no such states
    are specified). Trajectory ensemble assignments range from 0 to
    *nstates*-1, inclusive, when states are defined.

  ``/assignments`` [iteration][segment][timepoint]
    *(Integer)* Per-segment and -timepoint assignments (bin indices).

  ``/npts`` [iteration]
    *(Integer)* Number of timepoints in each iteration.

  ``/nsegs`` [iteration]
    *(Integer)* Number of segments in each iteration.

  ``/labeled_populations`` [iterations][state][bin]
    *(Floating-point)* Per-iteration and -timepoint bin populations, labeled
    by most recently visited macrostate. The last state entry (*nstates-1*)
    corresponds to trajectories initiated outside of a defined macrostate.

  ``/bin_labels`` [bin]
    *(String)* Text labels of bins.

When macrostate assignments are given, the following additional datasets are
present:

  ``/trajlabels`` [iteration][segment][timepoint]
    *(Integer)* Per-segment and -timepoint trajectory labels, indicating the
    macrostate which each trajectory last visited.

  ``/state_labels`` [state]
    *(String)* Labels of states.

  ``/state_map`` [bin]
    *(Integer)* Mapping of bin index to the macrostate containing that bin.
    An entry will contain *nstates* (one past the last valid state index) if
    that bin does not fall into a macrostate.

Datasets indexed by state and bin contain one more entry than the number of
valid states or bins. For *N* bins, axes indexed by bin are of size *N+1*,
and entry *N* (0-based indexing) corresponds to a walker outside of the
defined bin space (which will cause most mappers to raise an error). More
importantly, for *M* states (including the case *M=0* where no states are
specified), axes indexed by state are of size *M+1* and entry *M* refers to
trajectories initiated in a region not corresponding to a defined macrostate.

Thus, ``labeled_populations[:,:,:].sum(axis=1)[:,:-1]`` gives overall per-bin
populations for all defined bins, and
``labeled_populations[:,:,:].sum(axis=2)[:,:-1]`` gives overall
per-trajectory-ensemble populations for all defined states.

-----------------------------------------------------------------------------
Parallelization
-----------------------------------------------------------------------------

This tool supports parallelized binning, including reading/calculating input
data.

-----------------------------------------------------------------------------
Command-line options
-----------------------------------------------------------------------------
'''

    def __init__(self):
        super(WAssign, self).__init__()

        # Parallel processing by default (this is not actually necessary, but it is
        # informative!)
        self.wm_env.default_work_manager = self.wm_env.default_parallel_work_manager

        self.data_reader = WESTDataReader()
        self.dssynth = WESTDSSynthesizer(default_dsname='pcoord')
        self.binning = BinMappingComponent()
        self.progress = ProgressIndicatorComponent()
        self.output_file = None
        self.output_filename = None
        self.states = []
        self.subsample = False

    def add_args(self, parser):
        self.data_reader.add_args(parser)
        self.binning.add_args(parser)
        self.dssynth.add_args(parser)

        sgroup = parser.add_argument_group('macrostate definitions').add_mutually_exclusive_group()
        sgroup.add_argument(
            '--states',
            nargs='+',
            metavar='STATEDEF',
            help='''Single-bin kinetic macrostate, specified by a coordinate tuple (e.g. '1.0' or
            '[1.0,1.0]'), optionally labeled (e.g. 'bound:[1.0,1.0]'). States corresponding to
            multiple bins must be specified with --states-from-file.''',
        )
        sgroup.add_argument(
            '--states-from-file',
            metavar='STATEFILE',
            help='''Load kinetic macrostates from the YAML file STATEFILE. See description
            above for the appropriate structure.''',
        )
        sgroup.add_argument(
            '--states-from-function',
            metavar='STATEFUNC',
            help='''Load kinetic macrostates from the function STATEFUNC, specified as
            module_name.func_name. This function is called with the bin mapper as an argument,
            and must return a list of dictionaries {'label': state_label, 'coords': 2d_array_like}
            one for each macrostate; the 'coords' entry must contain enough rows to identify all
            bins in the macrostate.''',
        )

        agroup = parser.add_argument_group('other options')
        agroup.add_argument(
            '-o', '--output', dest='output', default='assign.h5', help='''Store results in OUTPUT (default: %(default)s).'''
        )
        agroup.add_argument(
            '--subsample',
            dest='subsample',
            action='store_const',
            const=True,
            help='''Determines whether or not the data should be subsampled.
            This is rather useful for analysing steady state simulations.''',
        )
        agroup.add_argument(
            '--config-from-file',
            dest='config_from_file',
            action='store_true',
            help='''Load bins/macrostates from a scheme specified in west.cfg.''',
        )
        agroup.add_argument('--scheme-name', dest='scheme', help='''Name of scheme specified in west.cfg.''')

    def process_args(self, args):
        self.progress.process_args(args)
        self.data_reader.process_args(args)
        # Necessary to open the file to get the current iteration
        # if we want to use the mapper in the file
        self.data_reader.open(mode='r+')
        self.n_iter = self.data_reader.current_iteration
        # If we decide to use this option for iteration selection:
        # getattr(args,'bins_from_h5file',None) or self.data_reader.current_iteration

        with self.data_reader:
            self.dssynth.h5filename = self.data_reader.we_h5filename
            self.dssynth.process_args(args)
            if not args.config_from_file:
                self.binning.set_we_h5file_info(self.n_iter, self.data_reader)
                self.binning.process_args(args)

        self.output_filename = args.output

        if args.config_from_file:
            if not args.scheme:
                raise ValueError('A scheme must be specified.')
            else:
                self.load_config_from_west(args.scheme)
        elif args.states:
            self.parse_cmdline_states(args.states)
        elif args.states_from_file:
            self.load_state_file(args.states_from_file)
        elif args.states_from_function:
            self.load_states_from_function(get_object(args.states_from_function, path=['.']))

        if self.states and len(self.states) < 2:
            raise ValueError('zero, two, or more macrostates are required')

        log.debug('state list: {!r}'.format(self.states))

        self.subsample = args.subsample if args.subsample is not None else False

    def parse_cmdline_states(self, state_strings):
        states = []
        for istring, state_string in enumerate(state_strings):
            try:
                (label, coord_str) = state_string.split(':')
            except ValueError:
                label = 'state{}'.format(istring)
                coord_str = state_string
            coord = parse_pcoord_value(coord_str)
            states.append({'label': label, 'coords': coord})
        self.states = states

    def load_config_from_west(self, scheme):
        try:
            config = westpa.rc.config['west']['analysis']
        except KeyError:
            raise ValueError('There is no configuration file specified.')
        ystates = config['analysis_schemes'][scheme]['states']
        self.states_from_dict(ystates)
        try:
            self.subsample = config['subsample']
        except KeyError:
            pass
        from westpa._rc import bins_from_yaml_dict

        self.binning.mapper = bins_from_yaml_dict(config['analysis_schemes'][scheme]['bins'][0])
        import os

        path = os.path.join(os.getcwd(), config['directory'], scheme)
        try:
            os.mkdir(config['directory'])
            os.mkdir(path)
        except OSError:
            pass
        self.output_filename = os.path.join(path, 'assign.h5')

    def load_state_file(self, state_filename):
        import yaml

        with open(state_filename, 'rt') as state_file:
            ydict = yaml.safe_load(state_file)
        ystates = ydict['states']
        self.states_from_dict(ystates)

    def states_from_dict(self, ystates):
        states = []
        for istate, ystate in enumerate(ystates):
            state = {}
            state['label'] = ystate.get('label', 'state{}'.format(istate))
            # coords can be:
            #  - a scalar, in which case it is one bin, 1-D
            #  - a single list, which is rejected as ambiguous
            #  - a list of lists, which is a list of coordinate tuples
            coords = numpy.array(ystate['coords'])
            if coords.ndim == 0:
                coords.shape = (1, 1)
            elif coords.ndim == 1:
                raise ValueError(
                    'list {!r} is ambiguous (list of 1-d coordinates, or single multi-d coordinate?)'.format(ystate['coords'])
                )
            elif coords.ndim > 2:
                raise ValueError('coordinates must be 2-D')
            state['coords'] = coords

            states.append(state)
        self.states = states
    def load_states_from_function(self, statefunc):
        states = statefunc(self.binning.mapper)
        for istate, state in enumerate(states):
            state.setdefault('label', 'state{}'.format(istate))
            try:
                state['coords'] = numpy.array(state['coords'])
            except KeyError:
                raise ValueError('state function {!r} returned a state {!r} without coordinates'.format(statefunc, state))
        self.states = states
        log.debug('loaded states: {!r}'.format(self.states))

    def assign_iteration(self, n_iter, nstates, nbins, state_map, last_labels):
        '''Method to encapsulate the segment slicing (into n_worker slices) and parallel job
        submission. Submits job(s), waits on completion, splices them back together.
        Returns: assignments, trajlabels, pops for this iteration'''

        futures = []

        iter_group = self.data_reader.get_iter_group(n_iter)
        nsegs, npts = iter_group['pcoord'].shape[:2]
        n_workers = self.work_manager.n_workers or 1
        assignments = numpy.empty((nsegs, npts), dtype=index_dtype)
        trajlabels = numpy.empty((nsegs, npts), dtype=index_dtype)
        statelabels = numpy.empty((nsegs, npts), dtype=index_dtype)
        pops = numpy.zeros((nstates + 1, nbins + 1), dtype=weight_dtype)

        # Submit jobs to work manager
        blocksize = nsegs // n_workers
        if nsegs % n_workers > 0:
            blocksize += 1

        def task_gen():
            if __debug__:
                checkset = set()
            for lb in range(0, nsegs, blocksize):
                ub = min(nsegs, lb + blocksize)
                if __debug__:
                    checkset.update(set(range(lb, ub)))
                args = ()
                kwargs = dict(
                    n_iter=n_iter,
                    lb=lb,
                    ub=ub,
                    mapper=self.binning.mapper,
                    nstates=nstates,
                    state_map=state_map,
                    last_labels=last_labels,
                    parent_id_dsspec=self.data_reader.parent_id_dsspec,
                    weight_dsspec=self.data_reader.weight_dsspec,
                    pcoord_dsspec=self.dssynth.dsspec,
                    subsample=self.subsample,
                )
                yield (_assign_label_pop, args, kwargs)
            if __debug__:
                assert checkset == set(range(nsegs)), 'segments missing: {}'.format(set(range(nsegs)) - checkset)

        for future in self.work_manager.submit_as_completed(task_gen(), queue_size=self.max_queue_len):
            assign_slice, traj_slice, slice_pops, lb, ub, state_slice = future.get_result(discard=True)
            assignments[lb:ub, :] = assign_slice
            trajlabels[lb:ub, :] = traj_slice
            statelabels[lb:ub, :] = state_slice
            pops += slice_pops
            del assign_slice, traj_slice, slice_pops, state_slice

        del futures
        return (assignments, trajlabels, pops, statelabels)

    def go(self):
        assert self.data_reader.parent_id_dsspec._h5file is None
        assert self.data_reader.weight_dsspec._h5file is None
        if hasattr(self.dssynth.dsspec, '_h5file'):
            assert self.dssynth.dsspec._h5file is None
        pi = self.progress.indicator
        pi.operation = 'Initializing'
        with pi, self.data_reader, WESTPAH5File(self.output_filename, 'w', creating_program=True) as self.output_file:
            assign = self.binning.mapper.assign

            # We always assign the entire simulation, so that no trajectory appears to start
            # in a transition region that doesn't get initialized in one.
            iter_start = 1
            iter_stop = self.data_reader.current_iteration

            h5io.stamp_iter_range(self.output_file, iter_start, iter_stop)

            nbins = self.binning.mapper.nbins
            self.output_file.attrs['nbins'] = nbins

            state_map = numpy.empty((self.binning.mapper.nbins + 1,), index_dtype)
            state_map[:] = 0  # state_id == nstates => unknown state

            # Recursive mappers produce a generator rather than a list of labels,
            # so consume the entire generator into a list
            labels = [numpy.string_(label) for label in self.binning.mapper.labels]

            self.output_file.create_dataset('bin_labels', data=labels, compression=9)

            if self.states:
                nstates = len(self.states)
                state_map[:] = nstates  # state_id == nstates => unknown state
                state_labels = [numpy.string_(state['label']) for state in self.states]

                for istate, sdict in enumerate(self.states):
                    assert state_labels[istate] == numpy.string_(sdict['label'])  # sanity check
                    state_assignments = assign(sdict['coords'])
                    for assignment in state_assignments:
                        state_map[assignment] = istate
                self.output_file.create_dataset('state_map', data=state_map, compression=9, shuffle=True)
                self.output_file['state_labels'] = state_labels  # + ['(unknown)']
            else:
                nstates = 0
            self.output_file.attrs['nstates'] = nstates
            # Stamp if this has been subsampled.
            self.output_file.attrs['subsampled'] = self.subsample

            iter_count = iter_stop - iter_start
            nsegs = numpy.empty((iter_count,), seg_id_dtype)
            npts = numpy.empty((iter_count,), seg_id_dtype)

            # scan for largest number of segments and largest number of points
            pi.new_operation('Scanning for segment and point counts', iter_stop - iter_start)
            for iiter, n_iter in enumerate(range(iter_start, iter_stop)):
                iter_group = self.data_reader.get_iter_group(n_iter)
                nsegs[iiter], npts[iiter] = iter_group['pcoord'].shape[0:2]
                pi.progress += 1
                del iter_group

            pi.new_operation('Preparing output')

            # create datasets
            self.output_file.create_dataset('nsegs', data=nsegs, shuffle=True, compression=9)
            self.output_file.create_dataset('npts', data=npts, shuffle=True, compression=9)

            max_nsegs = nsegs.max()
            max_npts = npts.max()

            assignments_shape = (iter_count, max_nsegs, max_npts)
            assignments_dtype = numpy.min_scalar_type(nbins)
            assignments_ds = self.output_file.create_dataset(
                'assignments',
                dtype=assignments_dtype,
                shape=assignments_shape,
                compression=4,
                shuffle=True,
                chunks=h5io.calc_chunksize(assignments_shape, assignments_dtype),
                fillvalue=nbins,
            )
            if self.states:
                trajlabel_dtype = numpy.min_scalar_type(nstates)
                trajlabels_ds = self.output_file.create_dataset(
                    'trajlabels',
                    dtype=trajlabel_dtype,
                    shape=assignments_shape,
                    compression=4,
                    shuffle=True,
                    chunks=h5io.calc_chunksize(assignments_shape, trajlabel_dtype),
                    fillvalue=nstates,
                )
                statelabels_ds = self.output_file.create_dataset(
                    'statelabels',
                    dtype=trajlabel_dtype,
                    shape=assignments_shape,
                    compression=4,
                    shuffle=True,
                    chunks=h5io.calc_chunksize(assignments_shape, trajlabel_dtype),
                    fillvalue=nstates,
                )

            pops_shape = (iter_count, nstates + 1, nbins + 1)
            pops_ds = self.output_file.create_dataset(
                'labeled_populations',
                dtype=weight_dtype,
                shape=pops_shape,
                compression=4,
                shuffle=True,
                chunks=h5io.calc_chunksize(pops_shape, weight_dtype),
            )
            h5io.label_axes(pops_ds, [numpy.string_(i) for i in ['iteration', 'state', 'bin']])

            pi.new_operation('Assigning to bins', iter_stop - iter_start)
            last_labels = None  # mapping of seg_id to last macrostate inhabited
            for iiter, n_iter in enumerate(range(iter_start, iter_stop)):
                # get iteration info in this block

                if iiter == 0:
                    last_labels = numpy.empty((nsegs[iiter],), index_dtype)
                    last_labels[:] = nstates  # unknown state

                # Slices this iteration into n_workers groups of segments, submits them to the
                # work manager, splices results back together
                assignments, trajlabels, pops, statelabels = self.assign_iteration(n_iter, nstates, nbins, state_map, last_labels)

                # Do stuff with this iteration's results
                last_labels = trajlabels[:, -1].copy()
                assignments_ds[iiter, 0:nsegs[iiter], 0:npts[iiter]] = assignments
                pops_ds[iiter] = pops
                if self.states:
                    trajlabels_ds[iiter, 0:nsegs[iiter], 0:npts[iiter]] = trajlabels
                    statelabels_ds[iiter, 0:nsegs[iiter], 0:npts[iiter]] = statelabels

                pi.progress += 1
                del assignments, trajlabels, pops, statelabels

            for dsname in 'assignments', 'npts', 'nsegs', 'labeled_populations', 'statelabels':
                h5io.stamp_iter_range(self.output_file[dsname], iter_start, iter_stop)
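
# Illustrative usage (a sketch based on the options documented in the class
# docstring above; file and module names such as states.yaml and my_module are
# placeholders, not part of this tool):
#
#   # single-bin macrostates given directly on the command line
#   w_assign --states 'unbound:(9.0,1.0)' 'bound:(0.1,0.0)'
#
#   # multi-bin macrostates from a YAML file with a top-level "states" list
#   w_assign --states-from-file states.yaml -o assign.h5
#
#   # macrostates built programmatically from the bin mapper
#   w_assign --states-from-function my_module.example_state_function
#
# A minimal sketch of a --states-from-function callable, assuming a 2-D
# progress coordinate; the labels and coordinates below simply reuse the values
# from the YAML example in the docstring.
def example_state_function(mapper):
    # The mapper argument (the tool's bin mapper) is available for more
    # elaborate constructions (e.g. inspecting mapper.nbins), but is not
    # needed when returning a fixed list of states.
    return [
        {'label': 'unbound', 'coords': numpy.array([[9.0, 1.0], [9.0, 2.0]])},
        {'label': 'bound', 'coords': numpy.array([[0.1, 0.0]])},
    ]
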
class WAssign(WESTParallelTool): prog='w_assign' description = '''\ Assign walkers to bins, producing a file (by default named "assign.h5") which can be used in subsequent analysis. For consistency in subsequent analysis operations, the entire dataset must be assigned, even if only a subset of the data will be used. This ensures that analyses that rely on tracing trajectories always know the originating bin of each trajectory. ----------------------------------------------------------------------------- Source data ----------------------------------------------------------------------------- Source data is provided either by a user-specified function (--construct-dataset) or a list of "data set specifications" (--dsspecs). If neither is provided, the progress coordinate dataset ''pcoord'' is used. To use a custom function to extract or calculate data whose probability distribution will be calculated, specify the function in standard Python MODULE.FUNCTION syntax as the argument to --construct-dataset. This function will be called as function(n_iter,iter_group), where n_iter is the iteration whose data are being considered and iter_group is the corresponding group in the main WEST HDF5 file (west.h5). The function must return data which can be indexed as [segment][timepoint][dimension]. To use a list of data set specifications, specify --dsspecs and then list the desired datasets one-by-one (space-separated in most shells). These data set specifications are formatted as NAME[,file=FILENAME,slice=SLICE], which will use the dataset called NAME in the HDF5 file FILENAME (defaulting to the main WEST HDF5 file west.h5), and slice it with the Python slice expression SLICE (as in [0:2] to select the first two elements of the first axis of the dataset). The ``slice`` option is most useful for selecting one column (or more) from a multi-column dataset, such as arises when using a progress coordinate of multiple dimensions. ----------------------------------------------------------------------------- Specifying macrostates ----------------------------------------------------------------------------- Optionally, kinetic macrostates may be defined in terms of sets of bins. Each trajectory will be labeled with the kinetic macrostate it was most recently in at each timepoint, for use in subsequent kinetic analysis. This is required for all kinetics analysis (w_kintrace and w_kinmat). There are three ways to specify macrostates: 1. States corresponding to single bins may be identified on the command line using the --states option, which takes multiple arguments, one for each state (separated by spaces in most shells). Each state is specified as a coordinate tuple, with an optional label prepended, as in ``bound:1.0`` or ``unbound:(2.5,2.5)``. Unlabeled states are named ``stateN``, where N is the (zero-based) position in the list of states supplied to --states. 2. States corresponding to multiple bins may use a YAML input file specified with --states-from-file. This file defines a list of states, each with a name and a list of coordinate tuples; bins containing these coordinates will be mapped to the containing state. 
For instance, the following file:: --- states: - label: unbound coords: - [9.0, 1.0] - [9.0, 2.0] - label: bound coords: - [0.1, 0.0] produces two macrostates: the first state is called "unbound" and consists of bins containing the (2-dimensional) progress coordinate values (9.0, 1.0) and (9.0, 2.0); the second state is called "bound" and consists of the single bin containing the point (0.1, 0.0). 3. Arbitrary state definitions may be supplied by a user-defined function, specified as --states-from-function=MODULE.FUNCTION. This function is called with the bin mapper as an argument (``function(mapper)``) and must return a list of dictionaries, one per state. Each dictionary must contain a vector of coordinate tuples with key "coords"; the bins into which each of these tuples falls define the state. An optional name for the state (with key "label") may also be provided. ----------------------------------------------------------------------------- Output format ----------------------------------------------------------------------------- The output file (-o/--output, by default "assign.h5") contains the following attributes datasets: ``nbins`` attribute *(Integer)* Number of valid bins. Bin assignments range from 0 to *nbins*-1, inclusive. ``nstates`` attribute *(Integer)* Number of valid macrostates (may be zero if no such states are specified). Trajectory ensemble assignments range from 0 to *nstates*-1, inclusive, when states are defined. ``/assignments`` [iteration][segment][timepoint] *(Integer)* Per-segment and -timepoint assignments (bin indices). ``/npts`` [iteration] *(Integer)* Number of timepoints in each iteration. ``/nsegs`` [iteration] *(Integer)* Number of segments in each iteration. ``/labeled_populations`` [iterations][state][bin] *(Floating-point)* Per-iteration and -timepoint bin populations, labeled by most recently visited macrostate. The last state entry (*nstates-1*) corresponds to trajectories initiated outside of a defined macrostate. ``/bin_labels`` [bin] *(String)* Text labels of bins. When macrostate assignments are given, the following additional datasets are present: ``/trajlabels`` [iteration][segment][timepoint] *(Integer)* Per-segment and -timepoint trajectory labels, indicating the macrostate which each trajectory last visited. ``/state_labels`` [state] *(String)* Labels of states. ``/state_map`` [bin] *(Integer)* Mapping of bin index to the macrostate containing that bin. An entry will contain *nbins+1* if that bin does not fall into a macrostate. Datasets indexed by state and bin contain one more entry than the number of valid states or bins. For *N* bins, axes indexed by bin are of size *N+1*, and entry *N* (0-based indexing) corresponds to a walker outside of the defined bin space (which will cause most mappers to raise an error). More importantly, for *M* states (including the case *M=0* where no states are specified), axes indexed by state are of size *M+1* and entry *M* refers to trajectories initiated in a region not corresponding to a defined macrostate. Thus, ``labeled_populations[:,:,:].sum(axis=1)[:,:-1]`` gives overall per-bin populations, for all defined bins and ``labeled_populations[:,:,:].sum(axis=2)[:,:-1]`` gives overall per-trajectory-ensemble populations for all defined states. ----------------------------------------------------------------------------- Parallelization ----------------------------------------------------------------------------- This tool supports parallelized binning, including reading/calculating input data. 

-----------------------------------------------------------------------------
Command-line options
-----------------------------------------------------------------------------
'''

    def __init__(self):
        super(WAssign,self).__init__()

        # Parallel processing by default (this is not actually necessary, but it is
        # informative!)
        self.wm_env.default_work_manager = self.wm_env.default_parallel_work_manager

        self.data_reader = WESTDataReader()
        self.dssynth = WESTDSSynthesizer(default_dsname='pcoord')
        self.binning = BinMappingComponent()
        self.progress = ProgressIndicatorComponent()
        self.output_file = None
        self.output_filename = None
        self.states = []

    def add_args(self, parser):
        self.data_reader.add_args(parser)
        self.binning.add_args(parser, suppress=['--bins-from-h5file'])
        self.dssynth.add_args(parser)

        sgroup = parser.add_argument_group('macrostate definitions').add_mutually_exclusive_group()
        sgroup.add_argument('--states', nargs='+', metavar='STATEDEF',
                            help='''Single-bin kinetic macrostate, specified by a coordinate tuple (e.g. '1.0'
                            or '[1.0,1.0]'), optionally labeled (e.g. 'bound:[1.0,1.0]'). States corresponding
                            to multiple bins must be specified with --states-from-file.''')
        sgroup.add_argument('--states-from-file', metavar='STATEFILE',
                            help='''Load kinetic macrostates from the YAML file STATEFILE. See description
                            above for the appropriate structure.''')
        sgroup.add_argument('--states-from-function', metavar='STATEFUNC',
                            help='''Load kinetic macrostates from the function STATEFUNC, specified as
                            module_name.func_name. This function is called with the bin mapper as an argument,
                            and must return a list of dictionaries {'label': state_label, 'coords': 2d_array_like}
                            one for each macrostate; the 'coords' entry must contain enough rows to identify
                            all bins in the macrostate.''')

        agroup = parser.add_argument_group('other options')
        agroup.add_argument('-o', '--output', dest='output', default='assign.h5',
                            help='''Store results in OUTPUT (default: %(default)s).''')

    def process_args(self, args):
        self.progress.process_args(args)
        self.data_reader.process_args(args)
        with self.data_reader:
            self.dssynth.h5filename = self.data_reader.we_h5filename
            self.dssynth.process_args(args)
        self.binning.process_args(args)

        if args.states:
            self.parse_cmdline_states(args.states)
        elif args.states_from_file:
            self.load_state_file(args.states_from_file)
        elif args.states_from_function:
            self.load_states_from_function(get_object(args.states_from_function,path=['.']))

        if self.states and len(self.states) < 2:
            raise ValueError('zero, two, or more macrostates are required')

        #self.output_file = WESTPAH5File(args.output, 'w', creating_program=True)
        self.output_filename = args.output

        log.debug('state list: {!r}'.format(self.states))

    def parse_cmdline_states(self, state_strings):
        states = []
        for istring, state_string in enumerate(state_strings):
            try:
                (label, coord_str) = state_string.split(':')
            except ValueError:
                label = 'state{}'.format(istring)
                coord_str = state_string
            coord = parse_pcoord_value(coord_str)
            states.append({'label': label, 'coords': coord})
        self.states = states

    def load_state_file(self, state_filename):
        import yaml
        ydict = yaml.load(open(state_filename, 'rt'))
        ystates = ydict['states']

        states = []
        for istate, ystate in enumerate(ystates):
            state = {}
            state['label'] = ystate.get('label', 'state{}'.format(istate))

            # coords can be:
            #  - a scalar, in which case it is one bin, 1-D
            #  - a single list, which is rejected as ambiguous
            #  - a list of lists, which is a list of coordinate tuples
            coords = numpy.array(ystate['coords'])
            if coords.ndim == 0:
                coords.shape = (1,1)
            elif coords.ndim == 1:
                raise ValueError('list {!r} is ambiguous (list of 1-d coordinates, or single multi-d coordinate?)'
                                 .format(ystate['coords']))
            elif coords.ndim > 2:
                raise ValueError('coordinates must be 2-D')

            state['coords'] = coords
            states.append(state)
        self.states = states

    def load_states_from_function(self, statefunc):
        states = statefunc(self.binning.mapper)
        for istate, state in enumerate(states):
            state.setdefault('label','state{}'.format(istate))
            try:
                state['coords'] = numpy.array(state['coords'])
            except KeyError:
                raise ValueError('state function {!r} returned a state {!r} without coordinates'.format(statefunc,state))
        self.states = states
        log.debug('loaded states: {!r}'.format(self.states))

    def assign_iteration(self, n_iter, nstates, nbins, state_map, last_labels):
        ''' Method to encapsulate the segment slicing (into n_worker slices) and parallel job submission
            Submits job(s), waits on completion, splices them back together
            Returns: assignments, trajlabels, pops for this iteration'''

        futures = []

        iter_group = self.data_reader.get_iter_group(n_iter)
        nsegs, npts = iter_group['pcoord'].shape[:2]
        n_workers = self.work_manager.n_workers or 1
        assignments = numpy.empty((nsegs, npts), dtype=index_dtype)
        trajlabels = numpy.empty((nsegs, npts), dtype=index_dtype)
        pops = numpy.zeros((nstates+1,nbins+1), dtype=weight_dtype)

        #Submit jobs to work manager
        blocksize = nsegs // n_workers
        if nsegs % n_workers > 0:
            blocksize += 1

        def task_gen():
            if __debug__:
                checkset = set()
            for lb in xrange(0, nsegs, blocksize):
                ub = min(nsegs, lb+blocksize)
                if __debug__:
                    checkset.update(set(xrange(lb,ub)))
                args = ()
                kwargs = dict(n_iter=n_iter, lb=lb, ub=ub,
                              mapper=self.binning.mapper, nstates=nstates, state_map=state_map,
                              last_labels=last_labels,
                              parent_id_dsspec=self.data_reader.parent_id_dsspec,
                              weight_dsspec=self.data_reader.weight_dsspec,
                              pcoord_dsspec=self.dssynth.dsspec)
                yield (_assign_label_pop, args, kwargs)
                #futures.append(self.work_manager.submit(_assign_label_pop,
                #kwargs=)
            if __debug__:
                assert checkset == set(xrange(nsegs)), 'segments missing: {}'.format(set(xrange(nsegs)) - checkset)

        #for future in self.work_manager.as_completed(futures):
        for future in self.work_manager.submit_as_completed(task_gen(), queue_size=self.max_queue_len):
            assign_slice, traj_slice, slice_pops, lb, ub = future.get_result(discard=True)
            assignments[lb:ub, :] = assign_slice
            trajlabels[lb:ub, :] = traj_slice
            pops += slice_pops
            del assign_slice, traj_slice, slice_pops

        del futures
        return (assignments, trajlabels, pops)

    def go(self):
        assert self.data_reader.parent_id_dsspec._h5file is None
        assert self.data_reader.weight_dsspec._h5file is None
        if hasattr(self.dssynth.dsspec, '_h5file'):
            assert self.dssynth.dsspec._h5file is None

        pi = self.progress.indicator
        pi.operation = 'Initializing'
        with pi, self.data_reader, WESTPAH5File(self.output_filename, 'w', creating_program=True) as self.output_file:
            assign = self.binning.mapper.assign

            # We always assign the entire simulation, so that no trajectory appears to start
            # in a transition region that doesn't get initialized in one.
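            # Iteration numbering starts at 1; the loops below run over
            # iterations iter_start through iter_stop - 1, inclusive.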
            iter_start = 1
            iter_stop = self.data_reader.current_iteration

            h5io.stamp_iter_range(self.output_file, iter_start, iter_stop)

            nbins = self.binning.mapper.nbins
            self.output_file.attrs['nbins'] = nbins

            state_map = numpy.empty((self.binning.mapper.nbins+1,), index_dtype)
            state_map[:] = 0 # state_id == nstates => unknown state

            # Recursive mappers produce a generator rather than a list of labels
            # so consume the entire generator into a list
            labels = [label for label in self.binning.mapper.labels]

            self.output_file.create_dataset('bin_labels', data=labels, compression=9)

            if self.states:
                nstates = len(self.states)
                state_map[:] = nstates # state_id == nstates => unknown state
                state_labels = [state['label'] for state in self.states]

                for istate, sdict in enumerate(self.states):
                    assert state_labels[istate] == sdict['label'] #sanity check
                    state_assignments = assign(sdict['coords'])
                    for assignment in state_assignments:
                        state_map[assignment] = istate

                self.output_file.create_dataset('state_map', data=state_map, compression=9, shuffle=True)
                self.output_file['state_labels'] = state_labels #+ ['(unknown)']
            else:
                nstates = 0
            self.output_file.attrs['nstates'] = nstates

            iter_count = iter_stop - iter_start
            nsegs = numpy.empty((iter_count,), seg_id_dtype)
            npts = numpy.empty((iter_count,), seg_id_dtype)

            # scan for largest number of segments and largest number of points
            pi.new_operation('Scanning for segment and point counts', iter_stop-iter_start)
            for iiter, n_iter in enumerate(xrange(iter_start,iter_stop)):
                iter_group = self.data_reader.get_iter_group(n_iter)
                nsegs[iiter], npts[iiter] = iter_group['pcoord'].shape[0:2]
                pi.progress += 1
                del iter_group

            pi.new_operation('Preparing output')

            # create datasets
            self.output_file.create_dataset('nsegs', data=nsegs, shuffle=True, compression=9)
            self.output_file.create_dataset('npts', data=npts, shuffle=True, compression=9)

            max_nsegs = nsegs.max()
            max_npts = npts.max()

            assignments_shape = (iter_count,max_nsegs,max_npts)
            assignments_dtype = numpy.min_scalar_type(nbins)
            assignments_ds = self.output_file.create_dataset('assignments', dtype=assignments_dtype,
                                                             shape=assignments_shape, compression=4, shuffle=True,
                                                             chunks=h5io.calc_chunksize(assignments_shape, assignments_dtype),
                                                             fillvalue=nbins)
            if self.states:
                trajlabel_dtype = numpy.min_scalar_type(nstates)
                trajlabels_ds = self.output_file.create_dataset('trajlabels', dtype=trajlabel_dtype,
                                                                shape=assignments_shape, compression=4, shuffle=True,
                                                                chunks=h5io.calc_chunksize(assignments_shape, trajlabel_dtype),
                                                                fillvalue=nstates)

            pops_shape = (iter_count,nstates+1,nbins+1)
            pops_ds = self.output_file.create_dataset('labeled_populations', dtype=weight_dtype, shape=pops_shape,
                                                      compression=4, shuffle=True,
                                                      chunks=h5io.calc_chunksize(pops_shape, weight_dtype))
            h5io.label_axes(pops_ds, ['iteration', 'state', 'bin'])

            pi.new_operation('Assigning to bins', iter_stop-iter_start)
            last_labels = None # mapping of seg_id to last macrostate inhabited
            for iiter, n_iter in enumerate(xrange(iter_start,iter_stop)):
                #get iteration info in this block

                if iiter == 0:
                    last_labels = numpy.empty((nsegs[iiter],), index_dtype)
                    last_labels[:] = nstates #unknown state

                #Slices this iteration into n_workers groups of segments, submits them to wm, splices results back together
                assignments, trajlabels, pops = self.assign_iteration(n_iter, nstates, nbins, state_map, last_labels)

                ##Do stuff with this iteration's results

                last_labels = trajlabels[:,-1].copy()
                assignments_ds[iiter, 0:nsegs[iiter], 0:npts[iiter]] = assignments
                pops_ds[iiter] = pops
                if self.states:
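                    # Store this iteration's per-timepoint macrostate labels; the final
                    # timepoint was copied into last_labels above to seed the next iteration.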
                    trajlabels_ds[iiter, 0:nsegs[iiter], 0:npts[iiter]] = trajlabels

                pi.progress += 1
                del assignments, trajlabels, pops

            for dsname in 'assignments', 'npts', 'nsegs', 'labeled_populations':
                h5io.stamp_iter_range(self.output_file[dsname], iter_start, iter_stop)
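
The per-bin and per-state sums described in the w_assign output-format notes
above can be checked directly against a finished assignment file. The
following read-back sketch is illustrative only: it assumes an ``assign.h5``
produced by this tool is present in the working directory, and uses ``h5py``
directly rather than any part of the tool itself::

    # Read back an assign.h5 file and recover the per-bin and per-state
    # populations described in the w_assign output documentation.
    import h5py

    with h5py.File('assign.h5', 'r') as f:               # assumed file name
        nbins = f.attrs['nbins']                         # number of valid bins
        nstates = f.attrs['nstates']                     # number of valid macrostates (may be 0)
        labeled_pops = f['labeled_populations'][...]     # shape: (iterations, nstates+1, nbins+1)

    # Summing over the state axis gives per-bin populations; the trailing
    # entry on the bin axis is the "outside any defined bin" slot, dropped here.
    per_bin = labeled_pops.sum(axis=1)[:, :-1]

    # Summing over the bin axis gives per-trajectory-ensemble (state)
    # populations; the trailing entry on the state axis collects trajectories
    # that have not yet visited any defined macrostate.
    per_state = labeled_pops.sum(axis=2)[:, :-1]

    print('nbins =', nbins, 'per-bin populations shape:', per_bin.shape)
    print('nstates =', nstates, 'per-state populations shape:', per_state.shape)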