Example #1
    def __init__(self):
        super(WPDist, self).__init__()

        # Parallel processing by default (this is not actually necessary, but it is
        # informative!)
        self.wm_env.default_work_manager = self.wm_env.default_parallel_work_manager

        # These are used throughout
        self.progress = ProgressIndicatorComponent()
        self.data_reader = WESTDataReader()
        self.input_dssynth = WESTDSSynthesizer(default_dsname='pcoord')
        self.iter_range = IterRangeSelection(self.data_reader)
        self.iter_range.include_args['iter_step'] = False
        self.binspec = None
        self.output_filename = None
        self.output_file = None

        self.dsspec = None
        self.wt_dsspec = None  # dsspec for weights

        # These are used during histogram generation only
        self.iter_start = None
        self.iter_stop = None
        self.ndim = None
        self.ntimepoints = None
        self.dset_dtype = None
        self.binbounds = None  # bin boundaries for each dimension
        self.midpoints = None  # bin midpoints for each dimension
        self.data_range = None  # data range for each dimension, as the pairs (min,max)
        self.ignore_out_of_range = False
        self.compress_output = False
Example #2
 def __init__(self):
     super(WBinTool, self).__init__()
     self.subcommand = None
     self.data_reader = WESTDataReader()
     self.binning = BinMappingComponent()
     self.args = None
     self.n_iter = None
Example #3
class KineticsSubcommands(WESTSubcommand):
    '''Base class for common options for both kinetics schemes'''
    def __init__(self, parent):
        super(KineticsSubcommands, self).__init__(parent)
        self.progress = ProgressIndicatorComponent()
        self.data_reader = WESTDataReader()
        self.iter_range = IterRangeSelection()
        self.output_file = None
        self.assignments_file = None

        self.do_compression = True

    def add_args(self, parser):
        self.data_reader.add_args(parser)
        self.iter_range.add_args(parser)

        iogroup = parser.add_argument_group('input/output options')
        iogroup.add_argument(
            '-a',
            '--assignments',
            default='assign.h5',
            help='''Bin assignments and macrostate definitions are in ASSIGNMENTS
                                (default: %(default)s).''')
        # default_kinetics_file will be picked up as a class attribute from the appropriate
        # subclass
        iogroup.add_argument(
            '-o',
            '--output',
            dest='output',
            default=self.default_kinetics_file,
            help='''Store results in OUTPUT (default: %(default)s).''')
        iogroup.add_argument(
            '--no-compression',
            dest='compression',
            action='store_false',
            help=
            '''Do not store kinetics results compressed. This can increase disk
                             use about 100-fold, but can dramatically speed up subsequent analysis
                             for "w_kinavg matrix". Default: compress kinetics results.'''
        )
        self.progress.add_args(parser)
        parser.set_defaults(compression=True)

    def process_args(self, args):
        self.progress.process_args(args)
        self.assignments_file = h5io.WESTPAH5File(args.assignments, 'r')
        self.data_reader.process_args(args)
        with self.data_reader:
            self.iter_range.process_args(args)
        self.output_file = h5io.WESTPAH5File(args.output,
                                             'w',
                                             creating_program=True)
        h5io.stamp_creator_data(self.output_file)
        if not self.iter_range.check_data_iter_range_least(
                self.assignments_file):
            raise ValueError(
                'assignments do not span the requested iterations')
        self.do_compression = args.compression
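The ``--no-compression`` flag above relies on a standard argparse idiom: ``action='store_false'`` writes False into ``args.compression`` when the flag is given, while ``parser.set_defaults(compression=True)`` keeps compression on otherwise. A minimal, self-contained sketch of just that pattern (plain argparse; the program and option defaults here are illustrative, not WESTPA's):

import argparse

parser = argparse.ArgumentParser(prog='kinetics-demo')
iogroup = parser.add_argument_group('input/output options')
iogroup.add_argument('-o', '--output', default='kinetics.h5',
                     help='Store results in OUTPUT (default: %(default)s).')
iogroup.add_argument('--no-compression', dest='compression', action='store_false',
                     help='Do not store results compressed.')
parser.set_defaults(compression=True)

assert parser.parse_args([]).compression is True                      # flag absent: compress
assert parser.parse_args(['--no-compression']).compression is False   # flag given: do not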
Example #4
 def __init__(self):
     super(WTraceTool,self).__init__()
     
     self.data_reader = WESTDataReader()
     #self.h5storage = HDF5Storage()
     self.output_file = None
     self.output_pattern = None
     self.endpoints = None
     self.datasets = []
Example #5
    def __init__(self, parent):
        super(KineticsSubcommands, self).__init__(parent)
        self.progress = ProgressIndicatorComponent()
        self.data_reader = WESTDataReader()
        self.iter_range = IterRangeSelection()
        self.output_file = None
        self.assignments_file = None

        self.do_compression = True
Example #6
    def __init__(self):
        super(WNetworker, self).__init__()

        self.data_reader = WESTDataReader()
        self.iter_range = IterRangeSelection()
        self.progress = ProgressIndicatorComponent()
        self.output_filename = None
        self.tm_filename = None
        self.postprocess_function = None
Example #7
    def __init__(self):
        super(WCrawl,self).__init__()

        # These are used throughout
        self.progress = ProgressIndicatorComponent()
        self.data_reader = WESTDataReader()
        self.iter_range = IterRangeSelection(self.data_reader)

        self.crawler = None
        self.task_callable = None
Example #8
class WESTKineticsBase(WESTSubcommand):
    '''
    Common argument processing for w_direct/w_reweight subcommands.
    Mostly limited to handling input and output from w_assign.
    '''
    
    def __init__(self, parent):
        super(WESTKineticsBase,self).__init__(parent)
        
        self.data_reader = WESTDataReader()
        self.iter_range = IterRangeSelection()
        self.progress = ProgressIndicatorComponent()
        
        self.output_filename = None
        # This is actually applicable to both.
        self.assignment_filename = None
        
        self.output_file = None
        self.assignments_file = None
        
        self.evolution_mode = None
        
        self.mcbs_alpha = None
        self.mcbs_acalpha = None
        self.mcbs_nsets = None

        # Now we're adding in things that come from the old w_kinetics
        self.do_compression = True
        
            
    def add_args(self, parser):
        self.progress.add_args(parser)
        self.data_reader.add_args(parser)
        self.iter_range.include_args['iter_step'] = True
        self.iter_range.add_args(parser)

        iogroup = parser.add_argument_group('input/output options')
        iogroup.add_argument('-a', '--assignments', default='assign.h5',
                            help='''Bin assignments and macrostate definitions are in ASSIGNMENTS
                            (default: %(default)s).''')
        
        iogroup.add_argument('-o', '--output', dest='output', default=self.default_output_file,
                            help='''Store results in OUTPUT (default: %(default)s).''')

    def process_args(self, args):
        self.progress.process_args(args)
        self.data_reader.process_args(args)
        with self.data_reader:
            self.iter_range.process_args(args, default_iter_step=None)
        if self.iter_range.iter_step is None:
            #use about 10 blocks by default
            self.iter_range.iter_step = max(1, (self.iter_range.iter_stop - self.iter_range.iter_start) // 10)
        
        self.output_filename = args.output
        self.assignments_filename = args.assignments
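The ``process_args`` above falls back to splitting the requested iteration range into roughly ten blocks when no iteration step is given. A tiny standalone illustration of that default (the numbers are hypothetical):

def default_iter_step(iter_start, iter_stop):
    # Aim for about 10 blocks across the range, but never a step below 1.
    return max(1, (iter_stop - iter_start) // 10)

assert default_iter_step(1, 101) == 10   # 100 iterations -> ten blocks of 10
assert default_iter_step(1, 6) == 1      # short ranges still advance one iteration at a time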
Example #9
class WESTKineticsBase(WESTSubcommand):
    '''
    Common argument processing for w_direct/w_reweight subcommands.
    Mostly limited to handling input and output from w_assign.
    '''
    
    def __init__(self, parent):
        super(WESTKineticsBase,self).__init__(parent)
        
        self.data_reader = WESTDataReader()
        self.iter_range = IterRangeSelection()
        self.progress = ProgressIndicatorComponent()
        
        self.output_filename = None
        # This is actually applicable to both.
        self.assignment_filename = None
        
        self.output_file = None
        self.assignments_file = None
        
        self.evolution_mode = None
        
        self.mcbs_alpha = None
        self.mcbs_acalpha = None
        self.mcbs_nsets = None

        # Now we're adding in things that come from the old w_kinetics
        self.do_compression = True
        
            
    def add_args(self, parser):
        self.progress.add_args(parser)
        self.data_reader.add_args(parser)
        self.iter_range.include_args['iter_step'] = True
        self.iter_range.add_args(parser)

        iogroup = parser.add_argument_group('input/output options')
        iogroup.add_argument('-a', '--assignments', default='assign.h5',
                            help='''Bin assignments and macrostate definitions are in ASSIGNMENTS
                            (default: %(default)s).''')
        
        iogroup.add_argument('-o', '--output', dest='output', default=self.default_output_file,
                            help='''Store results in OUTPUT (default: %(default)s).''')

    def process_args(self, args):
        self.progress.process_args(args)
        self.data_reader.process_args(args)
        with self.data_reader:
            self.iter_range.process_args(args, default_iter_step=None)
        if self.iter_range.iter_step is None:
            #use about 10 blocks by default
            self.iter_range.iter_step = max(1, (self.iter_range.iter_stop - self.iter_range.iter_start) // 10)
        
        self.output_filename = args.output
        self.assignments_filename = args.assignments
Example #10
    def __init__(self):
        super(WSelectTool, self).__init__()

        self.data_reader = WESTDataReader()
        self.iter_range = IterRangeSelection()
        self.progress = ProgressIndicatorComponent()
        self.output_file = None
        self.output_filename = None
        self.predicate = None
        self.invert = False
        self.include_ancestors = False
Example #11
    def __init__(self):
        super(WNTopTool, self).__init__()

        self.data_reader = WESTDataReader()
        self.iter_range = IterRangeSelection()
        self.progress = ProgressIndicatorComponent()
        self.output_file = None
        self.assignments_filename = None
        self.output_filename = None
        self.what = None
        self.timepoint = None
        self.count = None
Example #12
    def __init__(self):
        super(WAssign, self).__init__()

        # Parallel processing by default (this is not actually necessary, but it is
        # informative!)
        self.wm_env.default_work_manager = self.wm_env.default_parallel_work_manager

        self.data_reader = WESTDataReader()
        self.dssynth = WESTDSSynthesizer(default_dsname='pcoord')
        self.binning = BinMappingComponent()
        self.progress = ProgressIndicatorComponent()
        self.output_file = None
        self.output_filename = None
        self.states = []
Example #13
    def __init__(self):
        super(WIPI,self).__init__()
        self.data_reader = WESTDataReader()
        self.wm_env.default_work_manager = self.wm_env.default_parallel_work_manager
        self.progress = ProgressIndicatorComponent()

        self._iter = 1
        self.config_required = True
        self.version = "1.0B"
        # Set to matplotlib if you want that.  But why would you?
        # Well, whatever, we'll just set it to that for now.
        self.interface = 'matplotlib'
        self._scheme = None
        global iteration
Example #14
    def __init__(self):
        super(WFluxanlTool, self).__init__()
        self.data_reader = WESTDataReader()
        self.iter_range = IterRangeSelection()
        self.output_h5file = None
        self.output_group = None
        self.target_groups = {}

        self.fluxdata = {}

        self.alpha = None
        self.autocorrel_alpha = None
        self.n_sets = None
        self.do_evol = False
        self.evol_step = 1
Example #15
    def __init__(self):
        super(WPostAnalysisReweightTool, self).__init__()

        self.data_reader = WESTDataReader()
        self.iter_range = IterRangeSelection()
        self.progress = ProgressIndicatorComponent()

        self.output_filename = None
        self.kinetics_filename = None
        self.assignment_filename = None

        self.output_file = None
        self.assignments_file = None
        self.kinetics_file = None

        self.evolution_mode = None
Example #16
 def __init__(self):
     super(WPDist,self).__init__()
     
     # Parallel processing by default (this is not actually necessary, but it is
     # informative!)
     self.wm_env.default_work_manager = self.wm_env.default_parallel_work_manager
     
     # These are used throughout
     self.progress = ProgressIndicatorComponent()
     self.data_reader = WESTDataReader()
     self.input_dssynth = WESTDSSynthesizer(default_dsname='pcoord')
     self.iter_range = IterRangeSelection(self.data_reader)
     self.iter_range.include_args['iter_step'] = False
     self.binspec = None
     self.output_filename = None
     self.output_file = None
     
     
     self.dsspec = None
     self.wt_dsspec = None # dsspec for weights
     
     # These are used during histogram generation only
     self.iter_start = None
     self.iter_stop = None
     self.ndim = None
     self.ntimepoints = None
     self.dset_dtype = None
     self.binbounds = None  # bin boundaries for each dimension
     self.midpoints = None  # bin midpoints for each dimension 
     self.data_range = None # data range for each dimension, as the pairs (min,max)
     self.ignore_out_of_range = False
     self.compress_output = False
Example #17
 def __init__(self):
     super(WBinTool,self).__init__()
     self.subcommand = None
     self.data_reader = WESTDataReader() 
     self.binning = BinMappingComponent()
     self.args = None
     self.n_iter = None
Example #18
 def __init__(self, parent):
     super(KineticsSubcommands,self).__init__(parent)
     self.progress = ProgressIndicatorComponent()
     self.data_reader = WESTDataReader()
     self.iter_range = IterRangeSelection() 
     self.output_file = None
     self.assignments_file = None
     
     self.do_compression = True
Example #19
    def __init__(self):
        super(StateProbTool, self).__init__()

        self.data_reader = WESTDataReader()
        self.iter_range = IterRangeSelection()
        self.progress = ProgressIndicatorComponent()

        self.output_filename = None
        self.kinetics_filename = None

        self.output_file = None
        self.assignments_file = None

        self.evolution_mode = None

        self.mcbs_alpha = None
        self.mcbs_acalpha = None
        self.mcbs_nsets = None
Example #20
class KineticsSubcommands(WESTSubcommand):
    '''Base class for common options for both kinetics schemes'''
    
    def __init__(self, parent):
        super(KineticsSubcommands,self).__init__(parent)
        self.progress = ProgressIndicatorComponent()
        self.data_reader = WESTDataReader()
        self.iter_range = IterRangeSelection() 
        self.output_file = None
        self.assignments_file = None
        
        self.do_compression = True

    def add_args(self, parser):
        self.data_reader.add_args(parser)
        self.iter_range.add_args(parser)
        
        iogroup = parser.add_argument_group('input/output options')
        iogroup.add_argument('-a', '--assignments', default='assign.h5',
                             help='''Bin assignments and macrostate definitions are in ASSIGNMENTS
                                (default: %(default)s).''')
        # default_kinetics_file will be picked up as a class attribute from the appropriate
        # subclass
        iogroup.add_argument('-o', '--output', dest='output', default=self.default_kinetics_file,
                             help='''Store results in OUTPUT (default: %(default)s).''')
        iogroup.add_argument('--no-compression', dest='compression', action='store_false',
                             help='''Do not store kinetics results compressed. This can increase disk
                             use about 100-fold, but can dramatically speed up subsequent analysis
                             for "w_kinavg matrix". Default: compress kinetics results.''')
        self.progress.add_args(parser)
        parser.set_defaults(compression=True)
        
    def process_args(self, args):
        self.progress.process_args(args)
        self.assignments_file = h5io.WESTPAH5File(args.assignments, 'r')
        self.data_reader.process_args(args)
        with self.data_reader:
            self.iter_range.process_args(args)
        self.output_file = h5io.WESTPAH5File(args.output, 'w', creating_program=True)
        h5io.stamp_creator_data(self.output_file)
        if not self.iter_range.check_data_iter_range_least(self.assignments_file):
            raise ValueError('assignments do not span the requested iterations')
        self.do_compression = args.compression
Example #21
    def __init__(self):
        super(WCrawl,self).__init__()

        # These are used throughout
        self.progress = ProgressIndicatorComponent()
        self.data_reader = WESTDataReader()
        self.iter_range = IterRangeSelection(self.data_reader)

        self.crawler = None
        self.task_callable = None
Example #22
    def __init__(self):
        super(WSelectTool,self).__init__()

        self.data_reader = WESTDataReader()
        self.iter_range = IterRangeSelection()
        self.progress = ProgressIndicatorComponent()
        self.output_file = None
        self.output_filename = None
        self.predicate = None
        self.invert = False
        self.include_ancestors = False
Example #23
    def __init__(self):
        super(WNTopTool,self).__init__()

        self.data_reader = WESTDataReader()
        self.iter_range = IterRangeSelection()
        self.progress = ProgressIndicatorComponent()
        self.output_file = None
        self.assignments_filename = None
        self.output_filename = None
        self.what = None
        self.timepoint = None
        self.count = None
Example #24
    def __init__(self, parent):
        super(WESTKineticsBase,self).__init__(parent)
        
        self.data_reader = WESTDataReader()
        self.iter_range = IterRangeSelection()
        self.progress = ProgressIndicatorComponent()
        
        self.output_filename = None
        # This is actually applicable to both.
        self.assignment_filename = None
        
        self.output_file = None
        self.assignments_file = None
        
        self.evolution_mode = None
        
        self.mcbs_alpha = None
        self.mcbs_acalpha = None
        self.mcbs_nsets = None

        # Now we're adding in things that come from the old w_kinetics
        self.do_compression = True
Example #25
 def __init__(self):
     super(WAssign,self).__init__()
     
     # Parallel processing by default (this is not actually necessary, but it is
     # informative!)
     self.wm_env.default_work_manager = self.wm_env.default_parallel_work_manager
     
     self.data_reader = WESTDataReader()
     self.dssynth = WESTDSSynthesizer(default_dsname='pcoord')
     self.binning = BinMappingComponent()
     self.progress = ProgressIndicatorComponent()
     self.output_file = None
     self.output_filename = None
     self.states = []
Example #26
    def __init__(self):
        super(WFluxanlTool,self).__init__()
        self.data_reader = WESTDataReader()
        self.iter_range = IterRangeSelection()
        self.output_h5file = None
        self.output_group = None
        self.target_groups = {}

        self.fluxdata = {}
        
        self.alpha = None
        self.autocorrel_alpha = None
        self.n_sets = None
        self.do_evol = False
        self.evol_step = 1
Example #27
 def __init__(self):
     super(WPostAnalysisReweightTool, self).__init__()
     
     self.data_reader = WESTDataReader()
     self.iter_range = IterRangeSelection()
     self.progress = ProgressIndicatorComponent()
     
     self.output_filename = None
     self.kinetics_filename = None
     self.assignment_filename = None
     
     self.output_file = None
     self.assignments_file = None
     self.kinetics_file = None
     
     self.evolution_mode = None
Example #28
    def __init__(self):
        super(WPostanalysisPush, self).__init__()
        
        self.data_reader = WESTDataReader()
        self.progress = ProgressIndicatorComponent()
        
        self.output_filename = None
        self.rw_filename = None
        self.assignment_filename = None
        
        self.output_file = None
        self.rw_file = None
        self.assignments_file = None

        self.weights_attributes_initialized = False
        self.weights_already_calculated = False
        self.time_average_scaling_vector_calculated = False
Example #29
 def __init__(self):
     super(StateProbTool,self).__init__()
     
     self.data_reader = WESTDataReader()
     self.iter_range = IterRangeSelection()
     self.progress = ProgressIndicatorComponent()
     
     self.output_filename = None
     self.kinetics_filename = None
     
     self.output_file = None
     self.assignments_file = None
     
     self.evolution_mode = None
     
     self.mcbs_alpha = None
     self.mcbs_acalpha = None
     self.mcbs_nsets = None
Example #30
    def __init__(self, parent):
        super(WESTKineticsBase,self).__init__(parent)
        
        self.data_reader = WESTDataReader()
        self.iter_range = IterRangeSelection()
        self.progress = ProgressIndicatorComponent()
        
        self.output_filename = None
        # This is actually applicable to both.
        self.assignment_filename = None
        
        self.output_file = None
        self.assignments_file = None
        
        self.evolution_mode = None
        
        self.mcbs_alpha = None
        self.mcbs_acalpha = None
        self.mcbs_nsets = None

        # Now we're adding in things that come from the old w_kinetics
        self.do_compression = True
Example #31
class WPDist(WESTParallelTool):
    prog='w_pdist'
    description = '''\
Calculate time-resolved, multi-dimensional probability distributions of WE 
datasets.


-----------------------------------------------------------------------------
Source data
-----------------------------------------------------------------------------

Source data is provided either by a user-specified function
(--construct-dataset) or a list of "data set specifications" (--dsspecs).
If neither is provided, the progress coordinate dataset ''pcoord'' is used.

To use a custom function to extract or calculate data whose probability
distribution will be calculated, specify the function in standard Python
MODULE.FUNCTION syntax as the argument to --construct-dataset. This function
will be called as function(n_iter,iter_group), where n_iter is the iteration
whose data are being considered and iter_group is the corresponding group
in the main WEST HDF5 file (west.h5). The function must return data which can
be indexed as [segment][timepoint][dimension].

To use a list of data set specifications, specify --dsspecs and then list the
desired datasets one-by-one (space-separated in most shells). These data set
specifications are formatted as NAME[,file=FILENAME,slice=SLICE], which will
use the dataset called NAME in the HDF5 file FILENAME (defaulting to the main
WEST HDF5 file west.h5), and slice it with the Python slice expression SLICE
(as in [0:2] to select the first two elements of the first axis of the
dataset). The ``slice`` option is most useful for selecting one column (or
more) from a multi-column dataset, such as arises when using a progress
coordinate of multiple dimensions.


-----------------------------------------------------------------------------
Histogram binning
-----------------------------------------------------------------------------

By default, histograms are constructed with 100 bins in each dimension. This
can be overridden by specifying -b/--bins, which accepts a number of different
kinds of arguments:

  a single integer N
    N uniformly spaced bins will be used in each dimension.
    
  a sequence of integers N1,N2,... (comma-separated)
    N1 uniformly spaced bins will be used for the first dimension, N2 for the
    second, and so on.
    
  a list of lists [[B11, B12, B13, ...], [B21, B22, B23, ...], ...]
    The bin boundaries B11, B12, B13, ... will be used for the first dimension,
    B21, B22, B23, ... for the second dimension, and so on. These bin
    boundaries need not be uniformly spaced. These expressions will be
    evaluated with Python's ``eval`` construct, with ``numpy`` available for
    use [e.g. to specify bins using numpy.arange()].

The first two forms (integer, list of integers) will trigger a scan of all
data in each dimension in order to determine the minimum and maximum values,
which may be very expensive for large datasets. This can be avoided by
explicitly providing bin boundaries using the list-of-lists form.

Note that these bins are *NOT* at all related to the bins used to drive WE
sampling.


-----------------------------------------------------------------------------
Output format
-----------------------------------------------------------------------------

The output file produced (specified by -o/--output, defaulting to "pdist.h5")
may be fed to plothist to generate plots (or appropriately processed text or
HDF5 files) from this data. In short, the following datasets are created:

  ``histograms``
    Normalized histograms. The first axis corresponds to iteration, and
    remaining axes correspond to dimensions of the input dataset.
    
  ``/binbounds_0``
    Vector of bin boundaries for the first (index 0) dimension. Additional
    datasets similarly named (/binbounds_1, /binbounds_2, ...) are created
    for additional dimensions.
    
  ``/midpoints_0``
    Vector of bin midpoints for the first (index 0) dimension. Additional
    datasets similarly named are created for additional dimensions.
    
  ``n_iter``
    Vector of iteration numbers corresponding to the stored histograms (i.e.
    the first axis of the ``histograms`` dataset).


-----------------------------------------------------------------------------
Subsequent processing
-----------------------------------------------------------------------------

The output generated by this program (-o/--output, default "pdist.h5") may be
plotted by the ``plothist`` program. See ``plothist --help`` for more
information.

    
-----------------------------------------------------------------------------
Parallelization
-----------------------------------------------------------------------------

This tool supports parallelized binning, including reading of input data.
Parallel processing is the default. For simple cases (reading pre-computed
input data, modest numbers of segments), serial processing (--serial) may be
more efficient.


-----------------------------------------------------------------------------
Command-line options
-----------------------------------------------------------------------------
    
'''
    
    def __init__(self):
        super(WPDist,self).__init__()
        
        # Parallel processing by default (this is not actually necessary, but it is
        # informative!)
        self.wm_env.default_work_manager = self.wm_env.default_parallel_work_manager
        
        # These are used throughout
        self.progress = ProgressIndicatorComponent()
        self.data_reader = WESTDataReader()
        self.input_dssynth = WESTDSSynthesizer(default_dsname='pcoord')
        self.iter_range = IterRangeSelection(self.data_reader)
        self.iter_range.include_args['iter_step'] = False
        self.binspec = None
        self.output_filename = None
        self.output_file = None
        
        
        self.dsspec = None
        self.wt_dsspec = None # dsspec for weights
        
        # These are used during histogram generation only
        self.iter_start = None
        self.iter_stop = None
        self.ndim = None
        self.ntimepoints = None
        self.dset_dtype = None
        self.binbounds = None  # bin boundaries for each dimension
        self.midpoints = None  # bin midpoints for each dimension 
        self.data_range = None # data range for each dimension, as the pairs (min,max)
        self.ignore_out_of_range = False
        self.compress_output = False
        
    
    def add_args(self, parser):
        self.data_reader.add_args(parser)
         
        self.iter_range.add_args(parser)
                
        parser.add_argument('-b', '--bins', dest='bins', metavar='BINEXPR', default='100',
                            help='''Use BINEXPR for bins. This may be an integer, which will be used for each
                            dimension of the progress coordinate; a list of integers (formatted as [n1,n2,...])
                            which will use n1 bins for the first dimension, n2 for the second dimension, and so on;
                            or a list of lists of boundaries (formatted as [[a1, a2, ...], [b1, b2, ...], ... ]), which
                            will use [a1, a2, ...] as bin boundaries for the first dimension, [b1, b2, ...] as bin boundaries
                            for the second dimension, and so on. (Default: 100 bins in each dimension.)''')
        
        parser.add_argument('-o', '--output', dest='output', default='pdist.h5',
                            help='''Store results in OUTPUT (default: %(default)s).''')
        parser.add_argument('-C', '--compress', action='store_true', 
                            help='''Compress histograms. May make storage of higher-dimensional histograms
                            more tractable, at the (possible extreme) expense of increased analysis time.
                            (Default: no compression.)''')
        
        parser.add_argument('--loose', dest='ignore_out_of_range', action='store_true',
                            help='''Ignore values that do not fall within bins. (Risky, as this can make buggy bin
                            boundaries appear as reasonable data. Only use if you are
                            sure of your bin boundary specification.)''')
        
        igroup = parser.add_argument_group('input dataset options').add_mutually_exclusive_group(required=False)

        igroup.add_argument('--construct-dataset',
                            help='''Use the given function (as in module.function) to extract source data.
                            This function will be called once per iteration as function(n_iter, iter_group)
                            to construct data for one iteration. Data returned must be indexable as
                            [seg_id][timepoint][dimension]''')
        
        igroup.add_argument('--dsspecs', nargs='+', metavar='DSSPEC',
                            help='''Construct probability distribution from one or more DSSPECs.''')
        
        self.progress.add_args(parser)
        
    def process_args(self, args):
        self.progress.process_args(args)
        self.data_reader.process_args(args)
        self.input_dssynth.h5filename = self.data_reader.we_h5filename
        self.input_dssynth.process_args(args)
        self.dsspec = self.input_dssynth.dsspec
        
        # Carrying an open HDF5 file across a fork() seems to corrupt the entire HDF5 library
        # Open the WEST HDF5 file just long enough to process our iteration range, then close
        # and reopen in go() [which executes after the fork]
        with self.data_reader:
            self.iter_range.process_args(args)
        
        self.wt_dsspec = SingleIterDSSpec(self.data_reader.we_h5filename, 'seg_index', slice=numpy.index_exp['weight'])
        
        self.binspec = args.bins
        self.output_filename = args.output
        self.ignore_out_of_range = bool(args.ignore_out_of_range)
        self.compress_output = args.compress or False
        
    
    def go(self):
        self.data_reader.open('r')
        pi = self.progress.indicator
        pi.operation = 'Initializing'
        with pi:
            self.output_file = h5py.File(self.output_filename, 'w')
            h5io.stamp_creator_data(self.output_file)
            
            self.iter_start = self.iter_range.iter_start
            self.iter_stop = self.iter_range.iter_stop
    
            # Construct bin boundaries
            self.construct_bins(self.parse_binspec(self.binspec))
            for idim, (binbounds, midpoints) in enumerate(izip(self.binbounds, self.midpoints)):
                self.output_file['binbounds_{}'.format(idim)] = binbounds
                self.output_file['midpoints_{}'.format(idim)] = midpoints
    
            # construct histogram
            self.construct_histogram()
    
            # Record iteration range        
            iter_range = self.iter_range.iter_range()
            self.output_file['n_iter'] = iter_range
            self.iter_range.record_data_iter_range(self.output_file['histograms'])
            
            self.output_file.close()

    @staticmethod    
    def parse_binspec(binspec):
        namespace = {'numpy': numpy,
                     'inf': float('inf')}
                     
        try:
            binspec_compiled = eval(binspec,namespace)
        except Exception as e:
            raise ValueError('invalid bin specification: {!r}'.format(e))
        else:
            if log.isEnabledFor(logging.DEBUG):
                log.debug('bin specs: {!r}'.format(binspec_compiled))
        return binspec_compiled
    
        
    def construct_bins(self, bins):
        '''
        Construct bins according to ``bins``, which may be:
        
          1) A scalar integer (for that number of bins in each dimension)
          2) A sequence of integers (specifying number of bins for each dimension)
          3) A sequence of sequences of bin boundaries (specifying boundaries for each dimension)
          
        Sets ``self.binbounds`` to a list of arrays of bin boundaries appropriate for passing to 
        fasthist.histnd, along with ``self.midpoints`` to the midpoints of the bins.
        '''
        
        if not isiterable(bins):
            self._construct_bins_from_scalar(bins)
        elif not isiterable(bins[0]):
            self._construct_bins_from_int_seq(bins)
        else:
            self._construct_bins_from_bound_seqs(bins)
            
        if log.isEnabledFor(logging.DEBUG):
            log.debug('binbounds: {!r}'.format(self.binbounds))
            
    def scan_data_shape(self):
        if self.ndim is None:
            dset = self.dsspec.get_iter_data(self.iter_start)
            self.ntimepoints = dset.shape[1]
            self.ndim = dset.shape[2]
            self.dset_dtype = dset.dtype
        
            
    def scan_data_range(self):
        '''Scan input data for range in each dimension. The number of dimensions is determined
        from the shape of the progress coordinate as of self.iter_start.'''
        
        self.progress.indicator.new_operation('Scanning for data range', self.iter_stop-self.iter_start)
        self.scan_data_shape()
        
                
        dset_dtype = self.dset_dtype
        ndim = self.ndim
        dsspec = self.dsspec
        
        try:
            minval = numpy.finfo(dset_dtype).min
            maxval = numpy.finfo(dset_dtype).max
        except ValueError:
            minval = numpy.iinfo(dset_dtype).min
            maxval = numpy.iinfo(dset_dtype).max
        
        data_range = self.data_range = [(maxval,minval) for _i in xrange(self.ndim)]

        #futures = []
        #for n_iter in xrange(self.iter_start, self.iter_stop):
            #_remote_min_max(ndim, dset_dtype, n_iter, dsspec)
        #    futures.append(self.work_manager.submit(_remote_min_max, args=(ndim, dset_dtype, n_iter, dsspec)))
        
        #for future in self.work_manager.as_completed(futures):
        for future in self.work_manager.submit_as_completed(((_remote_min_max, (ndim, dset_dtype, n_iter, dsspec), {})
                                                             for n_iter in xrange(self.iter_start, self.iter_stop)),
                                                            self.max_queue_len):
            bounds = future.get_result(discard=True)
            for idim in xrange(ndim):
                current_min, current_max = data_range[idim]
                current_min = min(current_min, bounds[idim][0])
                current_max = max(current_max, bounds[idim][1])
                data_range[idim] = (current_min, current_max)
            self.progress.indicator.progress += 1

    def _construct_bins_from_scalar(self, bins):
        if self.data_range is None:
            self.scan_data_range()        

        self.binbounds = []
        self.midpoints = []        
        for idim in xrange(self.ndim):
            lb, ub = self.data_range[idim]
            # Advance just beyond the upper bound of the range, so that we catch 
            # the maximum in the histogram
            ub *= 1.01
            
            boundset = numpy.linspace(lb,ub,bins+1)
            midpoints = (boundset[:-1] + boundset[1:]) / 2.0
            self.binbounds.append(boundset)
            self.midpoints.append(midpoints)
            
    def _construct_bins_from_int_seq(self, bins):
        if self.data_range is None:
            self.scan_data_range()        

        self.binbounds = []
        self.midpoints = []        
        for idim in xrange(self.ndim):
            lb, ub = self.data_range[idim]
            # Advance just beyond the upper bound of the range, so that we catch 
            # the maximum in the histogram
            ub *= 1.01
            
            boundset = numpy.linspace(lb,ub,bins[idim]+1)
            midpoints = (boundset[:-1] + boundset[1:]) / 2.0
            self.binbounds.append(boundset)
            self.midpoints.append(midpoints)
               
    def _construct_bins_from_bound_seqs(self, bins):
        self.binbounds = []
        self.midpoints = []
        for boundset in bins:
            boundset = numpy.asarray(boundset)
            if (numpy.diff(boundset) <= 0).any():
                raise ValueError('boundary set {!r} is not strictly monotonically increasing'.format(boundset))
            self.binbounds.append(boundset)
            self.midpoints.append((boundset[:-1]+boundset[1:])/2.0)
            
    def construct_histogram(self):
        '''Construct a histogram using bins previously constructed with ``construct_bins()``.
        The time series of histogram values is stored in ``histograms``.
        Each histogram in the time series is normalized.'''
        
        self.scan_data_shape()
        
        iter_count = self.iter_stop - self.iter_start
        histograms_ds = self.output_file.create_dataset('histograms', dtype=numpy.float64,
                                                        shape=((iter_count,) + tuple(len(bounds)-1 for bounds in self.binbounds)),
                                                        compression=9 if self.compress_output else None)
        binbounds = [numpy.require(boundset, self.dset_dtype, 'C') for boundset in self.binbounds]
        
        self.progress.indicator.new_operation('Constructing histograms',self.iter_stop-self.iter_start)
        task_gen = ((_remote_bin_iter, (iiter, n_iter, self.dsspec, self.wt_dsspec, 1 if iiter > 0 else 0, binbounds,
                                        self.ignore_out_of_range), {}) 
                    for (iiter,n_iter) in enumerate(xrange(self.iter_start, self.iter_stop)))
        #futures = set()
        #for iiter, n_iter in enumerate(xrange(self.iter_start, self.iter_stop)):
        #    initpoint = 1 if iiter > 0 else 0
        #    futures.add(self.work_manager.submit(_remote_bin_iter,
        #                                            args=(iiter, n_iter, self.dsspec, self.wt_dsspec, initpoint, binbounds)))
        
        #for future in self.work_manager.as_completed(futures):
            #future = self.work_manager.wait_any(futures)
        #for future in self.work_manager.submit_as_completed(task_gen, self.queue_size):
        log.debug('max queue length: {!r}'.format(self.max_queue_len))
        for future in self.work_manager.submit_as_completed(task_gen, self.max_queue_len):
            iiter, n_iter, iter_hist = future.get_result(discard=True)
            self.progress.indicator.progress += 1

            # store histogram
            histograms_ds[iiter] = iter_hist
            del iter_hist, future
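To make the ``--construct-dataset`` and ``-b/--bins`` descriptions in this example concrete, below is a hedged sketch of a user module that w_pdist could be pointed at. The module and function names are made up, and the sketch assumes the iteration group exposes a ``pcoord`` dataset shaped [segment][timepoint][dimension], as the docstring above describes for the main WEST HDF5 file:

# my_dsets.py -- hypothetical module for `w_pdist --construct-dataset my_dsets.first_dim`

def first_dim(n_iter, iter_group):
    '''Called once per iteration as function(n_iter, iter_group); must return data
    indexable as [segment][timepoint][dimension]. Here we keep only the first
    progress-coordinate dimension, assuming a standard ``pcoord`` dataset.'''
    pcoord = iter_group['pcoord'][...]   # shape: (segments, timepoints, ndim)
    return pcoord[:, :, 0:1]             # preserve a trailing dimension axis

# A list-of-lists bin specification avoids the min/max scan triggered by plain
# integer bin counts (expressions are eval'd with numpy available), e.g.:
#   w_pdist --construct-dataset my_dsets.first_dim -b '[numpy.arange(0.0, 10.1, 0.1)]'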
Example #32
class WPostAnalysisReweightTool(WESTTool):
    prog ='w_postanalysis_reweight'
    description = '''\
Calculate average rates from weighted ensemble data using the postanalysis
reweighting scheme. Bin assignments (usually "assignments.h5") and pre-calculated 
iteration flux matrices (usually "flux_matrices.h5") must have been 
previously generated using w_postanalysis_matrix.py (see "w_assign --help" and 
"w_kinetics --help" for information on generating these files).


-----------------------------------------------------------------------------
Output format
-----------------------------------------------------------------------------

The output file (-o/--output, usually "kinrw.h5") contains the following
datasets:

  /state_prob_evolution [window,state]
    The reweighted state populations based on windows

  /color_prob_evolution [window,state]
    The reweighted populations last assigned to each state based on windows

  /bin_prob_evolution [window, bin]
    The reweighted populations of each bin based on windows. Bins contain
    one color each, so to recover the original un-colored spatial bins,
    one must sum over all states.

  /conditional_flux_evolution [window,state,state]
    (Structured -- see below). State-to-state fluxes based on windows of
    varying width
    
The structure of the final dataset is as follows:

  iter_start
    (Integer) Iteration at which the averaging window begins (inclusive).
    
  iter_stop
    (Integer) Iteration at which the averaging window ends (exclusive).
    
  expected
    (Floating-point) Expected (mean) value of the rate as evaluated within
    this window, in units of inverse tau.


-----------------------------------------------------------------------------
Command-line options
-----------------------------------------------------------------------------
'''

    def __init__(self):
        super(WPostAnalysisReweightTool, self).__init__()
        
        self.data_reader = WESTDataReader()
        self.iter_range = IterRangeSelection()
        self.progress = ProgressIndicatorComponent()
        
        self.output_filename = None
        self.kinetics_filename = None
        self.assignment_filename = None
        
        self.output_file = None
        self.assignments_file = None
        self.kinetics_file = None
        
        self.evolution_mode = None
        
    def add_args(self, parser):
        self.progress.add_args(parser)
        self.data_reader.add_args(parser)
        self.iter_range.include_args['iter_step'] = True
        self.iter_range.add_args(parser)

        iogroup = parser.add_argument_group('input/output options')
        iogroup.add_argument('-a', '--assignments', default='assign.h5',
                            help='''Bin assignments and macrostate definitions are in ASSIGNMENTS
                            (default: %(default)s).''')

        iogroup.add_argument('-k', '--kinetics', default='flux_matrices.h5',
                            help='''Per-iteration flux matrices calculated by w_postanalysis_matrix 
                            (default: %(default)s).''')
        iogroup.add_argument('-o', '--output', dest='output', default='kinrw.h5',
                            help='''Store results in OUTPUT (default: %(default)s).''')

        cogroup = parser.add_argument_group('calculation options')
        cogroup.add_argument('-e', '--evolution-mode', choices=['cumulative', 'blocked'], default='cumulative',
                             help='''How to calculate time evolution of rate estimates.
                             ``cumulative`` evaluates rates over windows starting with --start-iter and getting progressively
                             wider to --stop-iter by steps of --step-iter.
                             ``blocked`` evaluates rates over windows of width --step-iter, the first of which begins at
                             --start-iter.''')
        cogroup.add_argument('--window-frac', type=float, default=1.0,
                             help='''Fraction of iterations to use in each window when running in ``cumulative`` mode.
                             The (1 - frac) fraction of iterations will be discarded from the start of each window.''')

        cogroup.add_argument('--obs-threshold', type=int, default=1,
                             help='''The minimum number of observed transitions between two states i and j necessary to include
                             fluxes in the reweighting estimate''')
        
    def open_files(self):
        self.output_file = h5io.WESTPAH5File(self.output_filename, 'w', creating_program=True)
        h5io.stamp_creator_data(self.output_file)
        self.assignments_file = h5io.WESTPAH5File(self.assignments_filename, 'r')#, driver='core', backing_store=False)
        self.kinetics_file = h5io.WESTPAH5File(self.kinetics_filename, 'r')#, driver='core', backing_store=False)
        if not self.iter_range.check_data_iter_range_least(self.assignments_file):
            raise ValueError('assignments data do not span the requested iterations')

        if not self.iter_range.check_data_iter_range_least(self.kinetics_file):
            raise ValueError('kinetics data do not span the requested iterations')

    def process_args(self, args):
        self.progress.process_args(args)
        self.data_reader.process_args(args)
        with self.data_reader:
            self.iter_range.process_args(args, default_iter_step=None)
        if self.iter_range.iter_step is None:
            #use about 10 blocks by default
            self.iter_range.iter_step = max(1, (self.iter_range.iter_stop - self.iter_range.iter_start) // 10)
        
        self.output_filename = args.output
        self.assignments_filename = args.assignments
        self.kinetics_filename = args.kinetics
                
        self.evolution_mode = args.evolution_mode
        self.evol_window_frac = args.window_frac
        if self.evol_window_frac <= 0 or self.evol_window_frac > 1:
            raise ValueError('Parameter error -- fractional window defined by --window-frac must be in (0,1]')
        self.obs_threshold = args.obs_threshold



    def go(self):
        pi = self.progress.indicator
        with pi:
            pi.new_operation('Initializing')
            self.open_files()
            nstates = self.assignments_file.attrs['nstates']
            nbins = self.assignments_file.attrs['nbins']
            state_labels = self.assignments_file['state_labels'][...]
            state_map = self.assignments_file['state_map'][...]
            nfbins = self.kinetics_file.attrs['nrows']
            npts = self.kinetics_file.attrs['npts']

            assert nstates == len(state_labels)
            assert nfbins == nbins * nstates

            start_iter, stop_iter, step_iter = self.iter_range.iter_start, self.iter_range.iter_stop, self.iter_range.iter_step

            start_pts = range(start_iter, stop_iter, step_iter)
            flux_evol = np.zeros((len(start_pts), nstates, nstates), dtype=ci_dtype)
            color_prob_evol = np.zeros((len(start_pts), nstates))
            state_prob_evol = np.zeros((len(start_pts), nstates))
            bin_prob_evol = np.zeros((len(start_pts), nfbins))
            pi.new_operation('Calculating flux evolution', len(start_pts))

            if self.evolution_mode == 'cumulative' and self.evol_window_frac == 1.0:
                print('Using fast streaming accumulation')

                total_fluxes = np.zeros((nfbins, nfbins), weight_dtype)
                total_obs = np.zeros((nfbins, nfbins), np.int64)

                for iblock, start in enumerate(start_pts):
                    pi.progress += 1
                    stop = min(start + step_iter, stop_iter)

                    params = dict(start=start, stop=stop, nstates=nstates, nbins=nbins,
                                  state_labels=state_labels, state_map=state_map, nfbins=nfbins,
                                  total_fluxes=total_fluxes, total_obs=total_obs,
                                  h5file=self.kinetics_file, obs_threshold=self.obs_threshold)

                    rw_state_flux, rw_color_probs, rw_state_probs, rw_bin_probs, rw_bin_flux = reweight(**params)
                    for k in xrange(nstates):
                        for j in xrange(nstates):
                            # Normalize such that we report the flux per tau (tau being the weighted ensemble iteration)
                            # npts always includes a 0th time point
                            flux_evol[iblock]['expected'][k,j] = rw_state_flux[k,j] * (npts - 1)
                            flux_evol[iblock]['iter_start'][k,j] = start
                            flux_evol[iblock]['iter_stop'][k,j] = stop

                    color_prob_evol[iblock] = rw_color_probs
                    state_prob_evol[iblock] = rw_state_probs[:-1]
                    bin_prob_evol[iblock] = rw_bin_probs


            else:
                for iblock, start in enumerate(start_pts):
                    pi.progress += 1
                    
                    stop = min(start + step_iter, stop_iter)
                    if self.evolution_mode == 'cumulative':
                        windowsize = max(1, int(self.evol_window_frac * (stop - start_iter)))
                        block_start = max(start_iter, stop - windowsize)
                    else:   # self.evolution_mode == 'blocked'
                        block_start = start

                    params = dict(start=block_start, stop=stop, nstates=nstates, nbins=nbins,
                                  state_labels=state_labels, state_map=state_map, nfbins=nfbins,
                                  total_fluxes=None, total_obs=None,
                                  h5file=self.kinetics_file)

                    rw_state_flux, rw_color_probs, rw_state_probs, rw_bin_probs, rw_bin_flux = reweight(**params)
                    for k in xrange(nstates):
                        for j in xrange(nstates):
                            # Normalize such that we report the flux per tau (tau being the weighted ensemble iteration)
                            # npts always includes a 0th time point
                            flux_evol[iblock]['expected'][k,j] = rw_state_flux[k,j] * (npts - 1)
                            flux_evol[iblock]['iter_start'][k,j] = start
                            flux_evol[iblock]['iter_stop'][k,j] = stop

                    color_prob_evol[iblock] = rw_color_probs
                    state_prob_evol[iblock] = rw_state_probs[:-1]
                    bin_prob_evol[iblock] = rw_bin_probs


            ds_flux_evol = self.output_file.create_dataset('conditional_flux_evolution', data=flux_evol, shuffle=True, compression=9)
            ds_state_prob_evol = self.output_file.create_dataset('state_prob_evolution', data=state_prob_evol, compression=9)
            ds_color_prob_evol = self.output_file.create_dataset('color_prob_evolution', data=color_prob_evol, compression=9)
            ds_bin_prob_evol = self.output_file.create_dataset('bin_prob_evolution', data=bin_prob_evol, compression=9)
            ds_state_labels = self.output_file.create_dataset('state_labels', data=state_labels)
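As a hedged companion to the output-format notes in this example, the reweighted results written above can be inspected with plain h5py. The file and dataset names follow the defaults used here (``kinrw.h5``, ``conditional_flux_evolution``, ``state_prob_evolution``); the state indices picked below are purely illustrative and assume at least two states:

import h5py

with h5py.File('kinrw.h5', 'r') as f:
    flux = f['conditional_flux_evolution'][...]    # structured: iter_start, iter_stop, expected
    pops = f['state_prob_evolution'][...]          # shape: (window, state)
    labels = [s.decode() if isinstance(s, bytes) else s for s in f['state_labels'][...]]

    last = flux[-1]                                # last averaging window
    print(labels)
    print(last['iter_start'][0, 1], last['iter_stop'][0, 1], last['expected'][0, 1])
    print(pops[-1])                                # final reweighted state populations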
Example #33
class WCrawl(WESTParallelTool):
    prog='w_crawl'
    description = '''\
Crawl a weighted ensemble dataset, executing a function for each iteration.
This can be used for postprocessing of trajectories, cleanup of datasets,
or anything else that can be expressed as "do X for iteration N, then do
something with the result". Tasks are parallelized by iteration, and 
no guarantees are made about evaluation order.


-----------------------------------------------------------------------------
Command-line options
-----------------------------------------------------------------------------
    
'''

    def __init__(self):
        super(WCrawl,self).__init__()

        # These are used throughout
        self.progress = ProgressIndicatorComponent()
        self.data_reader = WESTDataReader()
        self.iter_range = IterRangeSelection(self.data_reader)

        self.crawler = None
        self.task_callable = None

    def add_args(self, parser):
        self.data_reader.add_args(parser)
        self.iter_range.add_args(parser)

        tgroup = parser.add_argument_group('task options')
        tgroup.add_argument('-c', '--crawler-instance',
                            help='''Use CRAWLER_INSTANCE (specified as module.instance) as an instance of
                            WESTPACrawler to coordinate the calculation. Required only if initialization,
                            finalization, or task result processing is required.''')
        tgroup.add_argument('task_callable',
                            help='''Run TASK_CALLABLE (specified as module.function) on each iteration.
                            Required.''')
        self.progress.add_args(parser)

    def process_args(self, args):
        self.progress.process_args(args)
        self.data_reader.process_args(args)
        with self.data_reader:
            self.iter_range.process_args(args)

        self.task_callable = get_object(args.task_callable, path=['.'])
        if args.crawler_instance is not None:
            self.crawler = get_object(args.crawler_instance, path=['.'])
        else:
            self.crawler = WESTPACrawler()

    def go(self):
        iter_start = self.iter_range.iter_start
        iter_stop = self.iter_range.iter_stop
        iter_count = iter_stop - iter_start
        self.data_reader.open('r')
        pi = self.progress.indicator
        with pi:
            pi.operation = 'Initializing'
            self.crawler.initialize(iter_start, iter_stop)

            try:
                pi.new_operation('Dispatching tasks & processing results', iter_count)
                task_gen = ((_remote_task, (n_iter, self.task_callable), {}) for n_iter in range(iter_start,iter_stop))
                for future in self.work_manager.submit_as_completed(task_gen, self.max_queue_len):
                    n_iter, result = future.get_result(discard=True)
                    if self.crawler is not None:
                        self.crawler.process_iter_result(n_iter,result)
                    pi.progress += 1
            finally:
                pi.new_operation('Finalizing')
                self.crawler.finalize()
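For context, a user-side module passed to w_crawl (as ``module.function`` and ``module.instance``) might look like the hedged sketch below. The crawler's method names mirror the calls made in ``go()`` above; the ``(n_iter, iter_group)`` signature of the task function is an assumption about how ``_remote_task`` invokes it, every name is illustrative, and a real crawler would subclass WESTPACrawler rather than a plain class:

# my_crawl.py -- hypothetical module for `w_crawl my_crawl.count_segments -c my_crawl.crawler`

def count_segments(n_iter, iter_group):
    '''Per-iteration task; assumed to receive the iteration number and its HDF5 group.'''
    return len(iter_group['seg_index'])   # assumes the standard per-iteration seg_index table

class SegmentCounter:
    '''Collects results; initialize/process_iter_result/finalize mirror the calls in go().'''

    def __init__(self):
        self.counts = {}

    def initialize(self, iter_start, iter_stop):
        self.counts = {}

    def process_iter_result(self, n_iter, result):
        self.counts[n_iter] = result

    def finalize(self):
        print('segments per iteration:', self.counts)

crawler = SegmentCounter()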
Example #34
class WSelectTool(WESTParallelTool):
    prog = 'w_select'
    description = '''\
Select dynamics segments matching various criteria. This requires a
user-provided predicate function. By default, only matching segments are
stored. If the -a/--include-ancestors option is given, then matching segments
and their ancestors will be stored.


-----------------------------------------------------------------------------
Predicate function
-----------------------------------------------------------------------------

Segments are selected based on a predicate function, which must be callable
as ``predicate(n_iter, iter_group)`` and return a collection of segment IDs
matching the predicate in that iteration.

The predicate may be inverted by specifying the -v/--invert command-line
argument.


-----------------------------------------------------------------------------
Output format
-----------------------------------------------------------------------------

The output file (-o/--output, by default "select.h5") contains the following
datasets:

  ``/n_iter`` [iteration]
    *(Integer)* Iteration numbers for each entry in other datasets.

  ``/n_segs`` [iteration]
    *(Integer)* Number of segment IDs matching the predicate (or inverted
    predicate, if -v/--invert is specified) in the given iteration.

  ``/seg_ids`` [iteration][segment]
    *(Integer)* Matching segments in each iteration. For an iteration
    ``n_iter``, only the first ``n_segs`` entries for that iteration are valid. For example,
    the full list of matching seg_ids in the first stored iteration is
    ``seg_ids[0][:n_segs[0]]``.

  ``/weights`` [iteration][segment]
    *(Floating-point)* Weights for each matching segment in ``/seg_ids``.


-----------------------------------------------------------------------------
Command-line arguments
-----------------------------------------------------------------------------
'''

    def __init__(self):
        super(WSelectTool, self).__init__()

        self.data_reader = WESTDataReader()
        self.iter_range = IterRangeSelection()
        self.progress = ProgressIndicatorComponent()
        self.output_file = None
        self.output_filename = None
        self.predicate = None
        self.invert = False
        self.include_ancestors = False

    def add_args(self, parser):
        self.data_reader.add_args(parser)
        self.iter_range.add_args(parser)

        sgroup = parser.add_argument_group('selection options')
        sgroup.add_argument(
            '-p',
            '--predicate-function',
            metavar='MODULE.FUNCTION',
            help=
            '''Use the given predicate function to match segments. This function
                             should take an iteration number and the HDF5 group corresponding to that
                             iteration and return a sequence of seg_ids matching the predicate, as in
                             ``match_predicate(n_iter, iter_group)``.''')
        sgroup.add_argument('-v',
                            '--invert',
                            dest='invert',
                            action='store_true',
                            help='''Invert the match predicate.''')
        sgroup.add_argument(
            '-a',
            '--include-ancestors',
            action='store_true',
            help='''Include ancestors of matched segments in output.''')

        ogroup = parser.add_argument_group('output options')
        ogroup.add_argument(
            '-o',
            '--output',
            default='select.h5',
            help='''Write output to OUTPUT (default: %(default)s).''')
        self.progress.add_args(parser)

    def process_args(self, args):
        self.progress.process_args(args)
        self.data_reader.process_args(args)
        with self.data_reader:
            self.iter_range.process_args(args)

        predicate = get_object(args.predicate_function, path=['.'])
        if not callable(predicate):
            raise TypeError(
                'predicate object {!r} is not callable'.format(predicate))
        self.predicate = predicate
        self.invert = bool(args.invert)
        self.include_ancestors = bool(args.include_ancestors)
        self.output_filename = args.output

    def go(self):
        self.data_reader.open('r')
        output_file = h5io.WESTPAH5File(self.output_filename, mode='w')
        pi = self.progress.indicator

        iter_start, iter_stop = self.iter_range.iter_start, self.iter_range.iter_stop
        iter_count = iter_stop - iter_start

        output_file.create_dataset('n_iter',
                                   dtype=n_iter_dtype,
                                   data=list(range(iter_start, iter_stop)))
        current_seg_count = 0
        seg_count_ds = output_file.create_dataset('n_segs',
                                                  dtype=numpy.uint,
                                                  shape=(iter_count, ))
        matching_segs_ds = output_file.create_dataset(
            'seg_ids',
            shape=(iter_count, 0),
            maxshape=(iter_count, None),
            dtype=seg_id_dtype,
            chunks=h5io.calc_chunksize((iter_count, 1000000), seg_id_dtype),
            shuffle=True,
            compression=9)
        weights_ds = output_file.create_dataset('weights',
                                                shape=(iter_count, 0),
                                                maxshape=(iter_count, None),
                                                dtype=weight_dtype,
                                                chunks=h5io.calc_chunksize(
                                                    (iter_count, 1000000),
                                                    weight_dtype),
                                                shuffle=True,
                                                compression=9)

        with pi:
            pi.new_operation('Finding matching segments', extent=iter_count)
            #             futures = set()
            #             for n_iter in xrange(iter_start,iter_stop):
            #                 futures.add(self.work_manager.submit(_find_matching_segments,
            #                                                      args=(self.data_reader.we_h5filename,n_iter,self.predicate,self.invert)))

            #             for future in self.work_manager.as_completed(futures):
            for future in self.work_manager.submit_as_completed(
                ((_find_matching_segments,
                  (self.data_reader.we_h5filename, n_iter, self.predicate,
                   self.invert), {})
                 for n_iter in range(iter_start, iter_stop)),
                    self.max_queue_len):
                n_iter, matching_ids = future.get_result()
                n_matches = len(matching_ids)

                if n_matches:
                    if n_matches > current_seg_count:
                        matching_segs_ds.resize((iter_count, n_matches))
                        weights_ds.resize((iter_count, n_matches))
                        current_seg_count = n_matches

                    seg_count_ds[n_iter - iter_start] = n_matches
                    matching_segs_ds[n_iter -
                                     iter_start, :n_matches] = matching_ids
                    weights_ds[n_iter - iter_start, :
                               n_matches] = self.data_reader.get_iter_group(
                                   n_iter)['seg_index']['weight'][sorted(
                                       matching_ids)]
                del matching_ids
                pi.progress += 1

            if self.include_ancestors:
                pi.new_operation('Tracing ancestors of matching segments',
                                 extent=iter_count)
                from_previous = set()
                current_seg_count = matching_segs_ds.shape[1]
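                # Walk iterations in reverse, carrying the parent IDs of segments matched in
                # later iterations so that each matched segment's complete ancestry is stored.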
                for n_iter in range(iter_stop - 1, iter_start - 1, -1):
                    iiter = n_iter - iter_start
                    n_matches = seg_count_ds[iiter]
                    matching_ids = set(from_previous)
                    if n_matches:
                        matching_ids.update(
                            matching_segs_ds[iiter, :seg_count_ds[iiter]])
                    from_previous.clear()

                    n_matches = len(matching_ids)
                    if n_matches > current_seg_count:
                        matching_segs_ds.resize((iter_count, n_matches))
                        weights_ds.resize((iter_count, n_matches))
                        current_seg_count = n_matches

                    if n_matches > 0:
                        seg_count_ds[iiter] = n_matches
                        matching_ids = sorted(matching_ids)
                        matching_segs_ds[iiter, :n_matches] = matching_ids
                        weights_ds[
                            iiter, :
                            n_matches] = self.data_reader.get_iter_group(
                                n_iter)['seg_index']['weight'][sorted(
                                    matching_ids)]
                        parent_ids = self.data_reader.get_iter_group(n_iter)[
                            'seg_index']['parent_id'][sorted(matching_ids)]
                        from_previous.update(
                            parent_id for parent_id in parent_ids
                            if parent_id >= 0)  # filter initial states
                        del parent_ids
                    del matching_ids
                    pi.progress += 1
Example #35
class WFluxanlTool(WESTTool):
    prog='w_fluxanl'
    description = '''\
Extract fluxes into pre-defined target states from WEST data,
average, and construct confidence intervals. Monte Carlo bootstrapping
is used to account for the correlated and possibly non-Gaussian statistical
error in flux measurements.

All non-graphical output (including that to the terminal and HDF5) assumes that
the propagation/resampling period ``tau`` is equal to unity; to obtain results
in familiar units, divide all fluxes and multiply all correlation lengths by
the true value of ``tau``.
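
For example, a minimal rescaling sketch (the value of ``tau`` below is a
placeholder; substitute your simulation's actual propagation period)::

    import h5py

    tau = 10.0  # e.g. ps per WE iteration; placeholder value
    with h5py.File('fluxanl.h5', 'r') as f:
        index = f['target_flux/index'][...]
        mean_flux_per_ps = index['mean_flux'] / tau
        correl_len_ps = index['mean_flux_correl_len'] * tau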
'''
    
    output_format_version = 2

    def __init__(self):
        super(WFluxanlTool,self).__init__()
        self.data_reader = WESTDataReader()
        self.iter_range = IterRangeSelection()
        self.output_h5file = None
        self.output_group = None
        self.target_groups = {}

        self.fluxdata = {}
        
        self.alpha = None
        self.autocorrel_alpha = None
        self.n_sets = None
        self.do_evol = False
        self.evol_step = 1
        
    def add_args(self, parser):
        self.data_reader.add_args(parser)
        self.iter_range.add_args(parser)
        ogroup = parser.add_argument_group('output options')
        ogroup.add_argument('-o', '--output', default='fluxanl.h5',
                            help='Store intermediate data and analysis results to OUTPUT (default: %(default)s).')
        cgroup = parser.add_argument_group('calculation options')
        cgroup.add_argument('--disable-bootstrap', '-db', dest='bootstrap', action='store_const', const=False,
                             help='''Disable the use of Monte Carlo block bootstrapping (enabled by default).''')
        cgroup.add_argument('--disable-correl', '-dc', dest='correl', action='store_const', const=False,
                             help='''Disable the correlation analysis.''')
        cgroup.add_argument('-a', '--alpha', type=float, default=0.05, 
                             help='''Calculate a (1-ALPHA) confidence interval on the average flux
                             (default: %(default)s)''')
        cgroup.add_argument('--autocorrel-alpha', type=float, dest='acalpha', metavar='ACALPHA',
                             help='''Evaluate autocorrelation of flux to (1-ACALPHA) significance.
                             Note that too small an ACALPHA will result in failure to detect autocorrelation
                             in a noisy flux signal. (Default: same as ALPHA.)''')
        cgroup.add_argument('-N', '--nsets', type=int,
                             help='''Use NSETS samples for bootstrapping (default: chosen based on ALPHA)''')
        cgroup.add_argument('--evol', action='store_true', dest='do_evol',
                            help='''Calculate time evolution of flux confidence intervals (expensive).''')
        cgroup.add_argument('--evol-step', type=int, default=1, metavar='ESTEP',
                            help='''Calculate time evolution of flux confidence intervals every ESTEP
                            iterations (default: %(default)s)''')
        
        
    def process_args(self, args):
        self.data_reader.process_args(args)
        self.data_reader.open()
        self.iter_range.data_manager = self.data_reader
        self.iter_range.process_args(args)
        
        self.output_h5file = h5py.File(args.output, 'w')
        
        self.alpha = args.alpha
        # Disable the bootstrap or the correlation analysis.
        self.mcbs_enable = args.bootstrap if args.bootstrap is not None else True
        self.do_correl = args.correl if args.correl is not None else True
        self.autocorrel_alpha = args.acalpha or self.alpha
        self.n_sets = args.nsets or mclib.get_bssize(self.alpha)
        
        self.do_evol = args.do_evol
        self.evol_step = args.evol_step or 1
                
    def calc_store_flux_data(self):         
        westpa.rc.pstatus('Calculating mean flux and confidence intervals for iterations [{},{})'
                        .format(self.iter_range.iter_start, self.iter_range.iter_stop))
        
        fluxdata = extract_fluxes(self.iter_range.iter_start, self.iter_range.iter_stop, self.data_reader)
        
        # Create a group to store data in
        output_group = h5io.create_hdf5_group(self.output_h5file, 'target_flux', replace=False, creating_program=self.prog)        
        self.output_group = output_group
        output_group.attrs['version_code'] = self.output_format_version
        self.iter_range.record_data_iter_range(output_group)
        
        n_targets = len(fluxdata)
        index = numpy.empty((len(fluxdata),), dtype=target_index_dtype)
        avg_fluxdata = numpy.empty((n_targets,), dtype=ci_dtype)
        

        for itarget, (target_label, target_fluxdata) in enumerate(fluxdata.items()):
            # Create group and index entry
            index[itarget]['target_label'] = str(target_label)
            target_group = output_group.create_group('target_{}'.format(itarget))

            self.target_groups[target_label] = target_group
            
            # Store per-iteration values
            target_group['n_iter'] = target_fluxdata['n_iter']
            target_group['count'] = target_fluxdata['count']
            target_group['flux'] = target_fluxdata['flux']
            h5io.label_axes(target_group['flux'], ['n_iter'], units=['tau^-1'])
            
            
            # Calculate flux autocorrelation
            fluxes = target_fluxdata['flux']
            mean_flux = fluxes.mean()
            fmm = fluxes - mean_flux
            acorr = fftconvolve(fmm,fmm[::-1])
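            # fftconvolve of the mean-free flux with its reverse yields the full two-sided
            # autocorrelation; keep the non-negative lags and normalize so the zero-lag value is 1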
            acorr = acorr[len(acorr)//2:]
            acorr /= acorr[0]
            acorr_ds = target_group.create_dataset('flux_autocorrel', data=acorr)
            h5io.label_axes(acorr_ds, ['lag'], ['tau'])
            
            # Calculate overall averages and CIs
            #avg, lb_ci, ub_ci, correl_len = mclib.mcbs_ci_correl(fluxes, numpy.mean, self.alpha, self.n_sets,
            #                                                     autocorrel_alpha=self.autocorrel_alpha, subsample=numpy.mean)
            avg, lb_ci, ub_ci, sterr, correl_len = mclib.mcbs_ci_correl({'dataset': fluxes}, estimator=(lambda stride, dataset: numpy.mean(dataset)), alpha=self.alpha, n_sets=self.n_sets,
                                                                 autocorrel_alpha=self.autocorrel_alpha, subsample=numpy.mean, do_correl=self.do_correl, mcbs_enable=self.mcbs_enable )
            avg_fluxdata[itarget] = (self.iter_range.iter_start, self.iter_range.iter_stop, avg, lb_ci, ub_ci, sterr, correl_len)
            westpa.rc.pstatus('target {!r}:'.format(target_label))
            westpa.rc.pstatus('  correlation length = {} tau'.format(correl_len))
            westpa.rc.pstatus('  mean flux and CI   = {:e} ({:e},{:e}) tau^(-1)'.format(avg,lb_ci,ub_ci))
            index[itarget]['mean_flux'] = avg
            index[itarget]['mean_flux_ci_lb'] = lb_ci
            index[itarget]['mean_flux_ci_ub'] = ub_ci
            index[itarget]['mean_flux_correl_len'] = correl_len

        # Write index and summary        
        index_ds = output_group.create_dataset('index', data=index)
        index_ds.attrs['mcbs_alpha'] = self.alpha
        index_ds.attrs['mcbs_autocorrel_alpha'] = self.autocorrel_alpha
        index_ds.attrs['mcbs_n_sets'] = self.n_sets
        
        self.fluxdata = fluxdata
        self.output_h5file['avg_flux'] = avg_fluxdata
        
        
         
    def calc_evol_flux(self):
        westpa.rc.pstatus('Calculating cumulative evolution of flux confidence intervals every {} iteration(s)'
                        .format(self.evol_step))
        
        for itarget, (target_label, target_fluxdata) in enumerate(self.fluxdata.items()):
            fluxes = target_fluxdata['flux']
            target_group = self.target_groups[target_label]
            iter_start = target_group['n_iter'][0]
            iter_stop  = target_group['n_iter'][-1]
            iter_count = iter_stop - iter_start
            n_blocks = iter_count // self.evol_step
            if iter_count % self.evol_step > 0: n_blocks += 1
            
            cis = numpy.empty((n_blocks,), dtype=ci_dtype)
            
            for iblock in range(n_blocks):
                block_iter_stop = min(iter_start + (iblock+1)*self.evol_step, iter_stop)
                istop = min((iblock+1)*self.evol_step, len(target_fluxdata['flux']))
                fluxes = target_fluxdata['flux'][:istop]
                
                #avg, ci_lb, ci_ub, correl_len = mclib.mcbs_ci_correl(fluxes, numpy.mean, self.alpha, self.n_sets,
                #                                                     autocorrel_alpha = self.autocorrel_alpha,
                #                                                     subsample=numpy.mean)
                avg, ci_lb, ci_ub, sterr, correl_len = mclib.mcbs_ci_correl({'dataset': fluxes}, estimator=(lambda stride, dataset: numpy.mean(dataset)), alpha=self.alpha, n_sets=self.n_sets,
                                                                     autocorrel_alpha = self.autocorrel_alpha,
                                                                     subsample=numpy.mean, do_correl=self.do_correl, mcbs_enable=self.mcbs_enable )
                cis[iblock]['iter_start'] = iter_start
                cis[iblock]['iter_stop']  = block_iter_stop
                cis[iblock]['expected'], cis[iblock]['ci_lbound'], cis[iblock]['ci_ubound'] = avg, ci_lb, ci_ub
                cis[iblock]['corr_len'] = correl_len
                cis[iblock]['sterr'] = sterr
                
                del fluxes

            cis_ds = target_group.create_dataset('flux_evolution', data=cis)
            cis_ds.attrs['iter_step'] = self.evol_step
            cis_ds.attrs['mcbs_alpha'] = self.alpha
            cis_ds.attrs['mcbs_autocorrel_alpha'] = self.autocorrel_alpha
            cis_ds.attrs['mcbs_n_sets'] = self.n_sets

        
    def go(self):
        self.calc_store_flux_data()
        if self.do_evol:
            self.calc_evol_flux()
Example #36
class WTraceTool(WESTTool):
    prog='w_trace'
    description = '''\
Trace individual WEST trajectories and emit (or calculate) quantities along the
trajectory.

Trajectories are specified as N_ITER:SEG_ID pairs. Each segment is traced back
to its initial point, and then various quantities (notably n_iter and seg_id)
are printed in order from initial point up until the given segment in the given
iteration.

Output is stored in several files, all named according to the pattern given by
the -o/--output-pattern parameter. The default output pattern is "traj_%d_%d",
where the printf-style format codes are replaced by the iteration number and
segment ID of the terminal segment of the trajectory being traced.

Individual datasets can be selected for writing using the -d/--dataset option
(which may be specified more than once). The simplest form is ``-d dsname``,
which causes data from dataset ``dsname`` along the trace to be stored to
HDF5.  The dataset is assumed to be stored on a per-iteration basis, with
the first dimension corresponding to seg_id and the second dimension
corresponding to time within the segment.  Further options are specified
as comma-separated key=value pairs after the data set name, as in

    -d dsname,alias=newname,index=idsname,file=otherfile.h5,slice=[100,...]
    
The following options for datasets are supported:

    alias=newname
        When writing this data to HDF5 or text files, use ``newname``
        instead of ``dsname`` to identify the dataset. This is mostly of
        use in conjunction with the ``slice`` option in order, e.g., to
        retrieve two different slices of a dataset and store them with
        different names for future use.

    index=idsname
        The dataset is not stored on a per-iteration basis for all
        segments, but instead is stored as a single dataset whose
        first dimension indexes n_iter/seg_id pairs. The index to
        these n_iter/seg_id pairs is ``idsname``.
    
    file=otherfile.h5
        Instead of reading data from the main WEST HDF5 file (usually
        ``west.h5``), read data from ``otherfile.h5``.
        
    slice=[100,...]
        Retrieve only the given slice from the dataset. This can be
        used to pick a subset of interest to minimize I/O.
        
-------------------------------------------------------------------------------
'''


    pcoord_formats = {'u8': '%20d',
                      'i8': '%20d',
                      'u4': '%10d',
                      'i4': '%11d',
                      'u2': '%5d',
                      'i2': '%6d',
                      'f4': '%14.7g',
                      'f8': '%023.15g'}
    
    def __init__(self):
        super(WTraceTool,self).__init__()
        
        self.data_reader = WESTDataReader()
        #self.h5storage = HDF5Storage()
        self.output_file = None
        self.output_pattern = None
        self.endpoints = None
        self.datasets = []

        
    # Interface for command-line tools
    def add_args(self, parser):
        self.data_reader.add_args(parser)
        #self.h5storage.add_args(parser)
        parser.add_argument('-d', '--dataset', dest='datasets',
                            #this breaks argparse (see http://bugs.python.org/issue11874) 
                            #metavar='DSNAME[,alias=ALIAS][,index=INDEX][,file=FILE][,slice=SLICE]',
                            metavar='DSNAME',
                            action='append',
                            help='''Include the dataset named DSNAME in trace output. An extended form like
                            DSNAME[,alias=ALIAS][,index=INDEX][,file=FILE][,slice=SLICE] will
                            obtain the dataset from the given FILE instead of the main WEST HDF5 file,
                            slice it by SLICE, call it ALIAS in output, and/or access per-segment data by an n_iter,seg_id
                            INDEX instead of a seg_id indexed dataset in the group for n_iter.''')
        parser.add_argument('endpoints',  metavar='N_ITER:SEG_ID', nargs='+',
                            help='''Trace trajectory ending (or at least alive at) N_ITER:SEG_ID.''')
        
        #tgroup = parser.add_argument_group('trace options')
        ogroup = parser.add_argument_group('output options')
        ogroup.add_argument('--output-pattern', default='traj_%d_%d',
                            help='''Write per-trajectory data to output files/HDF5 groups whose names begin with OUTPUT_PATTERN,
                                 which must contain two printf-style format flags which will be replaced with the iteration number
                                 and segment ID of the terminal segment of the trajectory being traced.
                                 (Default: %(default)s.)''')
        ogroup.add_argument('-o', '--output', default='trajs.h5',
                            help='Store intermediate data and analysis results to OUTPUT (default: %(default)s).')
        
    
    def process_args(self, args):
        self.data_reader.process_args(args)
        #self.h5storage.process_args(args)
        self.endpoints = [[int(field) for field in endpoint.split(':')] for endpoint in args.endpoints]
        self.output_pattern = args.output_pattern
        
        for dsstr in args.datasets or []:
            self.datasets.append(self.parse_dataset_string(dsstr))        
        
        #self.h5storage.open_analysis_h5file()
        self.output_file = h5py.File(args.output, 'a')
        
    def parse_dataset_string(self, dsstr):
        dsinfo = {}

        r = re.compile(r',(?=[^\]]*(?:\[|$))')
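        # split on commas that are not enclosed in square brackets, so that slice
        # expressions such as slice=[100,...] are not broken apart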
        fields = r.split(dsstr)

        dsinfo['dsname'] = fields[0]

        for field in (field.strip() for field in fields[1:]):
            k,v = field.split('=')
            k = k.lower()
            if k in ('alias', 'file', 'index'):
                dsinfo[k] = v
            elif k == 'slice':
                try:
                    dsinfo['slice'] = eval('numpy.index_exp' + v)
                except SyntaxError:
                    raise SyntaxError('invalid index expression {!r}'.format(v))
            else:
                raise ValueError('invalid dataset option {!r}'.format(k))
            
        return dsinfo
    
    def go(self):
        self.data_reader.open('r')
        
        #Create a new 'trajectories' group if this is the first trace
        try:
            trajs_group = h5io.create_hdf5_group(self.output_file, 'trajectories', replace=False, creating_program=self.prog)
        except ValueError:
            trajs_group = self.output_file['trajectories']
        
        for n_iter, seg_id in self.endpoints:
            trajname = self.output_pattern % (n_iter,seg_id)
            trajgroup = trajs_group.create_group(trajname)

            trace = Trace.from_data_manager(n_iter,seg_id, self.data_reader.data_manager)
            
            with open(trajname + '_trace.txt', 'wt') as trace_output:
                self.emit_trace_text(trace, trace_output)
                
            self.emit_trace_h5(trace, trajgroup)
            
            aux_h5files = {}
            for dsinfo in self.datasets:
                dsname = dsinfo['dsname']
                filename = dsinfo.get('file')
                if filename:
                    try:
                        aux_h5file = aux_h5files[filename]
                    except KeyError:
                        aux_h5file = aux_h5files[filename] = h5py.File(filename, 'r')
                else:
                    aux_h5file = None
                    
                slice_ = dsinfo.get('slice')
                alias = dsinfo.get('alias', dsname)
                index = dsinfo.get('index')
                
                data, weights = trace.trace_timepoint_dataset(dsname, auxfile=aux_h5file, slice_=slice_,index_ds=index)
                
                # Save data to HDF5
                try:
                    del trajgroup[alias]
                except KeyError:
                    pass
                trajgroup[alias] = data
                
                # All weight vectors will be the same length, so only store in HDF5 once
                if not ('weights' in trajgroup and trajgroup['weights'].shape == weights.shape):
                    try:
                        del trajgroup['weights']
                    except KeyError:
                        pass    
                    trajgroup['weights'] = weights
                            
    def emit_trace_h5(self, trace, output_group):
        for dsname in ('basis_state', 'initial_state', 'segments'):
            try:
                del output_group[dsname]
            except KeyError:
                pass
        
        if trace.basis_state:
            output_group['basis_state'] = trace.basis_state.as_numpy_record()
        output_group['initial_state'] = trace.initial_state.as_numpy_record()
        output_group['segments'] = trace.summary
            
    def emit_trace_text(self, trace, output_file):
        '''Dump summary information about each segment in the given trace to the given output_file,
        which must be opened for writing in text mode.  Output columns are separated by at least
        one space.'''
        
        if not trace:
            return
                
        pcoord_ndim = trace[0]['final_pcoord'].shape[0]
        lastseg = trace[-1]
        len_n_iter = max(6, len(str(lastseg['n_iter'])))
        len_seg_id = max(6, max(len(str(seg_id)) for seg_id in trace['seg_id']))
        seg_pattern = '    '.join(['{n_iter:{len_n_iter}d}',
                                   '{seg_id:{len_seg_id}d}',
                                   '{weight:22.17e}',
                                   '{walltime:10.6g}',
                                   '{cputime:10.6g}',
                                   '{pcoord_str:s}'
                                   ]) + '\n'
                                  
        
        output_file.write('''\
# Trace of trajectory ending in n_iter:seg_id {n_iter:d}:{seg_id:d} (endpoint type {endpoint_type_text:s})   
# column  0: iteration (0 => initial state)
# column  1: seg_id (or initial state ID)
# column  2: weight
# column  3: wallclock time (s)
# column  4: CPU time (s)
'''.format(n_iter = int(lastseg['n_iter']),
           seg_id = int(lastseg['seg_id']),
           endpoint_type_text = Segment.endpoint_type_names[trace.endpoint_type]))
        
        
        if pcoord_ndim == 1:
            output_file.write('''\
# column  5: final progress coordinate value            
''')
        else:
            fpcbegin = 5
            fpcend = fpcbegin + pcoord_ndim - 1
            output_file.write('''\
# columns {fpcbegin:d} -- {fpcend:d}: final progress coordinate value
'''.format(fpcbegin=fpcbegin,fpcend=fpcend))
        
        
        pcoord_formats = self.pcoord_formats
        
        # Output row for initial state
        initial_state = trace.initial_state
        pcoord_str = '    '.join(pcoord_formats.get(pcfield.dtype.str[1:], '%s') % pcfield 
                                 for pcfield in initial_state.pcoord)
        output_file.write(seg_pattern.format(n_iter=0, seg_id=initial_state.state_id,
                                             weight=0.0, walltime=0, cputime=0, pcoord_str=pcoord_str,
                                             len_n_iter=len_n_iter,len_seg_id=len_seg_id))
        
        # Output rows for segments
        for segment in trace:            
            pcoord_str = '    '.join(pcoord_formats.get(pcfield.dtype.str[1:], '%s') % pcfield 
                                     for pcfield in segment['final_pcoord'])
            output_file.write(seg_pattern.format(n_iter = int(segment['n_iter']),
                                                 seg_id = int(segment['seg_id']),
                                                 weight = float(segment['weight']),
                                                 walltime = float(segment['walltime']),
                                                 cputime = float(segment['cputime']), 
                                                 pcoord_str=pcoord_str,
                                                 len_n_iter=len_n_iter,
                                                 len_seg_id=len_seg_id))
Example #37
class StateProbTool(WESTParallelTool):
    prog = 'w_stateprobs'
    description = '''\
Calculate average populations and associated errors in state populations from
weighted ensemble data. Bin assignments, including macrostate definitions,
are required. (See "w_assign --help" for more information).

-----------------------------------------------------------------------------
Output format
-----------------------------------------------------------------------------

The output file (-o/--output, usually "stateprobs.h5") contains the following
dataset:

  /avg_state_pops [state]
    (Structured -- see below) Population of each state across entire
    range specified.

If --evolution-mode is specified, then the following additional dataset is
available:

  /state_pop_evolution [window][state]
    (Structured -- see below). State populations based on windows of
    iterations of varying width.  If --evolution-mode=cumulative, then
    these windows all begin at the iteration specified with
    --start-iter and grow in length by --step-iter for each successive 
    element. If --evolution-mode=blocked, then these windows are all of
    width --step-iter (excluding the last, which may be shorter), the first
    of which begins at iteration --start-iter.
    
The structure of these datasets is as follows:

  iter_start
    (Integer) Iteration at which the averaging window begins (inclusive).
    
  iter_stop
    (Integer) Iteration at which the averaging window ends (exclusive).
    
  expected
    (Floating-point) Expected (mean) value of the state population as
    evaluated within this window.

  ci_lbound
    (Floating-point) Lower bound of the confidence interval on the state
    population within this window.

  ci_ubound
    (Floating-point) Upper bound of the confidence interval on the state
    population within this window.

  corr_len
    (Integer) Correlation length of the state population within this window,
    in units of tau.

Each of these datasets is also stamped with a number of attributes:

  mcbs_alpha
    (Floating-point) Alpha value of confidence intervals. (For example, 
    *alpha=0.05* corresponds to a 95% confidence interval.)

  mcbs_nsets
    (Integer) Number of bootstrap data sets used in generating confidence
    intervals.
    
  mcbs_acalpha
    (Floating-point) Alpha value for determining correlation lengths.
   

-----------------------------------------------------------------------------
Command-line options
-----------------------------------------------------------------------------
'''

    def __init__(self):
        super(StateProbTool, self).__init__()

        self.data_reader = WESTDataReader()
        self.iter_range = IterRangeSelection()
        self.progress = ProgressIndicatorComponent()

        self.output_filename = None
        self.kinetics_filename = None

        self.output_file = None
        self.assignments_file = None

        self.evolution_mode = None

        self.mcbs_alpha = None
        self.mcbs_acalpha = None
        self.mcbs_nsets = None

    def stamp_mcbs_info(self, dataset):
        dataset.attrs['mcbs_alpha'] = self.mcbs_alpha
        dataset.attrs['mcbs_acalpha'] = self.mcbs_acalpha
        dataset.attrs['mcbs_nsets'] = self.mcbs_nsets

    def add_args(self, parser):
        self.progress.add_args(parser)
        self.data_reader.add_args(parser)
        self.iter_range.include_args['iter_step'] = True
        self.iter_range.add_args(parser)

        iogroup = parser.add_argument_group('input/output options')
        iogroup.add_argument(
            '-a',
            '--assignments',
            default='assign.h5',
            help='''Bin assignments and macrostate definitions are in ASSIGNMENTS
                            (default: %(default)s).''')
        iogroup.add_argument(
            '-o',
            '--output',
            dest='output',
            default='stateprobs.h5',
            help='''Store results in OUTPUT (default: %(default)s).''')

        cgroup = parser.add_argument_group(
            'confidence interval calculation options')
        cgroup.add_argument('--alpha',
                            type=float,
                            default=0.05,
                            help='''Calculate a (1-ALPHA) confidence interval
                             (default: %(default)s)''')
        cgroup.add_argument(
            '--autocorrel-alpha',
            type=float,
            dest='acalpha',
            metavar='ACALPHA',
            help='''Evaluate autocorrelation to (1-ACALPHA) significance.
                             Note that too small an ACALPHA will result in failure to detect autocorrelation
                             in a noisy flux signal. (Default: same as ALPHA.)'''
        )
        cgroup.add_argument(
            '--nsets',
            type=int,
            help=
            '''Use NSETS samples for bootstrapping (default: chosen based on ALPHA)'''
        )

        cogroup = parser.add_argument_group('calculation options')
        cogroup.add_argument(
            '-e',
            '--evolution-mode',
            choices=['cumulative', 'blocked', 'none'],
            default='none',
            help='''How to calculate time evolution of state population estimates.
                             ``cumulative`` evaluates populations over windows starting with --start-iter and getting progressively
                             wider to --stop-iter by steps of --step-iter.
                             ``blocked`` evaluates populations over windows of width --step-iter, the first of which begins at
                             --start-iter.
                             ``none`` (the default) disables calculation of the time evolution of population estimates.'''
        )

    def open_files(self):
        self.output_file = h5io.WESTPAH5File(self.output_filename,
                                             'w',
                                             creating_program=True)
        h5io.stamp_creator_data(self.output_file)
        self.assignments_file = h5io.WESTPAH5File(
            self.assignments_filename,
            'r')  #, driver='core', backing_store=False)
        if not self.iter_range.check_data_iter_range_least(
                self.assignments_file):
            raise ValueError(
                'assignments data do not span the requested iterations')

    def process_args(self, args):
        self.progress.process_args(args)
        self.data_reader.process_args(args)
        with self.data_reader:
            self.iter_range.process_args(args, default_iter_step=None)
        if self.iter_range.iter_step is None:
            #use about 10 blocks by default
            self.iter_range.iter_step = max(
                1,
                (self.iter_range.iter_stop - self.iter_range.iter_start) // 10)

        self.output_filename = args.output
        self.assignments_filename = args.assignments

        self.mcbs_alpha = args.alpha
        self.mcbs_acalpha = args.acalpha if args.acalpha else self.mcbs_alpha
        self.mcbs_nsets = args.nsets if args.nsets else mclib.get_bssize(
            self.mcbs_alpha)

        self.evolution_mode = args.evolution_mode

    def calc_state_pops(self):
        start_iter, stop_iter = self.iter_range.iter_start, self.iter_range.iter_stop
        nstates = self.nstates
        state_map = self.state_map
        iter_count = stop_iter - start_iter

        pi = self.progress.indicator
        pi.new_operation('Calculating state populations')
        pops = h5io.IterBlockedDataset(
            self.assignments_file['labeled_populations'])

        iter_state_pops = numpy.empty((nstates + 1, ), weight_dtype)
        all_state_pops = numpy.empty((iter_count, nstates + 1), weight_dtype)
        avg_state_pops = numpy.zeros((nstates + 1, ), weight_dtype)
        pops.cache_data(max_size='available')
        try:
            for iiter, n_iter in enumerate(range(start_iter, stop_iter)):
                iter_state_pops.fill(0)
                labeled_pops = pops.iter_entry(n_iter)
                accumulate_state_populations_from_labeled(
                    labeled_pops,
                    state_map,
                    iter_state_pops,
                    check_state_map=False)
                all_state_pops[iiter] = iter_state_pops
                avg_state_pops += iter_state_pops
                del labeled_pops
                pi.progress += 1
        finally:
            pops.drop_cache()
        self.output_file.create_dataset('state_pops',
                                        data=all_state_pops,
                                        compression=9,
                                        shuffle=True)
        h5io.stamp_iter_range(self.output_file['state_pops'], start_iter,
                              stop_iter)

        self.all_state_pops = all_state_pops
        avg_state_pops = numpy.zeros((nstates + 1, ), ci_dtype)
        pi.new_operation('Calculating overall average populations and CIs',
                         nstates)

        #        futures = []
        #         for istate in xrange(nstates):
        #             futures.append(self.work_manager.submit(_eval_block,kwargs=dict(iblock=None,istate=istate,
        #                                                                             start=start_iter,stop=stop_iter,
        #                                                                             state_pops=all_state_pops[:,istate],
        #                                                                             mcbs_alpha=self.mcbs_alpha, mcbs_nsets=self.mcbs_nsets,
        #                                                                             mcbs_acalpha = self.mcbs_acalpha)))
        #         for future in self.work_manager.as_completed(futures):
        def taskgen():
            for istate in range(nstates):
                yield (_eval_block, (),
                       dict(iblock=None,
                            istate=istate,
                            start=start_iter,
                            stop=stop_iter,
                            state_pops=all_state_pops[:, istate],
                            mcbs_alpha=self.mcbs_alpha,
                            mcbs_nsets=self.mcbs_nsets,
                            mcbs_acalpha=self.mcbs_acalpha))

        for future in self.work_manager.submit_as_completed(
                taskgen(), self.max_queue_len):
            (_iblock, istate, ci_res) = future.get_result(discard=True)
            avg_state_pops[istate] = ci_res
            pi.progress += 1
        self.output_file['avg_state_pops'] = avg_state_pops
        self.stamp_mcbs_info(self.output_file['avg_state_pops'])
        pi.clear()

        maxlabellen = max(map(len, self.state_labels))
        print('average state populations:')
        for istate in range(nstates):
            print(
                '{:{maxlabellen}s}: mean={:21.15e} CI=({:21.15e}, {:21.15e})'.
                format(self.state_labels[istate],
                       avg_state_pops['expected'][istate],
                       avg_state_pops['ci_lbound'][istate],
                       avg_state_pops['ci_ubound'][istate],
                       maxlabellen=maxlabellen))

    def calc_evolution(self):
        nstates = self.nstates
        start_iter, stop_iter, step_iter = self.iter_range.iter_start, self.iter_range.iter_stop, self.iter_range.iter_step
        start_pts = range(start_iter, stop_iter, step_iter)

        pop_evol = numpy.zeros((len(start_pts), nstates), dtype=ci_dtype)

        pi = self.progress.indicator
        pi.new_operation('Calculating population evolution',
                         len(start_pts) * nstates)

        #         futures = []
        #         for iblock, start in enumerate(start_pts):
        #             if self.evolution_mode == 'cumulative':
        #                 block_start = start_iter
        #             else: # self.evolution_mode == 'blocked'
        #                 block_start = start
        #             stop = min(start+step_iter, stop_iter)
        #
        #             for istate in xrange(nstates):
        #                 future = self.work_manager.submit(_eval_block,kwargs=dict(iblock=iblock,istate=istate,
        #                                                                           start=block_start,stop=stop,
        #                                                                           state_pops=self.all_state_pops[block_start-start_iter:stop-start_iter,istate],
        #                                                                           mcbs_alpha=self.mcbs_alpha, mcbs_nsets=self.mcbs_nsets,
        #                                                                           mcbs_acalpha = self.mcbs_acalpha))
        #                 futures.append(future)
        def taskgen():
            for iblock, start in enumerate(start_pts):
                if self.evolution_mode == 'cumulative':
                    block_start = start_iter
                else:  # self.evolution_mode == 'blocked'
                    block_start = start
                stop = min(start + step_iter, stop_iter)

                for istate in range(nstates):
                    yield (_eval_block, (),
                           dict(
                               iblock=iblock,
                               istate=istate,
                               start=block_start,
                               stop=stop,
                               state_pops=self.all_state_pops[block_start -
                                                              start_iter:stop -
                                                              start_iter,
                                                              istate],
                               mcbs_alpha=self.mcbs_alpha,
                               mcbs_nsets=self.mcbs_nsets,
                               mcbs_acalpha=self.mcbs_acalpha))

        #for future in self.work_manager.as_completed(futures):
        for future in self.work_manager.submit_as_completed(
                taskgen(), self.max_queue_len):
            (iblock, istate, ci_res) = future.get_result(discard=True)
            pop_evol[iblock, istate] = ci_res
            pi.progress += 1

        self.output_file.create_dataset('state_pop_evolution',
                                        data=pop_evol,
                                        shuffle=True,
                                        compression=9)
        pi.clear()

    def go(self):
        pi = self.progress.indicator
        with pi:
            pi.new_operation('Initializing')
            self.open_files()
            nstates = self.nstates = self.assignments_file.attrs['nstates']

            state_labels = self.state_labels = self.assignments_file[
                'state_labels'][...]
            state_map = self.state_map = self.assignments_file['state_map'][
                ...]
            if (state_map > nstates).any():
                raise ValueError('invalid state mapping')

            # copy metadata to output
            self.output_file.attrs['nstates'] = nstates
            self.output_file['state_labels'] = state_labels

            # calculate overall averages
            self.calc_state_pops()

            # calculate evolution, if requested
            if self.evolution_mode != 'none' and self.iter_range.iter_step:
                self.calc_evolution()
Example #38
class WAssign(WESTParallelTool):
    prog='w_assign'
    description = '''\
Assign walkers to bins, producing a file (by default named "assign.h5")
which can be used in subsequent analysis.

For consistency in subsequent analysis operations, the entire dataset
must be assigned, even if only a subset of the data will be used. This
ensures that analyses that rely on tracing trajectories always know the
originating bin of each trajectory.


-----------------------------------------------------------------------------
Source data
-----------------------------------------------------------------------------

Source data is provided either by a user-specified function
(--construct-dataset) or a list of "data set specifications" (--dsspecs).
If neither is provided, the progress coordinate dataset ``pcoord`` is used.

To use a custom function to extract or calculate data whose probability
distribution will be calculated, specify the function in standard Python
MODULE.FUNCTION syntax as the argument to --construct-dataset. This function
will be called as function(n_iter,iter_group), where n_iter is the iteration
whose data are being considered and iter_group is the corresponding group
in the main WEST HDF5 file (west.h5). The function must return data which can
be indexed as [segment][timepoint][dimension].
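
For example, a minimal construct-dataset function (a sketch only; the dataset
name ``auxdata/rmsd`` is an illustrative assumption) might look like::

    import numpy

    def load_rmsd(n_iter, iter_group):
        # shape [segment][timepoint]; add a trailing dimension axis
        rmsd = iter_group['auxdata/rmsd'][...]
        return rmsd[:, :, numpy.newaxis]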

To use a list of data set specifications, specify --dsspecs and then list the
desired datasets one-by-one (space-separated in most shells). These data set
specifications are formatted as NAME[,file=FILENAME,slice=SLICE], which will
use the dataset called NAME in the HDF5 file FILENAME (defaulting to the main
WEST HDF5 file west.h5), and slice it with the Python slice expression SLICE
(as in [0:2] to select the first two elements of the first axis of the
dataset). The ``slice`` option is most useful for selecting one column (or
more) from a multi-column dataset, such as arises when using a progress
coordinate of multiple dimensions.


-----------------------------------------------------------------------------
Specifying macrostates
-----------------------------------------------------------------------------

Optionally, kinetic macrostates may be defined in terms of sets of bins.
Each trajectory will be labeled with the kinetic macrostate it was most
recently in at each timepoint, for use in subsequent kinetic analysis.
This is required for all kinetics analysis (w_kintrace and w_kinmat).

There are three ways to specify macrostates:
  
  1. States corresponding to single bins may be identified on the command
     line using the --states option, which takes multiple arguments, one for
     each state (separated by spaces in most shells). Each state is specified
     as a coordinate tuple, with an optional label prepended, as in
     ``bound:1.0`` or ``unbound:(2.5,2.5)``. Unlabeled states are named
     ``stateN``, where N is the (zero-based) position in the list of states
     supplied to --states.
     
  2. States corresponding to multiple bins may use a YAML input file specified
     with --states-from-file. This file defines a list of states, each with a
     name and a list of coordinate tuples; bins containing these coordinates
     will be mapped to the containing state. For instance, the following
     file::

        ---
        states:
          - label: unbound
            coords:
              - [9.0, 1.0]
              - [9.0, 2.0]
          - label: bound
            coords:
              - [0.1, 0.0]

     produces two macrostates: the first state is called "unbound" and
     consists of bins containing the (2-dimensional) progress coordinate
     values (9.0, 1.0) and (9.0, 2.0); the second state is called "bound"
     and consists of the single bin containing the point (0.1, 0.0).
     
  3. Arbitrary state definitions may be supplied by a user-defined function,
     specified as --states-from-function=MODULE.FUNCTION. This function is
     called with the bin mapper as an argument (``function(mapper)``) and must
     return a list of dictionaries, one per state. Each dictionary must contain
     a vector of coordinate tuples with key "coords"; the bins into which each
     of these tuples falls define the state. An optional name for the state
     (with key "label") may also be provided.
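
     A minimal sketch of such a function (the coordinate values below are
     placeholders for illustration) might look like::

        def gen_states(mapper):
            # the mapper argument is available for looking up bin boundaries if needed;
            # each dictionary gives one list of coordinate tuples per macrostate
            return [{'label': 'unbound', 'coords': [[9.0, 1.0], [9.0, 2.0]]},
                    {'label': 'bound', 'coords': [[0.1, 0.0]]}]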


-----------------------------------------------------------------------------
Output format
-----------------------------------------------------------------------------

The output file (-o/--output, by default "assign.h5") contains the following
attributes and datasets:

  ``nbins`` attribute
    *(Integer)* Number of valid bins. Bin assignments range from 0 to
    *nbins*-1, inclusive.

  ``nstates`` attribute
    *(Integer)* Number of valid macrostates (may be zero if no such states are
    specified). Trajectory ensemble assignments range from 0 to *nstates*-1,
    inclusive, when states are defined.

  ``/assignments`` [iteration][segment][timepoint]
    *(Integer)* Per-segment and -timepoint assignments (bin indices).

  ``/npts`` [iteration]
    *(Integer)* Number of timepoints in each iteration.

  ``/nsegs`` [iteration]
    *(Integer)* Number of segments in each iteration.

  ``/labeled_populations`` [iterations][state][bin]
    *(Floating-point)* Per-iteration and -timepoint bin populations, labeled
    by most recently visited macrostate. The last state entry (*nstates-1*)
    corresponds to trajectories initiated outside of a defined macrostate.

  ``/bin_labels`` [bin]
    *(String)* Text labels of bins.

When macrostate assignments are given, the following additional datasets are
present:

  ``/trajlabels`` [iteration][segment][timepoint]
    *(Integer)* Per-segment and -timepoint trajectory labels, indicating the
    macrostate which each trajectory last visited.

  ``/state_labels`` [state]
    *(String)* Labels of states.

  ``/state_map`` [bin]
    *(Integer)* Mapping of bin index to the macrostate containing that bin.
    An entry will contain *nstates* if that bin does not fall into a
    macrostate.
    
Datasets indexed by state and bin contain one more entry than the number of
valid states or bins. For *N* bins, axes indexed by bin are of size *N+1*, and
entry *N* (0-based indexing) corresponds to a walker outside of the defined bin
space (which will cause most mappers to raise an error). More importantly, for
*M* states (including the case *M=0* where no states are specified), axes
indexed by state are of size *M+1* and entry *M* refers to trajectories
initiated in a region not corresponding to a defined macrostate.

Thus, ``labeled_populations[:,:,:].sum(axis=1)[:,:-1]`` gives overall per-bin
populations for all defined bins, and
``labeled_populations[:,:,:].sum(axis=2)[:,:-1]`` gives overall
per-trajectory-ensemble populations for all defined states.
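
A minimal sketch of computing these sums from the output file (``assign.h5`` is
the default output name; h5py is assumed to be available)::

    import h5py

    with h5py.File('assign.h5', 'r') as f:
        labeled_pops = f['labeled_populations'][...]
        per_bin_pops = labeled_pops.sum(axis=1)[:, :-1]    # [iteration][bin]
        per_state_pops = labeled_pops.sum(axis=2)[:, :-1]  # [iteration][state]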

    
-----------------------------------------------------------------------------
Parallelization
-----------------------------------------------------------------------------

This tool supports parallelized binning, including reading/calculating input
data.


-----------------------------------------------------------------------------
Command-line options
-----------------------------------------------------------------------------
'''
    
    def __init__(self):
        super(WAssign,self).__init__()
        
        # Parallel processing by default (this is not actually necessary, but it is
        # informative!)
        self.wm_env.default_work_manager = self.wm_env.default_parallel_work_manager
        
        self.data_reader = WESTDataReader()
        self.dssynth = WESTDSSynthesizer(default_dsname='pcoord')
        self.binning = BinMappingComponent()
        self.progress = ProgressIndicatorComponent()
        self.output_file = None
        self.output_filename = None
        self.states = []
    
    def add_args(self, parser):
        self.data_reader.add_args(parser)
        self.binning.add_args(parser, suppress=['--bins-from-h5file'])
        self.dssynth.add_args(parser)
        
        sgroup = parser.add_argument_group('macrostate definitions').add_mutually_exclusive_group()
        sgroup.add_argument('--states', nargs='+', metavar='STATEDEF',
                            help='''Single-bin kinetic macrostate, specified by a coordinate tuple (e.g. '1.0' or '[1.0,1.0]'),
                            optionally labeled (e.g. 'bound:[1.0,1.0]'). States corresponding to multiple bins
                            must be specified with --states-from-file.''')
        sgroup.add_argument('--states-from-file', metavar='STATEFILE',
                            help='''Load kinetic macrostates from the YAML file STATEFILE. See description
                            above for the appropriate structure.''')
        sgroup.add_argument('--states-from-function', metavar='STATEFUNC',
                            help='''Load kinetic macrostates from the function STATEFUNC, specified as
                            module_name.func_name. This function is called with the bin mapper as an argument,
                            and must return a list of dictionaries {'label': state_label, 'coords': 2d_array_like},
                            one for each macrostate; the 'coords' entry must contain enough rows to identify all bins
                            in the macrostate.''')

        agroup = parser.add_argument_group('other options')
        agroup.add_argument('-o', '--output', dest='output', default='assign.h5',
                            help='''Store results in OUTPUT (default: %(default)s).''')


    def process_args(self, args):
        self.progress.process_args(args)
        self.data_reader.process_args(args)

        with self.data_reader:
            self.dssynth.h5filename = self.data_reader.we_h5filename
            self.dssynth.process_args(args)
            self.binning.process_args(args)

        if args.states:
            self.parse_cmdline_states(args.states)
        elif args.states_from_file:
            self.load_state_file(args.states_from_file)
        elif args.states_from_function:
            self.load_states_from_function(get_object(args.states_from_function,path=['.']))

        if self.states and len(self.states) < 2:
            raise ValueError('zero, two, or more macrostates are required')

        #self.output_file = WESTPAH5File(args.output, 'w', creating_program=True)
        self.output_filename = args.output
        log.debug('state list: {!r}'.format(self.states))

    def parse_cmdline_states(self, state_strings):
        states = []
        for istring, state_string in enumerate(state_strings):
            try:
                (label, coord_str) = state_string.split(':')
            except ValueError:
                label = 'state{}'.format(istring)
                coord_str = state_string
            coord = parse_pcoord_value(coord_str)
            states.append({'label': label, 'coords': coord})
        self.states = states

    def load_state_file(self, state_filename):
        import yaml
        ydict = yaml.safe_load(open(state_filename, 'rt'))
        ystates = ydict['states']
        
        states = []
        for istate, ystate in enumerate(ystates):
            state = {}
            state['label'] = ystate.get('label', 'state{}'.format(istate))
            # coords can be:
            #  - a scalar, in which case it is one bin, 1-D
            #  - a single list, which is rejected as ambiguous
            #  - a list of lists, which is a list of coordinate tuples
            coords = numpy.array(ystate['coords'])
            if coords.ndim == 0:
                coords.shape = (1,1)
            elif coords.ndim == 1:
                raise ValueError('list {!r} is ambiguous (list of 1-d coordinates, or single multi-d coordinate?)'
                                 .format(ystate['coords']))
            elif coords.ndim > 2:
                raise ValueError('coordinates must be 2-D')
            state['coords'] = coords
            states.append(state)
        self.states = states

    def load_states_from_function(self, statefunc):
        states = statefunc(self.binning.mapper)
        for istate, state in enumerate(states):
            state.setdefault('label','state{}'.format(istate))
            try:
                state['coords'] = numpy.array(state['coords'])
            except KeyError:
                raise ValueError('state function {!r} returned a state {!r} without coordinates'.format(statefunc,state))
        self.states = states
        log.debug('loaded states: {!r}'.format(self.states))


    def assign_iteration(self, n_iter, nstates, nbins, state_map, last_labels):
        ''' Method to encapsulate the segment slicing (into n_worker slices) and parallel job submission
            Submits job(s), waits on completion, splices them back together
            Returns: assignments, trajlabels, pops for this iteration'''

        futures = []

        iter_group = self.data_reader.get_iter_group(n_iter)
        nsegs, npts = iter_group['pcoord'].shape[:2]
        n_workers = self.work_manager.n_workers or 1
        assignments = numpy.empty((nsegs, npts), dtype=index_dtype)
        trajlabels = numpy.empty((nsegs, npts), dtype=index_dtype)
        pops = numpy.zeros((nstates+1,nbins+1), dtype=weight_dtype)

        #Submit jobs to work manager
        blocksize = nsegs // n_workers
        if nsegs % n_workers > 0:
            blocksize += 1

        def task_gen():
            if __debug__:
                checkset = set()
            for lb in xrange(0, nsegs, blocksize):
                ub = min(nsegs, lb+blocksize)
                if __debug__:
                    checkset.update(set(xrange(lb,ub)))
                args = ()
                kwargs = dict(n_iter=n_iter,
                              lb=lb, ub=ub, mapper=self.binning.mapper, nstates=nstates, state_map=state_map,
                              last_labels=last_labels, 
                              parent_id_dsspec=self.data_reader.parent_id_dsspec, 
                              weight_dsspec=self.data_reader.weight_dsspec,
                              pcoord_dsspec=self.dssynth.dsspec)
                yield (_assign_label_pop, args, kwargs)

                #futures.append(self.work_manager.submit(_assign_label_pop, 
                #kwargs=)
            if __debug__:
                assert checkset == set(xrange(nsegs)), 'segments missing: {}'.format(set(xrange(nsegs)) - checkset)

        #for future in self.work_manager.as_completed(futures):
        for future in self.work_manager.submit_as_completed(task_gen(), queue_size=self.max_queue_len):
            assign_slice, traj_slice, slice_pops, lb, ub = future.get_result(discard=True)
            assignments[lb:ub, :] = assign_slice
            trajlabels[lb:ub, :] = traj_slice
            pops += slice_pops
            del assign_slice, traj_slice, slice_pops

        del futures
        return (assignments, trajlabels, pops)

    def go(self):
        assert self.data_reader.parent_id_dsspec._h5file is None
        assert self.data_reader.weight_dsspec._h5file is None
        if hasattr(self.dssynth.dsspec, '_h5file'):
            assert self.dssynth.dsspec._h5file is None
        pi = self.progress.indicator
        pi.operation = 'Initializing'
        with pi, self.data_reader, WESTPAH5File(self.output_filename, 'w', creating_program=True) as self.output_file:
            assign = self.binning.mapper.assign

            # We always assign the entire simulation, so that no trajectory appears to start
            # in a transition region that doesn't get initialized in one.
            iter_start = 1 
            iter_stop =  self.data_reader.current_iteration

            h5io.stamp_iter_range(self.output_file, iter_start, iter_stop)

            nbins = self.binning.mapper.nbins
            self.output_file.attrs['nbins'] = nbins 

            state_map = numpy.empty((self.binning.mapper.nbins+1,), index_dtype)
            state_map[:] = 0 # default when no macrostates are defined; overwritten below if states are given

            # Recursive mappers produce a generator rather than a list of labels
            # so consume the entire generator into a list
            labels = [label for label in self.binning.mapper.labels]

            self.output_file.create_dataset('bin_labels', data=labels, compression=9)

            if self.states:
                nstates = len(self.states)
                state_map[:] = nstates # state_id == nstates => unknown state
                state_labels = [state['label'] for state in self.states]

                for istate, sdict in enumerate(self.states):
                    assert state_labels[istate] == sdict['label'] #sanity check
                    state_assignments = assign(sdict['coords'])
                    for assignment in state_assignments:
                        state_map[assignment] = istate
                self.output_file.create_dataset('state_map', data=state_map, compression=9, shuffle=True)
                self.output_file['state_labels'] = state_labels #+ ['(unknown)']
            else:
                nstates = 0
            self.output_file.attrs['nstates'] = nstates

            iter_count = iter_stop - iter_start
            nsegs = numpy.empty((iter_count,), seg_id_dtype)
            npts = numpy.empty((iter_count,), seg_id_dtype)

            # scan for largest number of segments and largest number of points
            pi.new_operation ('Scanning for segment and point counts', iter_stop-iter_start)
            for iiter, n_iter in enumerate(xrange(iter_start,iter_stop)):
                iter_group = self.data_reader.get_iter_group(n_iter)
                nsegs[iiter], npts[iiter] = iter_group['pcoord'].shape[0:2]
                pi.progress += 1
                del iter_group

            pi.new_operation('Preparing output')

            # create datasets
            self.output_file.create_dataset('nsegs', data=nsegs, shuffle=True, compression=9)
            self.output_file.create_dataset('npts', data=npts, shuffle=True, compression=9)

            max_nsegs = nsegs.max()
            max_npts = npts.max()

            assignments_shape = (iter_count,max_nsegs,max_npts)
            assignments_dtype = numpy.min_scalar_type(nbins)
            assignments_ds = self.output_file.create_dataset('assignments', dtype=assignments_dtype, shape=assignments_shape,
                                                             compression=4, shuffle=True,
                                                             chunks=h5io.calc_chunksize(assignments_shape, assignments_dtype),
                                                             fillvalue=nbins)
            if self.states:
                trajlabel_dtype = numpy.min_scalar_type(nstates)
                trajlabels_ds = self.output_file.create_dataset('trajlabels', dtype=trajlabel_dtype, shape=assignments_shape,
                                                                compression=4, shuffle=True,
                                                                chunks=h5io.calc_chunksize(assignments_shape, trajlabel_dtype),
                                                                fillvalue=nstates)

            pops_shape = (iter_count,nstates+1,nbins+1)
            pops_ds = self.output_file.create_dataset('labeled_populations', dtype=weight_dtype, shape=pops_shape,
                                                      compression=4, shuffle=True,
                                                      chunks=h5io.calc_chunksize(pops_shape, weight_dtype))
            h5io.label_axes(pops_ds, ['iteration', 'state', 'bin'])

            pi.new_operation('Assigning to bins', iter_stop-iter_start)
            last_labels = None # mapping of seg_id to last macrostate inhabited      
            for iiter, n_iter in enumerate(xrange(iter_start,iter_stop)):
                #get iteration info in this block

                if iiter == 0:
                    last_labels = numpy.empty((nsegs[iiter],), index_dtype)
                    last_labels[:] = nstates #unknown state

                #Slices this iteration into n_workers groups of segments, submits them to wm, splices results back together
                assignments, trajlabels, pops = self.assign_iteration(n_iter, nstates, nbins, state_map, last_labels)

                ##Do stuff with this iteration's results

                last_labels = trajlabels[:,-1].copy()
                assignments_ds[iiter, 0:nsegs[iiter], 0:npts[iiter]] = assignments
                pops_ds[iiter] = pops
                if self.states:
                    trajlabels_ds[iiter, 0:nsegs[iiter], 0:npts[iiter]]  = trajlabels

                pi.progress += 1
                del assignments, trajlabels, pops

            for dsname in 'assignments', 'npts', 'nsegs', 'labeled_populations':
                h5io.stamp_iter_range(self.output_file[dsname], iter_start, iter_stop)
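The assign_iteration method above splits each iteration's segments into contiguous blocks, one per worker, using ceiling division, and splices the per-block results back together. A minimal standalone sketch of that blocking scheme (not part of w_assign itself) for illustration:

def segment_blocks(nsegs, n_workers):
    """Yield (lb, ub) half-open index ranges covering range(nsegs),
    one roughly equal contiguous block per worker (ceiling division)."""
    blocksize = nsegs // n_workers
    if nsegs % n_workers > 0:
        blocksize += 1
    for lb in range(0, nsegs, blocksize):
        yield lb, min(nsegs, lb + blocksize)

# Example: 10 segments over 3 workers -> [(0, 4), (4, 8), (8, 10)]
print(list(segment_blocks(10, 3)))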
Example #39
0
class WNTopTool(WESTTool):
    prog = 'w_ntop'
    description = '''\
Select walkers from bins. An assignment file mapping walkers to
bins at each timepoint is required (see ``w_assign --help`` for further
information on generating this file). By default, high-weight walkers are
selected (hence the name ``w_ntop``: select the N top-weighted walkers from
each bin); however, minimum weight walkers and randomly-selected walkers
may be selected instead.


-----------------------------------------------------------------------------
Output format
-----------------------------------------------------------------------------

The output file (-o/--output, by default "ntop.h5") contains the following
datasets:

  ``/n_iter`` [iteration]
    *(Integer)* Iteration numbers for each entry in other datasets.

  ``/nsegs`` [iteration][bin]
    *(Integer)* Number of matching segments in each bin for the given
    iteration. This will generally be the same as the number requested with
    ``-n/--count``, but may be smaller if the requested number of walkers
    does not exist in that bin.

  ``/seg_ids`` [iteration][bin][segment]
    *(Integer)* Matching segments in each iteration for each bin.
    For each iteration and bin, only the first ``nsegs`` entries are
    valid. For example, the full list of matching seg_ids in bin 0 in the
    first stored iteration is ``seg_ids[0][0][:nsegs[0][0]]``.

  ``/weights`` [iteration][bin][segment]
    *(Floating-point)* Weights for each matching segment in ``/seg_ids``.


-----------------------------------------------------------------------------
Command-line arguments
-----------------------------------------------------------------------------
'''

    def __init__(self):
        super(WNTopTool, self).__init__()

        self.data_reader = WESTDataReader()
        self.iter_range = IterRangeSelection()
        self.progress = ProgressIndicatorComponent()
        self.output_file = None
        self.assignments_filename = None
        self.output_filename = None
        self.what = None
        self.timepoint = None
        self.count = None

    def add_args(self, parser):
        self.data_reader.add_args(parser)
        self.iter_range.add_args(parser)

        igroup = parser.add_argument_group('input options')
        igroup.add_argument(
            '-a',
            '--assignments',
            default='assign.h5',
            help=
            '''Use assignments from the given ASSIGNMENTS file (default: %(default)s).'''
        )

        sgroup = parser.add_argument_group('selection options')
        sgroup.add_argument(
            '-n',
            '--count',
            type=int,
            default=1,
            help=
            '''Select COUNT walkers from each iteration for each bin (default: %(default)s).'''
        )
        sgroup.add_argument(
            '-t',
            '--timepoint',
            type=int,
            default=-1,
            help=
            '''Base selection on the given TIMEPOINT within each iteration. Default (-1)
                            corresponds to the last timepoint.''')
        cgroup = parser.add_mutually_exclusive_group()
        cgroup.add_argument(
            '--highweight',
            dest='select_what',
            action='store_const',
            const='highweight',
            help='''Select COUNT highest-weight walkers from each bin.''')
        cgroup.add_argument(
            '--lowweight',
            dest='select_what',
            action='store_const',
            const='lowweight',
            help='''Select COUNT lowest-weight walkers from each bin.''')
        cgroup.add_argument(
            '--random',
            dest='select_what',
            action='store_const',
            const='random',
            help='''Select COUNT walkers randomly from each bin.''')
        parser.set_defaults(select_what='highweight')

        ogroup = parser.add_argument_group('output options')
        ogroup.add_argument(
            '-o',
            '--output',
            default='ntop.h5',
            help='''Write output to OUTPUT (default: %(default)s).''')
        self.progress.add_args(parser)

    def process_args(self, args):
        self.progress.process_args(args)
        self.data_reader.process_args(args)
        with self.data_reader:
            self.iter_range.process_args(args)
        self.what = args.select_what
        self.output_filename = args.output
        self.assignments_filename = args.assignments
        self.count = args.count
        self.timepoint = args.timepoint

    def go(self):
        self.data_reader.open('r')
        assignments_file = h5py.File(self.assignments_filename, mode='r')
        output_file = h5io.WESTPAH5File(self.output_filename, mode='w')
        pi = self.progress.indicator
        count = self.count
        timepoint = self.timepoint

        nbins = assignments_file.attrs['nbins'] + 1
        assignments_ds = assignments_file['assignments']

        iter_start, iter_stop = self.iter_range.iter_start, self.iter_range.iter_stop
        iter_count = iter_stop - iter_start
        h5io.check_iter_range_least(assignments_ds, iter_start, iter_stop)
        nsegs = assignments_file['nsegs'][h5io.get_iteration_slice(
            assignments_file['nsegs'], iter_start, iter_stop)]

        output_file.create_dataset('n_iter',
                                   dtype=n_iter_dtype,
                                   data=list(range(iter_start, iter_stop)))

        seg_count_ds = output_file.create_dataset('nsegs',
                                                  dtype=numpy.uint,
                                                  shape=(iter_count, nbins))
        matching_segs_ds = output_file.create_dataset(
            'seg_ids',
            shape=(iter_count, nbins, count),
            dtype=seg_id_dtype,
            chunks=h5io.calc_chunksize((iter_count, nbins, count),
                                       seg_id_dtype),
            shuffle=True,
            compression=9)
        weights_ds = output_file.create_dataset('weights',
                                                shape=(iter_count, nbins,
                                                       count),
                                                dtype=weight_dtype,
                                                chunks=h5io.calc_chunksize(
                                                    (iter_count, nbins, count),
                                                    weight_dtype),
                                                shuffle=True,
                                                compression=9)
        what = self.what

        with pi:
            pi.new_operation('Finding matching segments', extent=iter_count)
            for iiter, n_iter in enumerate(range(iter_start, iter_stop)):
                assignments = numpy.require(assignments_ds[
                    h5io.get_iteration_entry(assignments_ds, n_iter) +
                    numpy.index_exp[:, timepoint]],
                                            dtype=westpa.binning.index_dtype)
                all_weights = self.data_reader.get_iter_group(
                    n_iter)['seg_index']['weight']

                # the following Cython function just executes this loop:
                #for iseg in xrange(nsegs[iiter]):
                #    segs_by_bin[iseg,assignments[iseg]] = True
                segs_by_bin = assignments_list_to_table(
                    nsegs[iiter], nbins, assignments)
                for ibin in range(nbins):
                    segs = numpy.nonzero(segs_by_bin[:, ibin])[0]

                    seg_count_ds[iiter, ibin] = min(len(segs), count)

                    if len(segs):
                        weights = all_weights.take(segs)

                        if what == 'lowweight':
                            indices = numpy.argsort(weights)[:count]
                        elif what == 'highweight':
                            indices = numpy.argsort(weights)[::-1][:count]
                        else:
                            assert what == 'random'
                            indices = numpy.random.permutation(len(weights))[:count]

                        matching_segs_ds[iiter,
                                         ibin, :len(segs)] = segs.take(indices)
                        weights_ds[iiter,
                                   ibin, :len(segs)] = weights.take(indices)
                        del segs, weights

                del assignments, segs_by_bin, all_weights
                pi.progress += 1
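The output layout documented in the w_ntop docstring can be inspected directly with h5py. A hedged sketch of that post-processing follows; the filename 'ntop.h5' and the bin index are placeholders, and the /nsegs dataset is used to trim the padded segment axis:

import h5py

# Hypothetical post-processing of a w_ntop output file ('ntop.h5' is a placeholder).
with h5py.File('ntop.h5', 'r') as f:
    n_iter = f['n_iter'][:]     # iteration number for each stored entry
    nsegs = f['nsegs'][:]       # [iteration][bin]: number of valid entries
    seg_ids = f['seg_ids'][:]   # [iteration][bin][segment], padded out to COUNT
    weights = f['weights'][:]   # weights matching seg_ids

ibin = 0  # placeholder bin index
for iiter, it in enumerate(n_iter):
    nvalid = nsegs[iiter, ibin]
    print(it, seg_ids[iiter, ibin, :nvalid], weights[iiter, ibin, :nvalid])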
Example #40
0
class StateProbTool(WESTParallelTool):
    prog='w_stateprobs'
    description = '''\
Calculate average populations and associated errors in state populations from
weighted ensemble data. Bin assignments, including macrostate definitions,
are required. (See "w_assign --help" for more information).

-----------------------------------------------------------------------------
Output format
-----------------------------------------------------------------------------

The output file (-o/--output, usually "stateprobs.h5") contains the following
dataset:

  /avg_state_pops [state]
    (Structured -- see below) Population of each state across entire
    range specified.

If --evolution-mode is specified, then the following additional dataset is
available:

  /state_pop_evolution [window][state]
    (Structured -- see below). State populations based on windows of
    iterations of varying width.  If --evolution-mode=cumulative, then
    these windows all begin at the iteration specified with
    --start-iter and grow in length by --step-iter for each successive 
    element. If --evolution-mode=blocked, then these windows are all of
    width --step-iter (excluding the last, which may be shorter), the first
    of which begins at iteration --start-iter.
    
The structure of these datasets is as follows:

  iter_start
    (Integer) Iteration at which the averaging window begins (inclusive).
    
  iter_stop
    (Integer) Iteration at which the averaging window ends (exclusive).
    
  expected
    (Floating-point) Expected (mean) value of the state population as
    evaluated within this window.

  ci_lbound
    (Floating-point) Lower bound of the confidence interval on the state
    population within this window.

  ci_ubound
    (Floating-point) Upper bound of the confidence interval on the state
    population within this window.

  corr_len
    (Integer) Correlation length of the state population within this window,
    in units of tau.

Each of these datasets is also stamped with a number of attributes:

  mcbs_alpha
    (Floating-point) Alpha value of confidence intervals. (For example, 
    *alpha=0.05* corresponds to a 95% confidence interval.)

  mcbs_nsets
    (Integer) Number of bootstrap data sets used in generating confidence
    intervals.
    
  mcbs_acalpha
    (Floating-point) Alpha value for determining correlation lengths.
   

-----------------------------------------------------------------------------
Command-line options
-----------------------------------------------------------------------------
'''    
    
    def __init__(self):
        super(StateProbTool,self).__init__()
        
        self.data_reader = WESTDataReader()
        self.iter_range = IterRangeSelection()
        self.progress = ProgressIndicatorComponent()
        
        self.output_filename = None
        self.assignments_filename = None
        
        self.output_file = None
        self.assignments_file = None
        
        self.evolution_mode = None
        
        self.mcbs_alpha = None
        self.mcbs_acalpha = None
        self.mcbs_nsets = None
        
    def stamp_mcbs_info(self, dataset):
        dataset.attrs['mcbs_alpha'] = self.mcbs_alpha
        dataset.attrs['mcbs_acalpha'] = self.mcbs_acalpha
        dataset.attrs['mcbs_nsets'] = self.mcbs_nsets
        
            
    def add_args(self, parser):
        self.progress.add_args(parser)
        self.data_reader.add_args(parser)
        self.iter_range.include_args['iter_step'] = True
        self.iter_range.add_args(parser)

        iogroup = parser.add_argument_group('input/output options')
        iogroup.add_argument('-a', '--assignments', default='assign.h5',
                            help='''Bin assignments and macrostate definitions are in ASSIGNMENTS
                            (default: %(default)s).''')
        iogroup.add_argument('-o', '--output', dest='output', default='stateprobs.h5',
                            help='''Store results in OUTPUT (default: %(default)s).''')

        
        cgroup = parser.add_argument_group('confidence interval calculation options')
        cgroup.add_argument('--alpha', type=float, default=0.05, 
                             help='''Calculate a (1-ALPHA) confidence interval
                             (default: %(default)s)''')
        cgroup.add_argument('--autocorrel-alpha', type=float, dest='acalpha', metavar='ACALPHA',
                             help='''Evaluate autocorrelation to (1-ACALPHA) significance.
                             Note that too small an ACALPHA will result in failure to detect autocorrelation
                             in a noisy flux signal. (Default: same as ALPHA.)''')
        cgroup.add_argument('--nsets', type=int,
                             help='''Use NSETS samples for bootstrapping (default: chosen based on ALPHA)''')
        
        cogroup = parser.add_argument_group('calculation options')
        cogroup.add_argument('-e', '--evolution-mode', choices=['cumulative', 'blocked', 'none'], default='none',
                             help='''How to calculate time evolution of state population estimates.
                             ``cumulative`` evaluates populations over windows starting with --start-iter and getting progressively
                             wider to --stop-iter by steps of --step-iter.
                             ``blocked`` evaluates populations over windows of width --step-iter, the first of which begins at
                             --start-iter.
                             ``none`` (the default) disables calculation of the time evolution of state populations.''')
        
    def open_files(self):
        self.output_file = h5io.WESTPAH5File(self.output_filename, 'w', creating_program=True)
        h5io.stamp_creator_data(self.output_file)
        self.assignments_file = h5io.WESTPAH5File(self.assignments_filename, 'r')#, driver='core', backing_store=False)
        if not self.iter_range.check_data_iter_range_least(self.assignments_file):
            raise ValueError('assignments data do not span the requested iterations')

    
    def process_args(self, args):
        self.progress.process_args(args)
        self.data_reader.process_args(args)
        with self.data_reader:
            self.iter_range.process_args(args, default_iter_step=None)
        if self.iter_range.iter_step is None:
            #use about 10 blocks by default
            self.iter_range.iter_step = max(1, (self.iter_range.iter_stop - self.iter_range.iter_start) // 10)
        
        self.output_filename = args.output
        self.assignments_filename = args.assignments

        self.mcbs_alpha = args.alpha
        self.mcbs_acalpha = args.acalpha if args.acalpha else self.mcbs_alpha
        self.mcbs_nsets = args.nsets if args.nsets else mclib.get_bssize(self.mcbs_alpha)
        
        self.evolution_mode = args.evolution_mode
        
    def calc_state_pops(self):
        start_iter, stop_iter = self.iter_range.iter_start, self.iter_range.iter_stop
        nstates = self.nstates
        state_map = self.state_map
        iter_count = stop_iter-start_iter
        
        pi = self.progress.indicator
        pi.new_operation('Calculating state populations')
        pops = h5io.IterBlockedDataset(self.assignments_file['labeled_populations'])
        
        iter_state_pops = numpy.empty((nstates+1,), weight_dtype)
        all_state_pops = numpy.empty((iter_count,nstates+1), weight_dtype)
        avg_state_pops = numpy.zeros((nstates+1,), weight_dtype)
        pops.cache_data(max_size='available')
        try:
            for iiter,n_iter in enumerate(xrange(start_iter,stop_iter)):
                iter_state_pops.fill(0)
                labeled_pops = pops.iter_entry(n_iter)
                accumulate_state_populations_from_labeled(labeled_pops, state_map, iter_state_pops, check_state_map=False)
                all_state_pops[iiter] = iter_state_pops
                avg_state_pops += iter_state_pops
                del labeled_pops
                pi.progress += 1
        finally:
            pops.drop_cache()
        self.output_file.create_dataset('state_pops', data=all_state_pops, compression=9, shuffle=True)
        h5io.stamp_iter_range(self.output_file['state_pops'], start_iter, stop_iter)
        
        self.all_state_pops = all_state_pops
        avg_state_pops = numpy.zeros((nstates+1,), ci_dtype)
        pi.new_operation('Calculating overall average populations and CIs', nstates)
#        futures = []
#         for istate in xrange(nstates):
#             futures.append(self.work_manager.submit(_eval_block,kwargs=dict(iblock=None,istate=istate,
#                                                                             start=start_iter,stop=stop_iter,
#                                                                             state_pops=all_state_pops[:,istate],
#                                                                             mcbs_alpha=self.mcbs_alpha, mcbs_nsets=self.mcbs_nsets,
#                                                                             mcbs_acalpha = self.mcbs_acalpha)))
#         for future in self.work_manager.as_completed(futures):
        def taskgen():
            for istate in xrange(nstates):
                yield (_eval_block, (), dict(iblock=None,istate=istate,
                                             start=start_iter,stop=stop_iter,
                                             state_pops=all_state_pops[:,istate],
                                             mcbs_alpha=self.mcbs_alpha, mcbs_nsets=self.mcbs_nsets,
                                             mcbs_acalpha = self.mcbs_acalpha))
        for future in self.work_manager.submit_as_completed(taskgen(), self.max_queue_len):
            (_iblock,istate,ci_res) = future.get_result(discard=True)
            avg_state_pops[istate] = ci_res
            pi.progress += 1
        self.output_file['avg_state_pops'] = avg_state_pops
        self.stamp_mcbs_info(self.output_file['avg_state_pops'])
        pi.clear()
        
        maxlabellen = max(map(len,self.state_labels))
        print('average state populations:')
        for istate in xrange(nstates):
            print('{:{maxlabellen}s}: mean={:21.15e} CI=({:21.15e}, {:21.15e})'
                  .format(self.state_labels[istate],
                          avg_state_pops['expected'][istate],
                          avg_state_pops['ci_lbound'][istate],
                          avg_state_pops['ci_ubound'][istate],
                          maxlabellen=maxlabellen))
        
    def calc_evolution(self):
        nstates = self.nstates
        start_iter, stop_iter, step_iter = self.iter_range.iter_start, self.iter_range.iter_stop, self.iter_range.iter_step
        start_pts = range(start_iter, stop_iter, step_iter)

        pop_evol = numpy.zeros((len(start_pts), nstates), dtype=ci_dtype)

        pi = self.progress.indicator
        pi.new_operation('Calculating population evolution', len(start_pts)*nstates)
#         futures = []
#         for iblock, start in enumerate(start_pts):
#             if self.evolution_mode == 'cumulative':
#                 block_start = start_iter
#             else: # self.evolution_mode == 'blocked'
#                 block_start = start
#             stop = min(start+step_iter, stop_iter)
# 
#             for istate in xrange(nstates):
#                 future = self.work_manager.submit(_eval_block,kwargs=dict(iblock=iblock,istate=istate,
#                                                                           start=block_start,stop=stop,
#                                                                           state_pops=self.all_state_pops[block_start-start_iter:stop-start_iter,istate],
#                                                                           mcbs_alpha=self.mcbs_alpha, mcbs_nsets=self.mcbs_nsets,
#                                                                           mcbs_acalpha = self.mcbs_acalpha))
#                 futures.append(future)
        def taskgen():
            for iblock, start in enumerate(start_pts):
                if self.evolution_mode == 'cumulative':
                    block_start = start_iter
                else: # self.evolution_mode == 'blocked'
                    block_start = start
                stop = min(start+step_iter, stop_iter)
     
                for istate in xrange(nstates):
                    yield (_eval_block,(),dict(iblock=iblock,istate=istate,
                                               start=block_start,stop=stop,
                                               state_pops=self.all_state_pops[block_start-start_iter:stop-start_iter,istate],
                                               mcbs_alpha=self.mcbs_alpha, mcbs_nsets=self.mcbs_nsets,
                                               mcbs_acalpha = self.mcbs_acalpha))
        #for future in self.work_manager.as_completed(futures):
        for future in self.work_manager.submit_as_completed(taskgen(), self.max_queue_len):
            (iblock,istate,ci_res) = future.get_result(discard=True)
            pop_evol[iblock,istate] =  ci_res
            pi.progress += 1

        self.output_file.create_dataset('state_pop_evolution', data=pop_evol, shuffle=True, compression=9)
        pi.clear()

    def go(self):
        pi = self.progress.indicator
        with pi:
            pi.new_operation('Initializing')
            self.open_files()
            nstates = self.nstates = self.assignments_file.attrs['nstates']

            state_labels = self.state_labels = self.assignments_file['state_labels'][...]
            state_map = self.state_map = self.assignments_file['state_map'][...]
            if (state_map > nstates).any():
                raise ValueError('invalid state mapping')

            # copy metadata to output
            self.output_file.attrs['nstates'] = nstates
            self.output_file['state_labels'] = state_labels

            # calculate overall averages
            self.calc_state_pops()

            # calculate evolution, if requested
            if self.evolution_mode != 'none' and self.iter_range.iter_step:
                self.calc_evolution()
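The ``cumulative`` and ``blocked`` evolution modes handled in calc_evolution differ only in how the averaging windows are built. A small self-contained sketch (independent of the tool itself) that reproduces the (start, stop) window pairs for both modes:

def evolution_windows(start_iter, stop_iter, step_iter, mode):
    """Yield (block_start, stop) iteration windows.
    cumulative: windows all start at start_iter and grow by step_iter.
    blocked: disjoint windows of width step_iter (last may be shorter)."""
    for start in range(start_iter, stop_iter, step_iter):
        block_start = start_iter if mode == 'cumulative' else start
        yield block_start, min(start + step_iter, stop_iter)

print(list(evolution_windows(1, 11, 3, 'cumulative')))  # (1,4), (1,7), (1,10), (1,11)
print(list(evolution_windows(1, 11, 3, 'blocked')))     # (1,4), (4,7), (7,10), (10,11)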
Example #41
0
class WAssign(WESTParallelTool):
    prog = 'w_assign'
    description = '''\
Assign walkers to bins, producing a file (by default named "assign.h5")
which can be used in subsequent analysis.

For consistency in subsequent analysis operations, the entire dataset
must be assigned, even if only a subset of the data will be used. This
ensures that analyses that rely on tracing trajectories always know the
originating bin of each trajectory.


-----------------------------------------------------------------------------
Source data
-----------------------------------------------------------------------------

Source data is provided either by a user-specified function
(--construct-dataset) or a list of "data set specifications" (--dsspecs).
If neither is provided, the progress coordinate dataset ``pcoord`` is used.

To use a custom function to extract or calculate data whose probability
distribution will be calculated, specify the function in standard Python
MODULE.FUNCTION syntax as the argument to --construct-dataset. This function
will be called as function(n_iter,iter_group), where n_iter is the iteration
whose data are being considered and iter_group is the corresponding group
in the main WEST HDF5 file (west.h5). The function must return data which can
be indexed as [segment][timepoint][dimension].

To use a list of data set specifications, specify --dsspecs and then list the
desired datasets one-by-one (space-separated in most shells). These data set
specifications are formatted as NAME[,file=FILENAME,slice=SLICE], which will
use the dataset called NAME in the HDF5 file FILENAME (defaulting to the main
WEST HDF5 file west.h5), and slice it with the Python slice expression SLICE
(as in [0:2] to select the first two elements of the first axis of the
dataset). The ``slice`` option is most useful for selecting one column (or
more) from a multi-column dataset, such as arises when using a progress
coordinate of multiple dimensions.


-----------------------------------------------------------------------------
Specifying macrostates
-----------------------------------------------------------------------------

Optionally, kinetic macrostates may be defined in terms of sets of bins.
Each trajectory will be labeled with the kinetic macrostate it was most
recently in at each timepoint, for use in subsequent kinetic analysis.
This is required for all kinetics analysis (w_kintrace and w_kinmat).

There are three ways to specify macrostates:
  
  1. States corresponding to single bins may be identified on the command
     line using the --states option, which takes multiple arguments, one for
     each state (separated by spaces in most shells). Each state is specified
     as a coordinate tuple, with an optional label prepended, as in
     ``bound:1.0`` or ``unbound:(2.5,2.5)``. Unlabeled states are named
     ``stateN``, where N is the (zero-based) position in the list of states
     supplied to --states.
     
  2. States corresponding to multiple bins may use a YAML input file specified
     with --states-from-file. This file defines a list of states, each with a
     name and a list of coordinate tuples; bins containing these coordinates
     will be mapped to the containing state. For instance, the following
     file::

        ---
        states:
          - label: unbound
            coords:
              - [9.0, 1.0]
              - [9.0, 2.0]
          - label: bound
            coords:
              - [0.1, 0.0]

     produces two macrostates: the first state is called "unbound" and
     consists of bins containing the (2-dimensional) progress coordinate
     values (9.0, 1.0) and (9.0, 2.0); the second state is called "bound"
     and consists of the single bin containing the point (0.1, 0.0).
     
  3. Arbitrary state definitions may be supplied by a user-defined function,
     specified as --states-from-function=MODULE.FUNCTION. This function is
     called with the bin mapper as an argument (``function(mapper)``) and must
     return a list of dictionaries, one per state. Each dictionary must contain
     a vector of coordinate tuples with key "coords"; the bins into which each
     of these tuples falls define the state. An optional name for the state
     (with key "label") may also be provided.


-----------------------------------------------------------------------------
Output format
-----------------------------------------------------------------------------

The output file (-o/--output, by default "assign.h5") contains the following
attributes and datasets:

  ``nbins`` attribute
    *(Integer)* Number of valid bins. Bin assignments range from 0 to
    *nbins*-1, inclusive.

  ``nstates`` attribute
    *(Integer)* Number of valid macrostates (may be zero if no such states are
    specified). Trajectory ensemble assignments range from 0 to *nstates*-1,
    inclusive, when states are defined.

  ``/assignments`` [iteration][segment][timepoint]
    *(Integer)* Per-segment and -timepoint assignments (bin indices).

  ``/npts`` [iteration]
    *(Integer)* Number of timepoints in each iteration.

  ``/nsegs`` [iteration]
    *(Integer)* Number of segments in each iteration.

  ``/labeled_populations`` [iteration][state][bin]
    *(Floating-point)* Per-iteration and -timepoint bin populations, labeled
    by most recently visited macrostate. The last state entry (index *nstates*)
    corresponds to trajectories initiated outside of a defined macrostate.

  ``/bin_labels`` [bin]
    *(String)* Text labels of bins.

When macrostate assignments are given, the following additional datasets are
present:

  ``/trajlabels`` [iteration][segment][timepoint]
    *(Integer)* Per-segment and -timepoint trajectory labels, indicating the
    macrostate which each trajectory last visited.

  ``/state_labels`` [state]
    *(String)* Labels of states.

  ``/state_map`` [bin]
    *(Integer)* Mapping of bin index to the macrostate containing that bin.
    An entry will contain *nstates* if that bin does not fall into a
    macrostate.
    
Datasets indexed by state and bin contain one more entry than the number of
valid states or bins. For *N* bins, axes indexed by bin are of size *N+1*, and
entry *N* (0-based indexing) corresponds to a walker outside of the defined bin
space (which will cause most mappers to raise an error). More importantly, for
*M* states (including the case *M=0* where no states are specified), axes
indexed by state are of size *M+1* and entry *M* refers to trajectories
initiated in a region not corresponding to a defined macrostate.

Thus, ``labeled_populations[:,:,:].sum(axis=1)[:,:-1]`` gives overall per-bin
populations for all defined bins, and
``labeled_populations[:,:,:].sum(axis=2)[:,:-1]`` gives overall
per-trajectory-ensemble populations for all defined states.

    
-----------------------------------------------------------------------------
Parallelization
-----------------------------------------------------------------------------

This tool supports parallelized binning, including reading/calculating input
data.


-----------------------------------------------------------------------------
Command-line options
-----------------------------------------------------------------------------
'''

    def __init__(self):
        super(WAssign, self).__init__()

        # Parallel processing by default (this is not actually necessary, but it is
        # informative!)
        self.wm_env.default_work_manager = self.wm_env.default_parallel_work_manager

        self.data_reader = WESTDataReader()
        self.dssynth = WESTDSSynthesizer(default_dsname='pcoord')
        self.binning = BinMappingComponent()
        self.progress = ProgressIndicatorComponent()
        self.output_file = None
        self.output_filename = None
        self.states = []
        self.subsample = False

    def add_args(self, parser):
        self.data_reader.add_args(parser)
        self.binning.add_args(parser)
        self.dssynth.add_args(parser)

        sgroup = parser.add_argument_group(
            'macrostate definitions').add_mutually_exclusive_group()
        sgroup.add_argument(
            '--states',
            nargs='+',
            metavar='STATEDEF',
            help=
            '''Single-bin kinetic macrostate, specified by a coordinate tuple (e.g. '1.0' or '[1.0,1.0]'),
                            optionally labeled (e.g. 'bound:[1.0,1.0]'). States corresponding to multiple bins
                            must be specified with --states-from-file.''')
        sgroup.add_argument(
            '--states-from-file',
            metavar='STATEFILE',
            help=
            '''Load kinetic macrostates from the YAML file STATEFILE. See description
                            above for the appropriate structure.''')
        sgroup.add_argument(
            '--states-from-function',
            metavar='STATEFUNC',
            help=
            '''Load kinetic macrostates from the function STATEFUNC, specified as
                            module_name.func_name. This function is called with the bin mapper as an argument,
                            and must return a list of dictionaries {'label': state_label, 'coords': 2d_array_like}
                            one for each macrostate; the 'coords' entry must contain enough rows to identify all bins
                            in the macrostate.''')

        agroup = parser.add_argument_group('other options')
        agroup.add_argument(
            '-o',
            '--output',
            dest='output',
            default='assign.h5',
            help='''Store results in OUTPUT (default: %(default)s).''')
        agroup.add_argument(
            '--subsample',
            dest='subsample',
            action='store_const',
            const=True,
            help='''Whether to subsample the data. This is particularly
                             useful for analyzing steady-state simulations.'''
        )
        agroup.add_argument(
            '--config-from-file',
            dest='config_from_file',
            action='store_true',
            help=
            '''Load bins/macrostates from a scheme specified in west.cfg.''')
        agroup.add_argument('--scheme-name',
                            dest='scheme',
                            help='''Name of scheme specified in west.cfg.''')

    def process_args(self, args):
        self.progress.process_args(args)
        self.data_reader.process_args(args)
        # Necessary to open the file to get the current iteration
        # if we want to use the mapper in the file
        self.data_reader.open(mode='r+')
        self.n_iter = self.data_reader.current_iteration
        # If we decide to use this option for iteration selection:
        # getattr(args,'bins_from_h5file',None) or self.data_reader.current_iteration

        with self.data_reader:
            self.dssynth.h5filename = self.data_reader.we_h5filename
            self.dssynth.process_args(args)
            if not args.config_from_file:
                self.binning.set_we_h5file_info(self.n_iter, self.data_reader)
                self.binning.process_args(args)

        self.output_filename = args.output

        if args.config_from_file:
            if not args.scheme:
                raise ValueError('A scheme must be specified.')
            else:
                self.load_config_from_west(args.scheme)
        elif args.states:
            self.parse_cmdline_states(args.states)
        elif args.states_from_file:
            self.load_state_file(args.states_from_file)
        elif args.states_from_function:
            self.load_states_from_function(
                get_object(args.states_from_function, path=['.']))

        if self.states and len(self.states) < 2:
            raise ValueError('zero, two, or more macrostates are required')

        #self.output_file = WESTPAH5File(args.output, 'w', creating_program=True)
        log.debug('state list: {!r}'.format(self.states))

        self.subsample = args.subsample if args.subsample is not None else False

    def parse_cmdline_states(self, state_strings):
        states = []
        for istring, state_string in enumerate(state_strings):
            try:
                (label, coord_str) = state_string.split(':')
            except ValueError:
                label = 'state{}'.format(istring)
                coord_str = state_string
            coord = parse_pcoord_value(coord_str)
            states.append({'label': label, 'coords': coord})
        self.states = states

    def load_config_from_west(self, scheme):
        try:
            config = westpa.rc.config['west']['analysis']
        except (KeyError, TypeError):
            raise ValueError('There is no configuration file specified.')
        ystates = config['analysis_schemes'][scheme]['states']
        self.states_from_dict(ystates)
        try:
            self.subsample = config['subsample']
        except KeyError:
            pass
        from westpa._rc import bins_from_yaml_dict
        self.binning.mapper = bins_from_yaml_dict(
            config['analysis_schemes'][scheme]['bins'][0])
        import os
        path = os.path.join(os.getcwd(), config['directory'], scheme)
        try:
            os.mkdir(config['directory'])
            os.mkdir(path)
        except OSError:
            pass

        self.output_filename = os.path.join(path, 'assign.h5')

    def load_state_file(self, state_filename):
        import yaml
        ydict = yaml.safe_load(open(state_filename, 'rt'))
        ystates = ydict['states']
        self.states_from_dict(ystates)

    def states_from_dict(self, ystates):
        states = []
        for istate, ystate in enumerate(ystates):
            state = {}
            state['label'] = ystate.get('label', 'state{}'.format(istate))
            # coords can be:
            #  - a scalar, in which case it is one bin, 1-D
            #  - a single list, which is rejected as ambiguous
            #  - a list of lists, which is a list of coordinate tuples
            coords = numpy.array(ystate['coords'])
            if coords.ndim == 0:
                coords.shape = (1, 1)
            elif coords.ndim == 1:
                raise ValueError(
                    'list {!r} is ambiguous (list of 1-d coordinates, or single multi-d coordinate?)'
                    .format(ystate['coords']))
            elif coords.ndim > 2:
                raise ValueError('coordinates must be 2-D')
            state['coords'] = coords
            states.append(state)
        self.states = states

    def load_states_from_function(self, statefunc):
        states = statefunc(self.binning.mapper)
        for istate, state in enumerate(states):
            state.setdefault('label', 'state{}'.format(istate))
            try:
                state['coords'] = numpy.array(state['coords'])
            except KeyError:
                raise ValueError(
                    'state function {!r} returned a state {!r} without coordinates'
                    .format(statefunc, state))
        self.states = states
        log.debug('loaded states: {!r}'.format(self.states))

    def assign_iteration(self, n_iter, nstates, nbins, state_map, last_labels):
        ''' Method to encapsulate the segment slicing (into n_worker slices) and parallel job submission
            Submits job(s), waits on completion, splices them back together
            Returns: assignments, trajlabels, pops for this iteration'''

        futures = []

        iter_group = self.data_reader.get_iter_group(n_iter)
        nsegs, npts = iter_group['pcoord'].shape[:2]
        n_workers = self.work_manager.n_workers or 1
        assignments = numpy.empty((nsegs, npts), dtype=index_dtype)
        trajlabels = numpy.empty((nsegs, npts), dtype=index_dtype)
        statelabels = numpy.empty((nsegs, npts), dtype=index_dtype)
        pops = numpy.zeros((nstates + 1, nbins + 1), dtype=weight_dtype)

        #Submit jobs to work manager
        blocksize = nsegs // n_workers
        if nsegs % n_workers > 0:
            blocksize += 1

        def task_gen():
            if __debug__:
                checkset = set()
            for lb in range(0, nsegs, blocksize):
                ub = min(nsegs, lb + blocksize)
                if __debug__:
                    checkset.update(set(range(lb, ub)))
                args = ()
                kwargs = dict(
                    n_iter=n_iter,
                    lb=lb,
                    ub=ub,
                    mapper=self.binning.mapper,
                    nstates=nstates,
                    state_map=state_map,
                    last_labels=last_labels,
                    parent_id_dsspec=self.data_reader.parent_id_dsspec,
                    weight_dsspec=self.data_reader.weight_dsspec,
                    pcoord_dsspec=self.dssynth.dsspec,
                    subsample=self.subsample)
                yield (_assign_label_pop, args, kwargs)

                #futures.append(self.work_manager.submit(_assign_label_pop,
                #kwargs=)
            if __debug__:
                assert checkset == set(
                    range(nsegs)), 'segments missing: {}'.format(
                        set(range(nsegs)) - checkset)

        #for future in self.work_manager.as_completed(futures):
        for future in self.work_manager.submit_as_completed(
                task_gen(), queue_size=self.max_queue_len):
            assign_slice, traj_slice, slice_pops, lb, ub, state_slice = future.get_result(
                discard=True)
            assignments[lb:ub, :] = assign_slice
            trajlabels[lb:ub, :] = traj_slice
            statelabels[lb:ub, :] = state_slice
            pops += slice_pops
            del assign_slice, traj_slice, slice_pops, state_slice

        del futures
        return (assignments, trajlabels, pops, statelabels)

    def go(self):
        assert self.data_reader.parent_id_dsspec._h5file is None
        assert self.data_reader.weight_dsspec._h5file is None
        if hasattr(self.dssynth.dsspec, '_h5file'):
            assert self.dssynth.dsspec._h5file is None
        pi = self.progress.indicator
        pi.operation = 'Initializing'
        with pi, self.data_reader, WESTPAH5File(
                self.output_filename, 'w',
                creating_program=True) as self.output_file:
            assign = self.binning.mapper.assign

            # We always assign the entire simulation, so that no trajectory appears to start
            # in a transition region that doesn't get initialized in one.
            iter_start = 1
            iter_stop = self.data_reader.current_iteration

            h5io.stamp_iter_range(self.output_file, iter_start, iter_stop)

            nbins = self.binning.mapper.nbins
            self.output_file.attrs['nbins'] = nbins

            state_map = numpy.empty((self.binning.mapper.nbins + 1, ),
                                    index_dtype)
            state_map[:] = 0  # default when no macrostates are defined; overwritten below if states are given

            # Recursive mappers produce a generator rather than a list of labels
            # so consume the entire generator into a list
            labels = [
                numpy.string_(label) for label in self.binning.mapper.labels
            ]

            self.output_file.create_dataset('bin_labels',
                                            data=labels,
                                            compression=9)

            if self.states:
                nstates = len(self.states)
                state_map[:] = nstates  # state_id == nstates => unknown state
                state_labels = [
                    numpy.string_(state['label']) for state in self.states
                ]

                for istate, sdict in enumerate(self.states):
                    assert state_labels[istate] == numpy.string_(
                        sdict['label'])  #sanity check
                    state_assignments = assign(sdict['coords'])
                    for assignment in state_assignments:
                        state_map[assignment] = istate
                self.output_file.create_dataset('state_map',
                                                data=state_map,
                                                compression=9,
                                                shuffle=True)
                self.output_file[
                    'state_labels'] = state_labels  #+ ['(unknown)']
            else:
                nstates = 0
            self.output_file.attrs['nstates'] = nstates
            # Stamp if this has been subsampled.
            self.output_file.attrs['subsampled'] = self.subsample

            iter_count = iter_stop - iter_start
            nsegs = numpy.empty((iter_count, ), seg_id_dtype)
            npts = numpy.empty((iter_count, ), seg_id_dtype)

            # scan for largest number of segments and largest number of points
            pi.new_operation('Scanning for segment and point counts',
                             iter_stop - iter_start)
            for iiter, n_iter in enumerate(range(iter_start, iter_stop)):
                iter_group = self.data_reader.get_iter_group(n_iter)
                nsegs[iiter], npts[iiter] = iter_group['pcoord'].shape[0:2]
                pi.progress += 1
                del iter_group

            pi.new_operation('Preparing output')

            # create datasets
            self.output_file.create_dataset('nsegs',
                                            data=nsegs,
                                            shuffle=True,
                                            compression=9)
            self.output_file.create_dataset('npts',
                                            data=npts,
                                            shuffle=True,
                                            compression=9)

            max_nsegs = nsegs.max()
            max_npts = npts.max()

            assignments_shape = (iter_count, max_nsegs, max_npts)
            assignments_dtype = numpy.min_scalar_type(nbins)
            assignments_ds = self.output_file.create_dataset(
                'assignments',
                dtype=assignments_dtype,
                shape=assignments_shape,
                compression=4,
                shuffle=True,
                chunks=h5io.calc_chunksize(assignments_shape,
                                           assignments_dtype),
                fillvalue=nbins)
            if self.states:
                trajlabel_dtype = numpy.min_scalar_type(nstates)
                trajlabels_ds = self.output_file.create_dataset(
                    'trajlabels',
                    dtype=trajlabel_dtype,
                    shape=assignments_shape,
                    compression=4,
                    shuffle=True,
                    chunks=h5io.calc_chunksize(assignments_shape,
                                               trajlabel_dtype),
                    fillvalue=nstates)
                statelabels_ds = self.output_file.create_dataset(
                    'statelabels',
                    dtype=trajlabel_dtype,
                    shape=assignments_shape,
                    compression=4,
                    shuffle=True,
                    chunks=h5io.calc_chunksize(assignments_shape,
                                               trajlabel_dtype),
                    fillvalue=nstates)

            pops_shape = (iter_count, nstates + 1, nbins + 1)
            pops_ds = self.output_file.create_dataset(
                'labeled_populations',
                dtype=weight_dtype,
                shape=pops_shape,
                compression=4,
                shuffle=True,
                chunks=h5io.calc_chunksize(pops_shape, weight_dtype))
            h5io.label_axes(
                pops_ds,
                [numpy.string_(i) for i in ['iteration', 'state', 'bin']])

            pi.new_operation('Assigning to bins', iter_stop - iter_start)
            last_labels = None  # mapping of seg_id to last macrostate inhabited
            for iiter, n_iter in enumerate(range(iter_start, iter_stop)):
                #get iteration info in this block

                if iiter == 0:
                    last_labels = numpy.empty((nsegs[iiter], ), index_dtype)
                    last_labels[:] = nstates  #unknown state

                #Slices this iteration into n_workers groups of segments, submits them to wm, splices results back together
                assignments, trajlabels, pops, statelabels = self.assign_iteration(
                    n_iter, nstates, nbins, state_map, last_labels)

                ##Do stuff with this iteration's results

                last_labels = trajlabels[:, -1].copy()
                assignments_ds[iiter, 0:nsegs[iiter],
                               0:npts[iiter]] = assignments
                pops_ds[iiter] = pops
                if self.states:
                    trajlabels_ds[iiter, 0:nsegs[iiter],
                                  0:npts[iiter]] = trajlabels
                    statelabels_ds[iiter, 0:nsegs[iiter],
                                   0:npts[iiter]] = statelabels

                pi.progress += 1
                del assignments, trajlabels, pops, statelabels

            # 'statelabels' is only created when states were specified above,
            # so only stamp it in that case.
            stamped_dsnames = ['assignments', 'npts', 'nsegs', 'labeled_populations']
            if self.states:
                stamped_dsnames.append('statelabels')
            for dsname in stamped_dsnames:
                h5io.stamp_iter_range(self.output_file[dsname], iter_start,
                                      iter_stop)
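
The fragment above fixes the layout of the assignment output file: per-iteration 'nsegs' and 'npts' counts, an 'assignments' dataset of shape (iteration, segment, timepoint) padded with the fill value nbins, and a 'labeled_populations' dataset of shape (iteration, state, bin). A minimal sketch of reading that layout back with h5py follows; the file name 'assign.h5' is an assumption, not something this fragment specifies.

import h5py
import numpy

# Read back the datasets written above. The file name is assumed; the dataset
# names and shapes follow the code in this example.
with h5py.File('assign.h5', 'r') as f:
    nsegs = f['nsegs'][:]               # segments per stored iteration
    assignments = f['assignments']      # (iteration, segment, timepoint) bin indices
    pops = f['labeled_populations'][:]  # (iteration, state, bin) summed weights

    # Padding entries carry the fill value (== nbins), so mask them out.
    nbins = assignments.fillvalue
    first = assignments[0, :nsegs[0], :]
    print('bins visited in the first stored iteration:',
          numpy.unique(first[first < nbins]))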
Example #42
0
class WCrawl(WESTParallelTool):
    prog='w_crawl'
    description = '''\
Crawl a weighted ensemble dataset, executing a function for each iteration.
This can be used for postprocessing of trajectories, cleanup of datasets,
or anything else that can be expressed as "do X for iteration N, then do
something with the result". Tasks are parallelized by iteration, and 
no guarantees are made about evaluation order.


-----------------------------------------------------------------------------
Command-line options
-----------------------------------------------------------------------------
    
'''

    def __init__(self):
        super(WCrawl,self).__init__()

        # These are used throughout
        self.progress = ProgressIndicatorComponent()
        self.data_reader = WESTDataReader()
        self.iter_range = IterRangeSelection(self.data_reader)

        self.crawler = None
        self.task_callable = None

    def add_args(self, parser):
        self.data_reader.add_args(parser)
        self.iter_range.add_args(parser)

        tgroup = parser.add_argument_group('task options')
        tgroup.add_argument('-c', '--crawler-instance',
                            help='''Use CRAWLER_INSTANCE (specified as module.instance) as an instance of
                            WESTPACrawler to coordinate the calculation. Required only if initialization,
                            finalization, or task result processing is required.''')
        tgroup.add_argument('task_callable',
                            help='''Run TASK_CALLABLE (specified as module.function) on each iteration.
                            Required.''')
        self.progress.add_args(parser)

    def process_args(self, args):
        self.progress.process_args(args)
        self.data_reader.process_args(args)
        with self.data_reader:
            self.iter_range.process_args(args)

        self.task_callable = get_object(args.task_callable, path=['.'])
        if args.crawler_instance is not None:
            self.crawler = get_object(args.crawler_instance, path=['.'])
        else:
            self.crawler = WESTPACrawler()

    def go(self):
        iter_start = self.iter_range.iter_start
        iter_stop = self.iter_range.iter_stop
        iter_count = iter_stop - iter_start
        self.data_reader.open('r')
        pi = self.progress.indicator
        with pi:
            pi.operation = 'Initializing'
            self.crawler.initialize(iter_start, iter_stop)

            try:
                pi.new_operation('Dispatching tasks & processing results', iter_count)
                task_gen = ((_remote_task, (n_iter, self.task_callable), {}) for n_iter in range(iter_start, iter_stop))
                for future in self.work_manager.submit_as_completed(task_gen, self.max_queue_len):
                    n_iter, result = future.get_result(discard=True)
                    if self.crawler is not None:
                        self.crawler.process_iter_result(n_iter,result)
                    pi.progress += 1
            finally:
                pi.new_operation('Finalizing')
                self.crawler.finalize()
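
w_crawl itself only dispatches work; the task callable and, optionally, a WESTPACrawler subclass supply the analysis. A hypothetical task module is sketched below. All names are illustrative, the import path of WESTPACrawler depends on the WESTPA installation, and the exact arguments handed to the task callable are prepared by _remote_task (not shown here), so the (n_iter, iter_group) signature is an assumption.

# my_crawl_tasks.py -- hypothetical module for use with w_crawl
from westpa.cli.tools.w_crawl import WESTPACrawler  # import path is an assumption

def count_segments(n_iter, iter_group):
    # Example per-iteration task: report how many segments the iteration holds.
    # The (n_iter, iter_group) signature is an assumption about _remote_task.
    return iter_group['pcoord'].shape[0]

class SegmentCountCrawler(WESTPACrawler):
    '''Collects results as they arrive; w_crawl makes no ordering guarantees.'''

    def initialize(self, iter_start, iter_stop):
        self.counts = {}

    def process_iter_result(self, n_iter, result):
        self.counts[n_iter] = result

    def finalize(self):
        for n_iter in sorted(self.counts):
            print(n_iter, self.counts[n_iter])

# The -c/--crawler-instance option expects module.instance, so export an instance:
crawler_instance = SegmentCountCrawler()

Such a module would be invoked as ``w_crawl my_crawl_tasks.count_segments -c my_crawl_tasks.crawler_instance``, matching the module.function and module.instance forms described in the argument help above.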
Example #43
0
class WDumpSegs(WESTTool):
    prog='w_dumpsegs'
    description = '''\
Dump segment data as text. This is very inefficient, so this tool should be used
as a last resort (use hdfview/h5ls to look at data, and access HDF5 directly for
significant analysis tasks). 
'''
    
    def __init__(self):
        super(WDumpSegs,self).__init__()
        self.data_reader = WESTDataReader()
        self.n_iter = None
        self.output_file = None
        self.print_pcoords = False
    
    def add_args(self, parser):
        self.data_reader.add_args(parser)


        parser.add_argument('-p', '--print-pcoords', dest='print_pcoords', action='store_true',
                            help='print initial and final progress coordinates for each segment')
        parser.add_argument('-i', '--iteration', dest='n_iter', type=int,
                            help='Use data from iteration N_ITER (default: last complete iteration)')
        parser.add_argument('-o', '--output', dest='output_file',
                            help='Store output in OUTPUT_FILE (default: write to standard output).')
        

    def process_args(self, args):
        self.data_reader.process_args(args)
        self.data_reader.open()
        self.n_iter = args.n_iter or self.data_reader.current_iteration-1
        self.output_file = open(args.output_file, 'wt') if args.output_file else sys.stdout
        self.print_pcoords = args.print_pcoords
    
    def go(self):
        segments = self.data_reader.get_segments(self.n_iter)
        
        max_seg_id_len = len(str(max(segment.seg_id for segment in segments)))
        max_status_name_len = max(map(len, Segment.status_names.values()))
        max_endpoint_type_len = max(map(len, Segment.endpoint_type_names.values()))
        max_n_parents_len = len(str(max(len(segment.wtg_parent_ids) for segment in segments)))
        
        report_line = ( '{segment.n_iter:d}  {segment.seg_id:{max_seg_id_len}d}  {segment.weight:20.14g}' 
                        +'  {status_name:{max_status_name_len}s} ({segment.status})'
                        +'  {segment.walltime:<12.6g} {segment.cputime:<12.6g}'
                        +'  {endpoint_type_name:{max_endpoint_type_len}s} ({segment.endpoint_type})'
                        +'  {n_parents:{max_n_parents_len}d} {segment.parent_id:{max_seg_id_len}d} {parents_str}' 
                        +'\n')
        pcoord_lines = ('  pcoord[0]  = {init_pcoord}\n  pcoord[-1] = {final_pcoord}'
                        +'\n')
        for (_seg_id, segment) in enumerate(segments):    
            parents_str = '['+', '.join(map(str,sorted(segment.wtg_parent_ids)))+']'
            init_pcoord_str = '[' + ', '.join('{pcval:<12.6g}'.format(pcval=float(pce)) for pce in segment.pcoord[0]) + ']'
            final_pcoord_str = '[' + ', '.join('{pcval:<12.6g}'.format(pcval=float(pce)) for pce in segment.pcoord[-1]) + ']'
            self.output_file.write(report_line.format(segment=segment, 
                                                 status_name = segment.status_names[segment.status],
                                                 endpoint_type_name = segment.endpoint_type_names[segment.endpoint_type],
                                                 parents_str = parents_str,
                                                 n_parents = len(segment.wtg_parent_ids),
                                                 max_seg_id_len=max_seg_id_len,
                                                 max_status_name_len=max_status_name_len,
                                                 max_endpoint_type_len=max_endpoint_type_len,
                                                 max_n_parents_len=max_n_parents_len))
            if self.print_pcoords:
                self.output_file.write(pcoord_lines.format(init_pcoord = init_pcoord_str,final_pcoord = final_pcoord_str))
Example #44
0
class WDumpSegs(WESTTool):
    prog = 'w_dumpsegs'
    description = '''\
Dump segment data as text. This is very inefficient, so this tool should be used
as a last resort (use hdfview/h5ls to look at data, and access HDF5 directly for
significant analysis tasks). 
'''

    def __init__(self):
        super(WDumpSegs, self).__init__()
        self.data_reader = WESTDataReader()
        self.n_iter = None
        self.output_file = None
        self.print_pcoords = False

    def add_args(self, parser):
        self.data_reader.add_args(parser)

        parser.add_argument(
            '-p',
            '--print-pcoords',
            dest='print_pcoords',
            action='store_true',
            help='print initial and final progress coordinates for each segment'
        )
        parser.add_argument(
            '-i',
            '--iteration',
            dest='n_iter',
            type=int,
            help=
            'Use data from iteration N_ITER (default: last complete iteration)'
        )
        parser.add_argument(
            '-o',
            '--output',
            dest='output_file',
            help=
            'Store output in OUTPUT_FILE (default: write to standard output).')

    def process_args(self, args):
        self.data_reader.process_args(args)
        self.data_reader.open()
        self.n_iter = args.n_iter or self.data_reader.current_iteration - 1
        self.output_file = open(args.output_file,
                                'wt') if args.output_file else sys.stdout
        self.print_pcoords = args.print_pcoords

    def go(self):
        segments = self.data_reader.get_segments(self.n_iter)

        max_seg_id_len = len(str(max(segment.seg_id for segment in segments)))
        max_status_name_len = max(
            list(map(len, iter(Segment.status_names.values()))))
        max_endpoint_type_len = max(
            list(map(len, iter(Segment.endpoint_type_names.values()))))
        max_n_parents_len = len(
            str(max(len(segment.wtg_parent_ids) for segment in segments)))

        report_line = (
            '{segment.n_iter:d}  {segment.seg_id:{max_seg_id_len}d}  {segment.weight:20.14g}'
            + '  {status_name:{max_status_name_len}s} ({segment.status})' +
            '  {segment.walltime:<12.6g} {segment.cputime:<12.6g}' +
            '  {endpoint_type_name:{max_endpoint_type_len}s} ({segment.endpoint_type})'
            +
            '  {n_parents:{max_n_parents_len}d} {segment.parent_id:{max_seg_id_len}d} {parents_str}'
            + '\n')
        pcoord_lines = (
            '  pcoord[0]  = {init_pcoord}\n  pcoord[-1] = {final_pcoord}' +
            '\n')
        for (_seg_id, segment) in enumerate(segments):
            parents_str = '[' + ', '.join(
                map(str, sorted(segment.wtg_parent_ids))) + ']'
            init_pcoord_str = '[' + ', '.join(
                '{pcval:<12.6g}'.format(pcval=float(pce))
                for pce in segment.pcoord[0]) + ']'
            final_pcoord_str = '[' + ', '.join(
                '{pcval:<12.6g}'.format(pcval=float(pce))
                for pce in segment.pcoord[-1]) + ']'
            self.output_file.write(
                report_line.format(
                    segment=segment,
                    status_name=segment.status_names[segment.status],
                    endpoint_type_name=segment.endpoint_type_names[
                        segment.endpoint_type],
                    parents_str=parents_str,
                    n_parents=len(segment.wtg_parent_ids),
                    max_seg_id_len=max_seg_id_len,
                    max_status_name_len=max_status_name_len,
                    max_endpoint_type_len=max_endpoint_type_len,
                    max_n_parents_len=max_n_parents_len))
            if self.print_pcoords:
                self.output_file.write(
                    pcoord_lines.format(init_pcoord=init_pcoord_str,
                                        final_pcoord=final_pcoord_str))
Example #45
0
class WIPI(WESTParallelTool):
    '''
        Welcome to w_ipa (WESTPA Interactive Python Analysis)!
        From here, you can run traces, look at weights, progress coordinates, etc.
        This is considered a 'stateful' tool; that is, the data you are pulling is always pulled
        from the current analysis scheme and iteration.
        By default, the first analysis scheme in west.cfg is used, and you are set at iteration 1.

        ALL PROPERTIES ARE ACCESSED VIA w or west
        To see the current iteration, try:

            w.iteration
            OR
            west.iteration

        to set it, simply plug in a new value.

            w.iteration = 100

        To change/list the current analysis schemes:

            w.list_schemes
            w.scheme = OUTPUT FROM w.list_schemes

        To see the states and bins defined in the current analysis scheme:

            w.states
            w.bin_labels

        All information about the current iteration is available in an object called 'current':

            w.current
            walkers, summary, states, seg_id, weights, parents, kinavg, pcoord, bins, populations, and auxdata, if it exists.

        In addition, the function w.trace(seg_id) will run a trace over a seg_id in the current iteration and return a dictionary
        containing all pertinent information about that seg_id's history.  It's best to store this, as the trace can be expensive.

        Run help on any function or property for more information!

        Happy analyzing!
                
    '''

    def __init__(self):
        super(WIPI,self).__init__()
        self.data_reader = WESTDataReader()
        self.wm_env.default_work_manager = self.wm_env.default_parallel_work_manager
        self.progress = ProgressIndicatorComponent()

        self._iter = 1
        self.config_required = True
        self.version = "1.0B"
        # Default plotting interface; --terminal switches this to text output
        # (see process_args).
        self.interface = 'matplotlib'
        self._scheme = None
        global iteration

    def add_args(self, parser):
        self.progress.add_args(parser)
        self.data_reader.add_args(parser)
        rgroup = parser.add_argument_group('runtime options')
        rgroup.add_argument('--analysis-only', '-ao', dest='analysis_mode', action='store_true',
                             help='''Use this flag to run the analysis and return to the terminal.''')
        rgroup.add_argument('--reanalyze', '-ra', dest='reanalyze', action='store_true',
                             help='''Use this flag to delete the existing files and reanalyze.''')
        rgroup.add_argument('--ignore-hash', '-ih', dest='ignore_hash', action='store_true',
                             help='''Ignore hash and don't regenerate files.''')
        rgroup.add_argument('--debug', '-d', dest='debug_mode', action='store_true',
                             help='''Debug output largely intended for development.''')
        rgroup.add_argument('--terminal', '-t', dest='plotting', action='store_true',
                             help='''Plot output in terminal.''')
        # There is almost certainly a better way to handle this, but we'll sort that later.
        import argparse
        rgroup.add_argument('--f', '-f', dest='extra', default='blah',
                             help=argparse.SUPPRESS)
        
        parser.set_defaults(compression=True)

    def process_args(self, args):
        self.progress.process_args(args)
        self.data_reader.process_args(args)
        with self.data_reader:
            self.niters = self.data_reader.current_iteration - 1
        self.__config = westpa.rc.config
        self.__settings = self.__config['west']['analysis']
        for ischeme, scheme in enumerate(self.__settings['analysis_schemes']):
            if (self.__settings['analysis_schemes'][scheme]['enabled'] == True or self.__settings['analysis_schemes'][scheme]['enabled'] == None):
                self.scheme = scheme
        self.data_args = args
        self.analysis_mode = args.analysis_mode
        self.reanalyze = args.reanalyze
        self.ignore_hash = args.ignore_hash
        self.debug_mode = args.debug_mode
        if args.plotting:
            self.interface = 'text'

    def hash_args(self, args, extra=None, path=None):
        '''Create a unique hash stamp to determine whether the arguments/file differ from before.
        Combined with the iteration, this tells us whether the file needs updating.'''
        # Why are we not loading this functionality into the individual tools?
        # While it may certainly be useful to store arguments (and we may well do that),
        # it's rather complex and nasty to deal with pickling and hashing arguments through
        # the various namespaces.
        # In addition, it's unlikely that the functionality is desired at the individual tool level,
        # since we'll always just rewrite a file when we call the function.
        #return hashlib.md5(pickle.dumps([args, extra])).hexdigest()
        # We don't care about the path, so we'll remove it.
        # Probably a better way to do this, but who cares.
        cargs = list(args)
        for iarg, arg in enumerate(cargs):
            if path in arg:
                cargs[iarg] = arg.replace(path,'').replace('/', '')
            if arg == '--disable-averages':
                cargs.remove('--disable-averages')
        to_hash = cargs + [extra]
        #print(args)
        #print(to_hash)
        #print(str(to_hash).encode('base64'))
        if self.debug_mode:
            for iarg, arg in enumerate(to_hash):
                if not isinstance(arg, list):
                    print('arg {num:02d} -- {arg:<20}'.format(num=iarg, arg=arg))
                else:
                    for il, l in enumerate(arg):
                        print('arg {num:02d} -- {arg:<20}'.format(num=il+iarg, arg=l))
            #print('args: {}'.format(to_hash))
        # This SHOULD produce the same output, maybe?  That would be nice, anyway.
        # But we'll need to test it more.
        return hashlib.md5(base64.b64encode(str(to_hash).encode())).hexdigest()

    def stamp_hash(self, h5file_name, new_hash):
        '''Loads a file, stamps it, and returns the opened file in read only'''
        h5file = h5io.WESTPAH5File(h5file_name, 'r+')
        h5file.attrs['arg_hash'] = new_hash
        h5file.close()
        h5file = h5io.WESTPAH5File(h5file_name, 'r')
        return h5file

    def analysis_structure(self):
        '''
        Run automatically on startup.  Parses through the configuration file, and loads up all the data files from the different 
        analysis schematics.  If they don't exist, it creates them automatically by hooking in to existing analysis routines 
        and going from there.  

        It does this by calling in the make_parser_and_process function for w_{assign,reweight,direct} using a custom built list
        of args.  The user can specify everything in the configuration file that would have been specified on the command line.

        For instance, were one to call w_direct as follows:

            w_direct --evolution cumulative --step-iter 1 --disable-correl

        the west.cfg would look as follows:

        west:
          analysis:
            w_direct:
              evolution: cumulative
              step_iter: 1
              extra: ['disable-correl']

        Alternatively, if one wishes to use the same options for both w_direct and w_reweight, the key 'w_direct' can be replaced
        with 'kinetics'.
        '''
        # Make sure everything exists.
        try:
            os.mkdir(self.__settings['directory'])
        except:
            pass
        # Now, check to see whether they exist, and then load them.
        self.__analysis_schemes__ = {}
        # We really need to implement some sort of default behavior if an analysis scheme isn't set.
        # Right now, we just crash.  That isn't really graceful.
        for scheme in self.__settings['analysis_schemes']:
            if self.__settings['analysis_schemes'][scheme]['enabled']:
                if self.work_manager.running == False:
                    self.work_manager.startup()
                path = os.path.join(os.getcwd(), self.__settings['directory'], scheme)
                #if 'postanalysis' in self.__settings['analysis_schemes'][scheme] and 'postanalysis' in self.__settings['postanalysis']:
                # Should clean this up.  But it uses the default global setting if a by-scheme one isn't set.
                if 'postanalysis' in self.__settings:
                    if 'postanalysis' in self.__settings['analysis_schemes'][scheme]:
                        pass
                    else:
                        self.__settings['analysis_schemes'][scheme]['postanalysis'] = self.__settings['postanalysis']
                try:
                    os.mkdir(path)
                except:
                    pass
                self.__analysis_schemes__[scheme] = {}
                try:
                    if self.__settings['analysis_schemes'][scheme]['postanalysis'] == True or self.__settings['postanalysis'] == True:
                        analysis_files = ['assign', 'direct', 'reweight']
                    else:
                        analysis_files = ['assign', 'direct']
                except:
                    analysis_files = ['assign', 'direct']
                    self.__settings['analysis_schemes'][scheme]['postanalysis'] = False
                reanalyze_kinetics = False
                assign_hash = None
                for name in analysis_files:
                    arg_hash = None
                    if self.reanalyze == True:
                        reanalyze_kinetics = True
                        try:
                            os.remove(os.path.join(path, '{}.h5'.format(name)))
                        except:
                            pass
                    else:
                        try:
                            # Try to load the hash.  If we fail to load the hash or the file, we need to reload.
                            #if self.reanalyze == True:
                            #    raise ValueError('Reanalyze set to true.')
                            self.__analysis_schemes__[scheme][name] = h5io.WESTPAH5File(os.path.join(path, '{}.h5'.format(name)), 'r')
                            arg_hash = self.__analysis_schemes__[scheme][name].attrs['arg_hash']
                            if name == 'assign':
                                assign_hash = arg_hash
                        except:
                            pass
                            # We shouldn't rely on this.
                            # self.reanalyze = True
                    if True:
                        if name == 'assign':
                            assign = w_assign.WAssign()

                            w_assign_config = { 'output': os.path.join(path, '{}.h5'.format(name))}
                            try:
                                w_assign_config.update(self.__settings['w_assign'])
                            except:
                                pass
                            try:
                                w_assign_config.update(self.__settings['analysis_schemes'][scheme]['w_assign'])
                            except:
                                pass
                            args = []
                            for key,value in w_assign_config.items():
                                if key != 'extra':
                                    args.append(str('--') + str(key).replace('_', '-'))
                                    args.append(str(value))
                            # This is for stuff like disabling correlation analysis, etc.
                            if 'extra' in list(w_assign_config.keys()):
                                # We're sorting to ensure that the order doesn't matter.
                                for value in sorted(w_assign_config['extra']):
                                    args.append(str('--') + str(value).replace('_', '-'))
                            # We're just calling the built in function.
                            # This is a lot cleaner than what we had in before, and far more workable.
                            args.append('--config-from-file')
                            args.append('--scheme-name')
                            args.append('{}'.format(scheme))
                            # Why are we calling this if we're not sure we're remaking the file?
                            # We need to load up the bin mapper and states and see if they're the same.
                            assign.make_parser_and_process(args=args)
                            import pickle
                            #new_hash = self.hash_args(args=args, path=path, extra=[self.niters, pickle.dumps(assign.binning.mapper), assign.states])
                            # We need to encode it properly to ensure that some OS specific thing doesn't kill us.  Same goes for the args, ultimately.
                            # Mostly, we just need to ensure that we're consistent.
                            new_hash = self.hash_args(args=args, path=path,
                                                      extra=[int(self.niters),
                                                      codecs.encode(pickle.dumps(assign.binning.mapper), "base64"),
                                                      base64.b64encode(str(assign.states).encode())])
                            # Let's check the hash.  If the hash is the same, we don't need to reload.
                            if self.debug_mode == True:
                                print('{:<10}: old hash, new hash -- {}, {}'.format(name, arg_hash, new_hash))
                            if self.ignore_hash == False and (arg_hash != new_hash or self.reanalyze == True):
                                # If the hashes are different, or we need to reanalyze, delete the file.
                                try:
                                    os.remove(os.path.join(path, '{}.h5'.format(name)))
                                except:
                                    pass
                                print('Reanalyzing file {}.h5 for scheme {}.'.format(name, scheme))
                                #reanalyze_kinetics = True
                                # We want to use the work manager we have here.  Otherwise, just let the tool sort out what it needs, honestly.
                                assign.work_manager = self.work_manager

                                assign.go()
                                assign.data_reader.close()

                                # Stamp w/ hash, then reload as read only.
                                self.__analysis_schemes__[scheme][name] = self.stamp_hash(os.path.join(path, '{}.h5'.format(name)), new_hash)
                            del(assign)
                            # Update the assignment hash.
                            assign_hash = new_hash

                        # Since these are all contained within one tool, now, we want it to just... load everything.
                        if name == 'direct' or name == 'reweight':
                            assignment_file = self.__analysis_schemes__[scheme]['assign']
                            if name == 'direct':
                                analysis = w_direct.WDirect()
                            if name == 'reweight':
                                analysis = w_reweight.WReweight()
                            
                            analysis_config = { 'assignments': os.path.join(path, '{}.h5'.format('assign')), 'output': os.path.join(path, '{}.h5'.format(name)), 'kinetics': os.path.join(path, '{}.h5'.format(name))}

                            # Pull from general analysis options, then general SPECIFIC options for each analysis,
                            # then general options for that analysis scheme, then specific options for the analysis type in the scheme.

                            try:
                                analysis_config.update(self.__settings['kinetics'])
                            except:
                                pass
                            try:
                                analysis_config.update(self.__settings['w_{}'.format(name)])
                            except:
                                pass
                            try:
                                analysis_config.update(self.__settings['analysis_schemes'][scheme]['kinetics'])
                            except:
                                pass
                            try:
                                analysis_config.update(self.__settings['analysis_schemes'][scheme]['w_{}'.format(name)])
                            except:
                                pass

                            # We're pulling in a default set of arguments, then updating them with arguments from the west.cfg file, if appropriate, after setting the appropriate command
                            # Then, we call the magic function 'make_parser_and_process' with the arguments we've pulled in.
                            # The tool has no real idea it's being called outside of its actual function, and we're good to go.
                            args = ['all']
                            for key,value in analysis_config.items():
                                if key != 'extra':
                                    args.append(str('--') + str(key).replace('_', '-'))
                                    args.append(str(value))
                            # This is for stuff like disabling correlation analysis, etc.
                            if 'extra' in list(analysis_config.keys()):
                                for value in sorted(analysis_config['extra']):
                                    args.append(str('--') + str(value).replace('_', '-'))
                            # We want to not display the averages, so...
                            args.append('--disable-averages')
                            new_hash = self.hash_args(args=args, path=path, extra=[int(self.niters), assign_hash])
                            #if arg_hash != new_hash or self.reanalyze == True or reanalyze_kinetics == True:
                            if self.debug_mode == True:
                                print('{:<10}: old hash, new hash -- {}, {}'.format(name, arg_hash, new_hash))
                            if self.ignore_hash == False and (arg_hash != new_hash or reanalyze_kinetics == True):
                                try:
                                    os.remove(os.path.join(path, '{}.h5'.format(name)))
                                except:
                                    pass
                                print('Reanalyzing file {}.h5 for scheme {}.'.format(name, scheme))
                                analysis.make_parser_and_process(args=args)
                                # We want to hook into the existing work manager.
                                analysis.work_manager = self.work_manager

                                analysis.go()

                                # Open!
                                self.__analysis_schemes__[scheme][name] = self.stamp_hash(os.path.join(path, '{}.h5'.format(name)), new_hash)
                            del(analysis)

        # Make sure this doesn't get too far out, here.  We need to keep it alive as long as we're actually analyzing things.
        # self.work_manager.shutdown()
        print("")
        print("Complete!")

    @property
    def assign(self):
        return self.__analysis_schemes__[str(self.scheme)]['assign']

    @property
    def direct(self):
        """
        The output from w_kinavg.py from the current scheme.
        """
        return self.__analysis_schemes__[str(self.scheme)]['direct']


    @property
    def state_labels(self):
        print("State labels and definitions!")
        for istate, state in enumerate(self.assign['state_labels']):
            print('{}: {}'.format(istate, state))
        print('{}: {}'.format(istate+1, 'Unknown'))

    @property
    def bin_labels(self):
        print("Bin definitions! ")
        for istate, state in enumerate(self.assign['bin_labels']):
            print('{}: {}'.format(istate, state))

    @property
    def west(self):
        return self.data_reader.data_manager.we_h5file

    @property
    def reweight(self):
        if self.__settings['analysis_schemes'][str(self.scheme)]['postanalysis'] == True:
            return self.__analysis_schemes__[str(self.scheme)]['reweight']
        else:
            value = "This sort of analysis has not been enabled."
            current = { 'bin_prob_evolution': value, 'color_prob_evolution': value, 'conditional_flux_evolution': value, 'rate_evolution': value, 'state_labels': value, 'state_prob_evolution': value }
            current.update({ 'bin_populations': value, 'iterations': value })
            return current

    @property
    def scheme(self):
        '''
        Returns and sets what scheme is currently in use.
        To see what schemes are available, run:

            w.list_schemes

        '''
        # Let's do this a few different ways.
        # We want to return things about the DIFFERENT schemes, if possible.
        if self._scheme == None:
            self._scheme = WIPIScheme(scheme=self.__analysis_schemes__, name=self._schemename, parent=self, settings=self.__settings)

        # This just ensures that when we call it, it's clean.
        self._scheme.name = None
        return self._scheme

    @scheme.setter
    def scheme(self, scheme):
        self._future = None
        self._current = None
        self._past = None
        if scheme in self.__settings['analysis_schemes']:
            pass
        else:
            for ischeme, schemename in enumerate(self.__settings['analysis_schemes']):
                if ischeme == scheme:
                    scheme = schemename
        if self.__settings['analysis_schemes'][scheme]['enabled'] == True or self.__settings['analysis_schemes'][scheme]['enabled'] == None:
            self._schemename = scheme
        else:
            print("Scheme cannot be changed to scheme: {}; it is not enabled!".format(scheme))

    @property
    def list_schemes(self):
        '''
        Lists what schemes are configured in west.cfg file.
        Schemes should be structured as follows, in west.cfg:

        west:
          system:
            analysis:
              directory: analysis
              analysis_schemes:
                scheme.1:
                  enabled: True
                  states:
                    - label: unbound
                      coords: [[7.0]]
                    - label: bound
                      coords: [[2.7]]
                  bins:
                    - type: RectilinearBinMapper
                      boundaries: [[0.0, 2.80, 7, 10000]]
        '''
        #print("The following schemes are available:")
        #print("")
        #for ischeme, scheme in enumerate(self.__settings['analysis_schemes']):
        #    print('{}. Scheme: {}'.format(ischeme, scheme))
        #print("")
        #print("Set via name, or via the index listed.")
        #print("")
        #print("Current scheme: {}".format(self.scheme))
        self._scheme.list_schemes

    @property
    def iteration(self):
        '''
        Returns/sets the current iteration.
        '''
        #print("The current iteration is {}".format(self._iter))
        return self._iter

    @iteration.setter
    def iteration(self, value):
        print("Setting iteration to iter {}.".format(value))
        if value <= 0:
            print("Iteration must begin at 1.")
            value = 1
        if value > self.niters:
            print("Cannot go beyond {} iterations!".format(self.niters))
            print("Setting to {}".format(self.niters))
            value = self.niters
        # We want to trigger a rebuild on our current/past/future bits.
        # The scheme should automatically reset to the proper iteration, but
        # future needs to be manually triggered.
        self._iter = value
        self._future = None
        return self._iter


    @property
    def current(self):
        '''
        The current iteration.  See help for __get_data_for_iteration__
        '''
        return self.scheme[self.scheme.scheme].current

    @property
    def past(self):
        '''
        The previous iteration.  See help for __get_data_for_iteration__
        '''
        return self.scheme[self.scheme.scheme].past


    def trace(self, seg_id):
        '''
        Runs a trace on a seg_id within the current iteration, all the way back to the beginning,
        returning a dictionary containing all interesting information:

            seg_id, pcoord, states, bins, weights, iteration, auxdata (optional)

        sorted in chronological order.


        Call with a seg_id.
        '''
        if seg_id >= self.current.walkers:
            print("Walker seg_id # {} is beyond the max count of {} walkers.".format(seg_id, self.current.walkers))
            return 1
        pi = self.progress.indicator
        with pi:
            pi.new_operation('Tracing scheme:iter:seg_id {}:{}:{}'.format(self.scheme, self.iteration, seg_id), self.iteration)
            current = { 'seg_id': [], 'pcoord': [], 'states': [], 'weights': [], 'iteration': [], 'bins': [] }
            keys = []
            try:
                current['auxdata'] = {}
                for key in list(self.current['auxdata'].keys()):
                    current['auxdata'][key] = []
                    keys.append(key)
            except:
                pass
            for iter in reversed(list(range(1, self.iteration+1))):
                iter_group = self.data_reader.get_iter_group(iter)
                particles = self.data_reader.data_manager.get_iter_summary(int(iter))['n_particles']
                current['pcoord'].append(iter_group['pcoord'][seg_id, :, :])
                current['states'].append(self.assign['trajlabels'][iter-1, seg_id,:])
                current['bins'].append(self.assign['assignments'][iter-1, seg_id,:])
                current['seg_id'].append(seg_id)
                current['weights'].append(iter_group['seg_index']['weight'][seg_id])
                current['iteration'].append(iter)
                try:
                    for key in keys:
                        current['auxdata'][key].append(iter_group['auxdata'][key][seg_id])
                except:
                    pass
                seg_id = iter_group['seg_index']['parent_id'][seg_id]
                if seg_id < 0:
                    # Necessary for steady state simulations.  This means they started in that iteration.
                    break
                pi.progress += 1
        current['seg_id'] = list(reversed(current['seg_id']))
        current['iteration'] = list(reversed(current['iteration']))
        current['states'] = np.concatenate(np.array(list(reversed(current['states']))))
        current['bins'] = np.concatenate(np.array(list(reversed(current['bins']))))
        current['weights'] = np.array(list(reversed(current['weights'])))
        current['pcoord'] = np.concatenate(np.array(list(reversed(current['pcoord']))))
        try:
            for key in keys:
                current['auxdata'][key] = np.concatenate(np.array(list(reversed(current['auxdata'][key]))))
        except:
            pass
        current['state_labels'] = self.assign['state_labels']
        for i in ['pcoord', 'states', 'bins', 'weights']:
            current[i] = WIPIDataset(raw=current[i], key=i)
            if i == 'weights':
                current[i].plotter = Plotter(np.log10(current[i].raw), str('log10 of ' + str(i)), iteration=current[i].raw.shape[0], interface=self.interface)
            else:
                current[i].plotter = Plotter(current[i].raw, i, iteration=current[i].raw.shape[0], interface=self.interface)
            current[i].plot = current[i].plotter.plot
        return WIPIDataset(raw=current, key=seg_id)

    @property
    def future(self, value=None):
        '''
        Similar to current/past, but keyed differently and returns different datasets.
        See help for Future.
        '''
        if self._future == None:
            self._future = self.Future(raw=self.__get_children__(), key=None)
            self._future.iteration = self.iteration+1
        return self._future

    class Future(WIPIDataset):

        # This isn't a real fancy one.
        def __getitem__(self, value):
            if isinstance(value, str):
                print(list(self.__dict__.keys()))
                try:
                    return self.__dict__['raw'][value]
                except:
                    print('{} is not a valid data structure.'.format(value))
            elif isinstance(value, int) or isinstance(value, np.int64):
                # Otherwise, we assume they're trying to index for a seg_id.
                #if value < self.parent.walkers:
                current = {}
                seg_items = ['weights', 'pcoord', 'auxdata', 'parents', 'seg_id', 'states']
                current['pcoord'] = self.__dict__['raw']['pcoord'][value]
                current['states'] = self.__dict__['raw']['states'][value]
                current['bins'] = self.__dict__['raw']['bins'][value]
                current['parents'] = self.__dict__['raw']['parents'][value]
                current['seg_id'] = self.__dict__['raw']['seg_id'][value]
                current['weights'] = self.__dict__['raw']['weights'][value]
                try:
                    current['auxdata'] = {}
                    for key in list(self.__dict__['raw']['auxdata'].keys()):
                        current['auxdata'][key] = self.__dict__['raw']['auxdata'][key][value]
                except:
                    pass
                current = WIPIDataset(current, 'Segment {} in Iter {}'.format(value, self.iteration))
                return current

    def __get_children__(self):
        '''
        Returns all information about the children of a given walker in the current iteration.
        Used to generate and create the future object, if necessary.
        '''
        
        if self.iteration == self.niters:
            print("Currently at iteration {}, which is the max.  There are no children!".format(self.iteration))
            return 0
        iter_data = __get_data_for_iteration__(value=self.iteration+1, parent=self)
        future = { 'weights': [], 'pcoord': [], 'parents': [], 'summary': iter_data['summary'], 'seg_id': [], 'walkers': iter_data['walkers'], 'states': [], 'bins': [] }
        for seg_id in range(0, self.current.walkers):
            children = np.where(iter_data['parents'] == seg_id)[0]
            if len(children) == 0:
                error = "No children for seg_id {}.".format(seg_id)
                future['weights'].append(error)
                future['pcoord'].append(error)
                future['parents'].append(error)
                future['seg_id'].append(error)
                future['states'].append(error)
                future['bins'].append(error)
            else:
                # Collect this walker's children from the next iteration's data.
                value = self.iteration+1 
                future['weights'].append(iter_data['weights'][children])
                future['pcoord'].append(iter_data['pcoord'][...][children, :, :])
                try:
                    aux_data = iter_data['auxdata'][...][children, :, :]
                    try:
                        future['aux_data'].append(aux_data)
                    except:
                        future['aux_data'] = aux_data
                except:
                    pass
                future['parents'].append(iter_data['parents'][children])
                future['seg_id'].append(iter_data['seg_id'][children])
                future['states'].append(self.assign['trajlabels'][value-1, children, :])
                future['bins'].append(self.assign['assignments'][value-1, children, :])
        return future

    def go(self):
        '''
        Function automatically called by main() when launched via the command line interface.
        Generally, call main, not this function.
        '''
        print("")
        print("Welcome to w_ipa (WESTPA Interactive Python Analysis) v. {}!".format(w.version))
        print("Run w.introduction for a more thorough introduction, or w.help to see a list of options.")
        print("Running analysis & loading files.")
        self.data_reader.open()
        self.analysis_structure()
        # Seems to be consistent with other tools, such as w_assign.  For setting the iterations.
        self.data_reader.open()
        self.niters = self.data_reader.current_iteration - 1
        self.iteration = self.niters
        try:
            print('Your current scheme, system and iteration are : {}, {}, {}'.format(w.scheme, os.getcwd(), w.iteration))
        except:
            pass

    @property
    def introduction(self):
        '''
        Just spits out an introduction, in case someone doesn't call help.
        '''
        help_string = '''
        Call as a dictionary item or a .attribute:

        w.past, w.current, w.future:
            
            {current}

        Raw schemes can be accessed as follows:

            w.scheme.{scheme_keys}

            and contain mostly the same datasets associated with w.

        The following give raw access to the h5 files associated with the current scheme

        w.west
        w.assign
        w.direct
        w.reweight

        OTHER:

        {w}

        '''.format(current=self.__format_keys__(self.current.__dir__(), split=' ', offset=12), scheme_keys=self.__format_keys__(list(self._scheme.raw.keys())),
                   w=self.__format_keys__(self.__dir__(), offset=8, max_length=0, split='', prepend='w.'))
        print(help_string)

    # Just a little function to be used with the introduction.
    def __format_keys__(self, keys, split='/', offset=0, max_length=80, prepend=''):
        rtn = ''
        run_length = 0
        for key in keys:
            rtn += prepend + str(key) + split
            run_length += len(str(key))
            if run_length >= max_length:
                run_length = offset
                rtn += '\n' + ' '*offset
        if rtn[-1] == split:
            return rtn[:-1]
        else:
            return rtn

    @property
    def help(self):
        ''' Just a minor function to call help on itself.  Only in here to really help someone get help.'''
        help(self)

    def _repr_pretty_(self, p, cycle):
        self.introduction
        return " "

    def __dir__(self):
        return_list = ['past', 'current', 'future']
        # For the moment, don't expose direct, reweight, or assign, as these are scheme dependent files.
        # They do exist, and always link to the current scheme, however.
        return_list += ['iteration', 'niters', 'scheme', 'list_schemes', 'bin_labels', 'state_labels', 'west', 'trace']
        return sorted(set(return_list))
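
The class docstring above describes stateful, interactive use. A short hypothetical session is sketched below; it assumes the tool has already been started so that ``w`` is the live WIPI instance (as in the tool's own go() message), and the indexing of the trace result follows the dictionary described in trace()'s docstring.

# Hypothetical interactive session (e.g., inside IPython after launching w_ipa);
# 'w' is assumed to be the running WIPI instance described above.
w.list_schemes             # show the configured analysis schemes
w.iteration = 50           # jump to iteration 50 (clamped to [1, w.niters])
print(w.current.walkers)   # number of walkers in the current iteration
history = w.trace(0)       # trace walker 0 back toward iteration 1 -- can be slow
print(history['weights'])  # weights along the traced history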
Example #46
0
class WFluxanlTool(WESTTool):
    prog = 'w_fluxanl'
    description = '''\
Extract fluxes into pre-defined target states from WEST data,
average, and construct confidence intervals. Monte Carlo bootstrapping
is used to account for the correlated and possibly non-Gaussian statistical
error in flux measurements.

All non-graphical output (including that to the terminal and HDF5) assumes that
the propagation/resampling period ``tau`` is equal to unity; to obtain results
in familiar units, divide all fluxes and multiply all correlation lengths by
the true value of ``tau``.
'''

    output_format_version = 2

    def __init__(self):
        super(WFluxanlTool, self).__init__()
        self.data_reader = WESTDataReader()
        self.iter_range = IterRangeSelection()
        self.output_h5file = None
        self.output_group = None
        self.target_groups = {}

        self.fluxdata = {}

        self.alpha = None
        self.autocorrel_alpha = None
        self.n_sets = None
        self.do_evol = False
        self.evol_step = 1

    def add_args(self, parser):
        self.data_reader.add_args(parser)
        self.iter_range.add_args(parser)
        ogroup = parser.add_argument_group('output options')
        ogroup.add_argument(
            '-o',
            '--output',
            default='fluxanl.h5',
            help=
            'Store intermediate data and analysis results to OUTPUT (default: %(default)s).'
        )
        cgroup = parser.add_argument_group('calculation options')
        cgroup.add_argument(
            '--disable-bootstrap',
            '-db',
            dest='bootstrap',
            action='store_const',
            const=False,
            help='''Disable the use of Monte Carlo block bootstrapping.''')
        cgroup.add_argument('--disable-correl',
                            '-dc',
                            dest='correl',
                            action='store_const',
                            const=False,
                            help='''Disable the correlation analysis.''')
        cgroup.add_argument(
            '-a',
            '--alpha',
            type=float,
            default=0.05,
            help=
            '''Calculate a (1-ALPHA) confidence interval on the average flux
                             (default: %(default)s)''')
        cgroup.add_argument(
            '--autocorrel-alpha',
            type=float,
            dest='acalpha',
            metavar='ACALPHA',
            help='''Evaluate autocorrelation of flux to (1-ACALPHA) significance.
                             Note that too small an ACALPHA will result in failure to detect autocorrelation
                             in a noisy flux signal. (Default: same as ALPHA.)'''
        )
        cgroup.add_argument(
            '-N',
            '--nsets',
            type=int,
            help=
            '''Use NSETS samples for bootstrapping (default: chosen based on ALPHA)'''
        )
        cgroup.add_argument(
            '--evol',
            action='store_true',
            dest='do_evol',
            help=
            '''Calculate time evolution of flux confidence intervals (expensive).'''
        )
        cgroup.add_argument(
            '--evol-step',
            type=int,
            default=1,
            metavar='ESTEP',
            help=
            '''Calculate time evolution of flux confidence intervals every ESTEP
                            iterations (default: %(default)s)''')

    def process_args(self, args):
        self.data_reader.process_args(args)
        self.data_reader.open()
        self.iter_range.data_manager = self.data_reader
        self.iter_range.process_args(args)

        self.output_h5file = h5py.File(args.output, 'w')

        self.alpha = args.alpha
        # Disable the bootstrap or the correlation analysis.
        self.mcbs_enable = args.bootstrap if args.bootstrap is not None else True
        self.do_correl = args.correl if args.correl is not None else True
        self.autocorrel_alpha = args.acalpha or self.alpha
        self.n_sets = args.nsets or mclib.get_bssize(self.alpha)

        self.do_evol = args.do_evol
        self.evol_step = args.evol_step or 1

    def calc_store_flux_data(self):
        westpa.rc.pstatus(
            'Calculating mean flux and confidence intervals for iterations [{},{})'
            .format(self.iter_range.iter_start, self.iter_range.iter_stop))

        fluxdata = extract_fluxes(self.iter_range.iter_start,
                                  self.iter_range.iter_stop, self.data_reader)

        # Create a group to store data in
        output_group = h5io.create_hdf5_group(self.output_h5file,
                                              'target_flux',
                                              replace=False,
                                              creating_program=self.prog)
        self.output_group = output_group
        output_group.attrs['version_code'] = self.output_format_version
        self.iter_range.record_data_iter_range(output_group)

        n_targets = len(fluxdata)
        index = numpy.empty((len(fluxdata), ), dtype=target_index_dtype)
        avg_fluxdata = numpy.empty((n_targets, ), dtype=ci_dtype)

        for itarget, (target_label,
                      target_fluxdata) in enumerate(fluxdata.items()):
            # Create group and index entry
            index[itarget]['target_label'] = str(target_label)
            target_group = output_group.create_group(
                'target_{}'.format(itarget))

            self.target_groups[target_label] = target_group

            # Store per-iteration values
            target_group['n_iter'] = target_fluxdata['n_iter']
            target_group['count'] = target_fluxdata['count']
            target_group['flux'] = target_fluxdata['flux']
            h5io.label_axes(target_group['flux'], ['n_iter'], units=['tau^-1'])

            # Calculate flux autocorrelation
            fluxes = target_fluxdata['flux']
            mean_flux = fluxes.mean()
            fmm = fluxes - mean_flux
            acorr = fftconvolve(fmm, fmm[::-1])
            acorr = acorr[len(acorr) // 2:]
            acorr /= acorr[0]
            acorr_ds = target_group.create_dataset('flux_autocorrel',
                                                   data=acorr)
            h5io.label_axes(acorr_ds, ['lag'], ['tau'])

            # Calculate overall averages and CIs
            #avg, lb_ci, ub_ci, correl_len = mclib.mcbs_ci_correl(fluxes, numpy.mean, self.alpha, self.n_sets,
            #                                                     autocorrel_alpha=self.autocorrel_alpha, subsample=numpy.mean)
            avg, lb_ci, ub_ci, sterr, correl_len = mclib.mcbs_ci_correl(
                {'dataset': fluxes},
                estimator=(lambda stride, dataset: numpy.mean(dataset)),
                alpha=self.alpha,
                n_sets=self.n_sets,
                autocorrel_alpha=self.autocorrel_alpha,
                subsample=numpy.mean,
                do_correl=self.do_correl,
                mcbs_enable=self.mcbs_enable)
            avg_fluxdata[itarget] = (self.iter_range.iter_start,
                                     self.iter_range.iter_stop, avg, lb_ci,
                                     ub_ci, sterr, correl_len)
            westpa.rc.pstatus('target {!r}:'.format(target_label))
            westpa.rc.pstatus(
                '  correlation length = {} tau'.format(correl_len))
            westpa.rc.pstatus(
                '  mean flux and CI   = {:e} ({:e},{:e}) tau^(-1)'.format(
                    avg, lb_ci, ub_ci))
            index[itarget]['mean_flux'] = avg
            index[itarget]['mean_flux_ci_lb'] = lb_ci
            index[itarget]['mean_flux_ci_ub'] = ub_ci
            index[itarget]['mean_flux_correl_len'] = correl_len

        # Write index and summary
        index_ds = output_group.create_dataset('index', data=index)
        index_ds.attrs['mcbs_alpha'] = self.alpha
        index_ds.attrs['mcbs_autocorrel_alpha'] = self.autocorrel_alpha
        index_ds.attrs['mcbs_n_sets'] = self.n_sets

        self.fluxdata = fluxdata
        self.output_h5file['avg_flux'] = avg_fluxdata

    def calc_evol_flux(self):
        westpa.rc.pstatus(
            'Calculating cumulative evolution of flux confidence intervals every {} iteration(s)'
            .format(self.evol_step))

        for itarget, (target_label,
                      target_fluxdata) in enumerate(self.fluxdata.items()):
            fluxes = target_fluxdata['flux']
            target_group = self.target_groups[target_label]
            iter_start = target_group['n_iter'][0]
            iter_stop = target_group['n_iter'][-1]
            iter_count = iter_stop - iter_start
            n_blocks = iter_count // self.evol_step
            if iter_count % self.evol_step > 0: n_blocks += 1

            cis = numpy.empty((n_blocks, ), dtype=ci_dtype)

            for iblock in range(n_blocks):
                block_iter_stop = min(
                    iter_start + (iblock + 1) * self.evol_step, iter_stop)
                istop = min((iblock + 1) * self.evol_step,
                            len(target_fluxdata['flux']))
                fluxes = target_fluxdata['flux'][:istop]

                #avg, ci_lb, ci_ub, correl_len = mclib.mcbs_ci_correl(fluxes, numpy.mean, self.alpha, self.n_sets,
                #                                                     autocorrel_alpha = self.autocorrel_alpha,
                #                                                     subsample=numpy.mean)
                avg, ci_lb, ci_ub, sterr, correl_len = mclib.mcbs_ci_correl(
                    {'dataset': fluxes},
                    estimator=(lambda stride, dataset: numpy.mean(dataset)),
                    alpha=self.alpha,
                    n_sets=self.n_sets,
                    autocorrel_alpha=self.autocorrel_alpha,
                    subsample=numpy.mean,
                    do_correl=self.do_correl,
                    mcbs_enable=self.mcbs_enable)
                cis[iblock]['iter_start'] = iter_start
                cis[iblock]['iter_stop'] = block_iter_stop
                cis[iblock]['expected'], cis[iblock]['ci_lbound'], cis[iblock][
                    'ci_ubound'] = avg, ci_lb, ci_ub
                cis[iblock]['corr_len'] = correl_len
                cis[iblock]['sterr'] = sterr

                del fluxes

            cis_ds = target_group.create_dataset('flux_evolution', data=cis)
            cis_ds.attrs['iter_step'] = self.evol_step
            cis_ds.attrs['mcbs_alpha'] = self.alpha
            cis_ds.attrs['mcbs_autocorrel_alpha'] = self.autocorrel_alpha
            cis_ds.attrs['mcbs_n_sets'] = self.n_sets

    def go(self):
        self.calc_store_flux_data()
        if self.do_evol:
            self.calc_evol_flux()
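
The flux autocorrelation computed in calc_store_flux_data() above uses the convolution
identity: correlating the mean-subtracted flux series with its own reverse and keeping
the non-negative lags gives the autocorrelation function, normalized so that lag 0
equals 1. A minimal standalone sketch of that normalization (numpy/scipy only; the
flux series here is synthetic):

import numpy
from scipy.signal import fftconvolve

# synthetic stand-in for target_fluxdata['flux']
fluxes = numpy.random.gamma(2.0, 1e-3, size=500)

fmm = fluxes - fluxes.mean()           # mean-subtracted series of length N
acorr = fftconvolve(fmm, fmm[::-1])    # full correlation, length 2*N - 1
acorr = acorr[len(acorr) // 2:]        # keep lags 0, 1, ..., N-1
acorr /= acorr[0]                      # normalize so that acorr[0] == 1
# acorr[k] now estimates the flux autocorrelation at lag k (in units of tau)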
Example #47
0
class WPDist(WESTParallelTool):
    prog = 'w_pdist'
    description = '''\
Calculate time-resolved, multi-dimensional probability distributions of WE 
datasets.


-----------------------------------------------------------------------------
Source data
-----------------------------------------------------------------------------

Source data is provided either by a user-specified function
(--construct-dataset) or a list of "data set specifications" (--dsspecs).
If neither is provided, the progress coordinate dataset ``pcoord`` is used.

To use a custom function to extract or calculate data whose probability
distribution will be calculated, specify the function in standard Python
MODULE.FUNCTION syntax as the argument to --construct-dataset. This function
will be called as function(n_iter,iter_group), where n_iter is the iteration
whose data are being considered and iter_group is the corresponding group
in the main WEST HDF5 file (west.h5). The function must return data which can
be indexed as [segment][timepoint][dimension].

To use a list of data set specifications, specify --dsspecs and then list the
desired datasets one-by-one (space-separated in most shells). These data set
specifications are formatted as NAME[,file=FILENAME,slice=SLICE], which will
use the dataset called NAME in the HDF5 file FILENAME (defaulting to the main
WEST HDF5 file west.h5), and slice it with the Python slice expression SLICE
(as in [0:2] to select the first two elements of the first axis of the
dataset). The ``slice`` option is most useful for selecting one column (or
more) from a multi-column dataset, such as arises when using a progress
coordinate of multiple dimensions.


-----------------------------------------------------------------------------
Histogram binning
-----------------------------------------------------------------------------

By default, histograms are constructed with 100 bins in each dimension. This
can be overridden by specifying -b/--bins, which accepts a number of different
kinds of arguments:

  a single integer N
    N uniformly spaced bins will be used in each dimension.
    
  a sequence of integers N1,N2,... (comma-separated)
    N1 uniformly spaced bins will be used for the first dimension, N2 for the
    second, and so on.
    
  a list of lists [[B11, B12, B13, ...], [B21, B22, B23, ...], ...]
    The bin boundaries B11, B12, B13, ... will be used for the first dimension,
    B21, B22, B23, ... for the second dimension, and so on. These bin
    boundaries need not be uniformly spaced. These expressions will be
    evaluated with Python's ``eval`` construct, with ``numpy`` available for
    use [e.g. to specify bins using numpy.arange()].

The first two forms (integer, list of integers) will trigger a scan of all
data in each dimension in order to determine the minimum and maximum values,
which may be very expensive for large datasets. This can be avoided by
explicitly providing bin boundaries using the list-of-lists form.

Note that these bins are *NOT* at all related to the bins used to drive WE
sampling.
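
For example, ``-b 50`` requests 50 bins in each dimension; ``-b [50,100]``
requests 50 bins for the first dimension and 100 for the second; and
``-b "[numpy.arange(0.0,10.1,0.5),[0.0,2.0,5.0,10.0]]"`` supplies explicit
(and possibly non-uniform) boundaries for a two-dimensional progress
coordinate. The bracketed forms usually need to be quoted to protect them
from the shell.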


-----------------------------------------------------------------------------
Output format
-----------------------------------------------------------------------------

The output file produced (specified by -o/--output, defaulting to "pdist.h5")
may be fed to plothist to generate plots (or appropriately processed text or
HDF5 files) from this data. In short, the following datasets are created:

  ``histograms``
    Normalized histograms. The first axis corresponds to iteration, and
    remaining axes correspond to dimensions of the input dataset.
    
  ``/binbounds_0``
    Vector of bin boundaries for the first (index 0) dimension. Additional
    datasets similarly named (/binbounds_1, /binbounds_2, ...) are created
    for additional dimensions.
    
  ``/midpoints_0``
    Vector of bin midpoints for the first (index 0) dimension. Additional
    datasets similarly named are created for additional dimensions.
    
  ``n_iter``
    Vector of iteration numbers corresponding to the stored histograms (i.e.
    the first axis of the ``histograms`` dataset).


-----------------------------------------------------------------------------
Subsequent processing
-----------------------------------------------------------------------------

The output generated by this program (-o/--output, default "pdist.h5") may be
plotted by the ``plothist`` program. See ``plothist --help`` for more
information.

    
-----------------------------------------------------------------------------
Parallelization
-----------------------------------------------------------------------------

This tool supports parallelized binning, including reading of input data.
Parallel processing is the default. For simple cases (reading pre-computed
input data, modest numbers of segments), serial processing (--serial) may be
more efficient.


-----------------------------------------------------------------------------
Command-line options
-----------------------------------------------------------------------------
    
'''

    def __init__(self):
        super(WPDist, self).__init__()

        # Parallel processing by default (this is not actually necessary, but it is
        # informative!)
        self.wm_env.default_work_manager = self.wm_env.default_parallel_work_manager

        # These are used throughout
        self.progress = ProgressIndicatorComponent()
        self.data_reader = WESTDataReader()
        self.input_dssynth = WESTDSSynthesizer(default_dsname='pcoord')
        self.iter_range = IterRangeSelection(self.data_reader)
        self.iter_range.include_args['iter_step'] = False
        self.binspec = None
        self.output_filename = None
        self.output_file = None

        self.dsspec = None
        self.wt_dsspec = None  # dsspec for weights

        # These are used during histogram generation only
        self.iter_start = None
        self.iter_stop = None
        self.ndim = None
        self.ntimepoints = None
        self.dset_dtype = None
        self.binbounds = None  # bin boundaries for each dimension
        self.midpoints = None  # bin midpoints for each dimension
        self.data_range = None  # data range for each dimension, as the pairs (min,max)
        self.ignore_out_of_range = False
        self.compress_output = False

    def add_args(self, parser):
        self.data_reader.add_args(parser)

        self.iter_range.add_args(parser)

        parser.add_argument(
            '-b',
            '--bins',
            dest='bins',
            metavar='BINEXPR',
            default='100',
            help=
            '''Use BINEXPR for bins. This may be an integer, which will be used for each
                            dimension of the progress coordinate; a list of integers (formatted as [n1,n2,...])
                            which will use n1 bins for the first dimension, n2 for the second dimension, and so on;
                            or a list of lists of boundaries (formatted as [[a1, a2, ...], [b1, b2, ...], ... ]), which
                            will use [a1, a2, ...] as bin boundaries for the first dimension, [b1, b2, ...] as bin boundaries
                            for the second dimension, and so on. (Default: 100 bins in each dimension.)'''
        )

        parser.add_argument(
            '-o',
            '--output',
            dest='output',
            default='pdist.h5',
            help='''Store results in OUTPUT (default: %(default)s).''')
        parser.add_argument(
            '-C',
            '--compress',
            action='store_true',
            help=
            '''Compress histograms. May make storage of higher-dimensional histograms
                            more tractable, at the (possible extreme) expense of increased analysis time.
                            (Default: no compression.)''')

        parser.add_argument(
            '--loose',
            dest='ignore_out_of_range',
            action='store_true',
            help=
            '''Ignore values that do not fall within bins. (Risky, as this can make buggy bin
                            boundaries appear as reasonable data. Only use if you are
                            sure of your bin boundary specification.)''')

        igroup = parser.add_argument_group(
            'input dataset options').add_mutually_exclusive_group(
                required=False)

        igroup.add_argument(
            '--construct-dataset',
            help=
            '''Use the given function (as in module.function) to extract source data.
                            This function will be called once per iteration as function(n_iter, iter_group)
                            to construct data for one iteration. Data returned must be indexable as
                            [seg_id][timepoint][dimension]''')

        igroup.add_argument(
            '--dsspecs',
            nargs='+',
            metavar='DSSPEC',
            help=
            '''Construct probability distribution from one or more DSSPECs.''')

        self.progress.add_args(parser)

    def process_args(self, args):
        self.progress.process_args(args)
        self.data_reader.process_args(args)
        self.input_dssynth.h5filename = self.data_reader.we_h5filename
        self.input_dssynth.process_args(args)
        self.dsspec = self.input_dssynth.dsspec

        # Carrying an open HDF5 file across a fork() seems to corrupt the entire HDF5 library
        # Open the WEST HDF5 file just long enough to process our iteration range, then close
        # and reopen in go() [which executes after the fork]
        with self.data_reader:
            self.iter_range.process_args(args)

        self.wt_dsspec = SingleIterDSSpec(self.data_reader.we_h5filename,
                                          'seg_index',
                                          slice=numpy.index_exp['weight'])

        self.binspec = args.bins
        self.output_filename = args.output
        self.ignore_out_of_range = bool(args.ignore_out_of_range)
        self.compress_output = args.compress or False

    def go(self):
        self.data_reader.open('r')
        pi = self.progress.indicator
        pi.operation = 'Initializing'
        with pi:
            self.output_file = h5py.File(self.output_filename, 'w')
            h5io.stamp_creator_data(self.output_file)

            self.iter_start = self.iter_range.iter_start
            self.iter_stop = self.iter_range.iter_stop

            # Construct bin boundaries
            self.construct_bins(self.parse_binspec(self.binspec))
            for idim, (binbounds, midpoints) in enumerate(
                    zip(self.binbounds, self.midpoints)):
                self.output_file['binbounds_{}'.format(idim)] = binbounds
                self.output_file['midpoints_{}'.format(idim)] = midpoints

            # construct histogram
            self.construct_histogram()

            # Record iteration range
            iter_range = self.iter_range.iter_range()
            self.output_file['n_iter'] = iter_range
            self.iter_range.record_data_iter_range(
                self.output_file['histograms'])

            self.output_file.close()

    @staticmethod
    def parse_binspec(binspec):
        namespace = {'numpy': numpy, 'inf': float('inf')}

        try:
            binspec_compiled = eval(binspec, namespace)
        except Exception as e:
            raise ValueError('invalid bin specification: {!r}'.format(e))
        else:
            if log.isEnabledFor(logging.DEBUG):
                log.debug('bin specs: {!r}'.format(binspec_compiled))
        return binspec_compiled

    def construct_bins(self, bins):
        '''
        Construct bins according to ``bins``, which may be:
        
          1) A scalar integer (for that number of bins in each dimension)
          2) A sequence of integers (specifying number of bins for each dimension)
          3) A sequence of sequences of bin boundaries (specifying boundaries for each dimension)
          
        Sets ``self.binbounds`` to a list of arrays of bin boundaries appropriate for passing to 
        fasthist.histnd, along with ``self.midpoints`` to the midpoints of the bins.
        '''

        if not isiterable(bins):
            self._construct_bins_from_scalar(bins)
        elif not isiterable(bins[0]):
            self._construct_bins_from_int_seq(bins)
        else:
            self._construct_bins_from_bound_seqs(bins)

        if log.isEnabledFor(logging.DEBUG):
            log.debug('binbounds: {!r}'.format(self.binbounds))

    def scan_data_shape(self):
        if self.ndim is None:
            dset = self.dsspec.get_iter_data(self.iter_start)
            self.ntimepoints = dset.shape[1]
            self.ndim = dset.shape[2]
            self.dset_dtype = dset.dtype

    def scan_data_range(self):
        '''Scan input data for range in each dimension. The number of dimensions is determined
        from the shape of the progress coordinate as of self.iter_start.'''

        self.progress.indicator.new_operation('Scanning for data range',
                                              self.iter_stop - self.iter_start)
        self.scan_data_shape()

        dset_dtype = self.dset_dtype
        ndim = self.ndim
        dsspec = self.dsspec

        try:
            minval = numpy.finfo(dset_dtype).min
            maxval = numpy.finfo(dset_dtype).max
        except ValueError:
            minval = numpy.iinfo(dset_dtype).min
            maxval = numpy.iinfo(dset_dtype).max

        data_range = self.data_range = [(maxval, minval)
                                        for _i in range(self.ndim)]

        #futures = []
        #for n_iter in xrange(self.iter_start, self.iter_stop):
        #_remote_min_max(ndim, dset_dtype, n_iter, dsspec)
        #    futures.append(self.work_manager.submit(_remote_min_max, args=(ndim, dset_dtype, n_iter, dsspec)))

        #for future in self.work_manager.as_completed(futures):
        for future in self.work_manager.submit_as_completed(
            ((_remote_min_max, (ndim, dset_dtype, n_iter, dsspec), {})
             for n_iter in range(self.iter_start, self.iter_stop)),
                self.max_queue_len):
            bounds = future.get_result(discard=True)
            for idim in range(ndim):
                current_min, current_max = data_range[idim]
                current_min = min(current_min, bounds[idim][0])
                current_max = max(current_max, bounds[idim][1])
                data_range[idim] = (current_min, current_max)
            self.progress.indicator.progress += 1

    def _construct_bins_from_scalar(self, bins):
        if self.data_range is None:
            self.scan_data_range()

        self.binbounds = []
        self.midpoints = []
        for idim in range(self.ndim):
            lb, ub = self.data_range[idim]
            # Advance just beyond the upper bound of the range, so that we catch
            # the maximum in the histogram
            ub *= 1.01

            boundset = numpy.linspace(lb, ub, bins + 1)
            midpoints = (boundset[:-1] + boundset[1:]) / 2.0
            self.binbounds.append(boundset)
            self.midpoints.append(midpoints)

    def _construct_bins_from_int_seq(self, bins):
        if self.data_range is None:
            self.scan_data_range()

        self.binbounds = []
        self.midpoints = []
        for idim in range(self.ndim):
            lb, ub = self.data_range[idim]
            # Advance just beyond the upper bound of the range, so that we catch
            # the maximum in the histogram
            ub *= 1.01

            boundset = numpy.linspace(lb, ub, bins[idim] + 1)
            midpoints = (boundset[:-1] + boundset[1:]) / 2.0
            self.binbounds.append(boundset)
            self.midpoints.append(midpoints)

    def _construct_bins_from_bound_seqs(self, bins):
        self.binbounds = []
        self.midpoints = []
        for boundset in bins:
            boundset = numpy.asarray(boundset)
            if (numpy.diff(boundset) <= 0).any():
                raise ValueError(
                    'boundary set {!r} is not strictly monotonically increasing'
                    .format(boundset))
            self.binbounds.append(boundset)
            self.midpoints.append((boundset[:-1] + boundset[1:]) / 2.0)

    def construct_histogram(self):
        '''Construct a histogram using bins previously constructed with ``construct_bins()``.
        The time series of histogram values is stored in ``histograms``.
        Each histogram in the time series is normalized.'''

        self.scan_data_shape()

        iter_count = self.iter_stop - self.iter_start
        histograms_ds = self.output_file.create_dataset(
            'histograms',
            dtype=numpy.float64,
            shape=((iter_count, ) +
                   tuple(len(bounds) - 1 for bounds in self.binbounds)),
            compression=9 if self.compress_output else None)
        binbounds = [
            numpy.require(boundset, self.dset_dtype, 'C')
            for boundset in self.binbounds
        ]

        self.progress.indicator.new_operation('Constructing histograms',
                                              self.iter_stop - self.iter_start)
        task_gen = (
            (_remote_bin_iter,
             (iiter, n_iter, self.dsspec, self.wt_dsspec,
              1 if iiter > 0 else 0, binbounds, self.ignore_out_of_range), {})
            for (iiter,
                 n_iter) in enumerate(range(self.iter_start, self.iter_stop)))
        #futures = set()
        #for iiter, n_iter in enumerate(xrange(self.iter_start, self.iter_stop)):
        #    initpoint = 1 if iiter > 0 else 0
        #    futures.add(self.work_manager.submit(_remote_bin_iter,
        #                                            args=(iiter, n_iter, self.dsspec, self.wt_dsspec, initpoint, binbounds)))

        #for future in self.work_manager.as_completed(futures):
        #future = self.work_manager.wait_any(futures)
        #for future in self.work_manager.submit_as_completed(task_gen, self.queue_size):
        log.debug('max queue length: {!r}'.format(self.max_queue_len))
        for future in self.work_manager.submit_as_completed(
                task_gen, self.max_queue_len):
            iiter, n_iter, iter_hist = future.get_result(discard=True)
            self.progress.indicator.progress += 1

            # store histogram
            histograms_ds[iiter] = iter_hist
            del iter_hist, future
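
As described in the ``Source data`` section of the w_pdist help text above,
--construct-dataset expects a callable invoked as function(n_iter, iter_group) that
returns data indexable as [segment][timepoint][dimension]. A minimal sketch of such a
function (the module name ``my_dsets`` and the auxiliary dataset ``auxdata/rmsd`` are
hypothetical examples), which would be passed as
``--construct-dataset my_dsets.rmsd_dataset``:

# my_dsets.py (hypothetical helper module)
import numpy

def rmsd_dataset(n_iter, iter_group):
    # iter_group is the HDF5 group for iteration n_iter in west.h5.
    # Assume an auxiliary dataset shaped (n_segments, n_timepoints);
    # add a trailing axis so the result is [segment][timepoint][dimension].
    rmsd = iter_group['auxdata/rmsd'][...]
    return rmsd[:, :, numpy.newaxis]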
Example #48
0
class WBinTool(WESTTool):
    prog='w_bins'
    description = '''\
Display information and statistics about binning in a WEST simulation, or
modify the binning for the current iteration of a WEST simulation.        
-------------------------------------------------------------------------------
'''    
    def __init__(self):
        super(WBinTool,self).__init__()
        self.subcommand = None
        self.data_reader = WESTDataReader() 
        self.binning = BinMappingComponent()
        self.args = None
        self.n_iter = None
                
    # Interface for command-line tools
    def add_args(self, parser):
        self.data_reader.add_args(parser)
        
        
        subparsers = parser.add_subparsers(help='available commands')
        
        info_parser = subparsers.add_parser('info', help='Display information about binning.')
        info_parser.add_argument('-n', '--n-iter', type=int, 
                                 help='''Consider initial points of segment N_ITER (default: current iteration).''')
        info_parser.add_argument('--detail', action='store_true',
                                 help='''Display detailed per-bin information in addition to summary
                                 information.''')
        self.binning.add_args(info_parser)
        info_parser.set_defaults(func=self.cmd_info)
        
        rebin_parser = subparsers.add_parser('rebin',help='Rebuild current iteration with new binning.')
        rebin_parser.add_argument('--confirm', action='store_true', 
                                  help='''Commit the revised iteration to HDF5; without this option, the effects of the
                                  new binning are only calculated and printed.''')
        rebin_parser.add_argument('--detail', action='store_true',
                                  help='''Display detailed per-bin information in addition to summary
                                     information.''')
        self.binning.add_args(rebin_parser, suppress=['--bins-from-file'])
        self.binning.add_target_count_args(rebin_parser)
        rebin_parser.set_defaults(func=self.cmd_rebin)
            
    def process_args(self, args):

        self.data_reader.process_args(args)
        self.data_reader.open(mode='r+')
        self.n_iter = getattr(args,'n_iter',None) or self.data_reader.current_iteration
        
        # we cannot read bin information during rebins
        # interesting note: '==' is required here; 'is' fails
        if args.func == self.cmd_rebin:
            self.binning.target_counts_required = True
        else:
            self.binning.set_we_h5file_info(self.n_iter, self.data_reader)    
        
        self.binning.process_args(args)
        
        self.args = args
        self.subcommand = args.func
        
    def go(self):
        self.subcommand()
        
    def cmd_info(self):
        mapper = self.binning.mapper
        
        # Get target states and their assignments
        target_states = self.data_reader.get_target_states(self.n_iter)
        n_target_states = len(target_states)
        
        iter_group = self.data_reader.get_iter_group(self.n_iter)
        
        # bin initial pcoords for iteration n_iter
        initial_pcoords = iter_group['pcoord'][:,0,:]
        assignments = mapper.assign(initial_pcoords)
        del initial_pcoords
        
        print('Bin information for iteration {:d}'.format(self.n_iter))
        
        # Get bin counts and weights
        weights = iter_group['seg_index']['weight']
        
        write_bin_info(mapper, assignments, weights, n_target_states, detailed=self.args.detail)
            
    def cmd_rebin(self):
        mapper = self.binning.mapper
        assert mapper is not None    
        if self.n_iter == 1:
            sys.stderr.write('rebin is not supported for the first iteration; reinitialize with w_init instead\n')
            sys.exit(1)
        n_target_states = len(self.data_reader.get_target_states(self.n_iter))
        we_driver = westpa.rc.get_we_driver()
        data_manager = self.data_reader.data_manager
        
        segments = data_manager.get_segments(self.n_iter,load_pcoords=True)
        last_iter_segments = data_manager.get_segments(self.n_iter-1,load_pcoords=False,load_auxdata=False)
                
        # Bin on this iteration's initial points
        # We don't have to worry about recycling because we are binning on
        # initial points rather than final points, so recycling has already
        # occurred for this iteration.
        # We do need initial states, in case we merge a newly-created walker out of existence
        #avail_initial_states = {state.state_id: state
        #                        for state in data_manager.get_unused_initial_states(n_iter = self.n_iter)}
        avail_initial_states = data_manager.get_unused_initial_states(n_iter = self.n_iter)
        used_initial_states = data_manager.get_segment_initial_states(segments)
        we_driver.new_iteration(initial_states=avail_initial_states,
                                bin_mapper=mapper, bin_target_counts=self.binning.bin_target_counts)
        we_driver.used_initial_states = {state.state_id: state for state in used_initial_states}
        we_driver.assign(segments,initializing=True)
        we_driver.rebin_current(parent_segments=last_iter_segments)
        
        weights = numpy.array([segment.weight for segment in we_driver.next_iter_segments])
        assignments = numpy.fromiter(we_driver.next_iter_assignments,dtype=int,count=len(weights))
        write_bin_info(mapper, assignments, weights, n_target_states, detailed=self.args.detail)
        
        if self.args.confirm:
            data_manager.prepare_iteration(self.n_iter, list(we_driver.next_iter_segments))
            
            # manually update endpoint statuses only
            endpoint_types = sorted([(segment.seg_id, segment.endpoint_type) for segment in last_iter_segments])
            last_iter_group = data_manager.get_iter_group(self.n_iter-1)
            last_iter_index = last_iter_group['seg_index'][...]
            last_iter_index['endpoint_type'] = [pair[1] for pair in endpoint_types]
            last_iter_group['seg_index'][...] = last_iter_index
            
            data_manager.save_iter_binning(self.n_iter, self.binning.mapper_hash, self.binning.mapper_pickle,
                                           we_driver.bin_target_counts)
            data_manager.update_initial_states(we_driver.all_initial_states)
            data_manager.flush_backing()
Example #49
0
 def __init__(self):
     super(WDumpSegs,self).__init__()
     self.data_reader = WESTDataReader()
     self.n_iter = None
     self.output_file = None
     self.print_pcoords = False
Example #50
0
class WNTopTool(WESTTool):
    prog='w_ntop'
    description = '''\
Select walkers from bins. An assignment file mapping walkers to
bins at each timepoint is required (see ``w_assign --help`` for further
information on generating this file). By default, high-weight walkers are
selected (hence the name ``w_ntop``: select the N top-weighted walkers from
each bin); however, minimum weight walkers and randomly-selected walkers
may be selected instead.


-----------------------------------------------------------------------------
Output format
-----------------------------------------------------------------------------

The output file (-o/--output, by default "ntop.h5") contains the following
datasets:

  ``/n_iter`` [iteration]
    *(Integer)* Iteration numbers for each entry in other datasets.

  ``/nsegs`` [iteration][bin]
    *(Integer)* Number of segments in each bin/state in the given iteration.
    This will generally be the same as the number requested with
    ``-n/--count`` but may be smaller if the requested number of walkers
    does not exist.

  ``/seg_ids`` [iteration][bin][segment]
    *(Integer)* Matching segments in each iteration for each bin.
    For a given iteration and bin, only the first ``nsegs`` entries are
    valid. For example, the full list of matching seg_ids in bin 0 in the
    first stored iteration is ``seg_ids[0][0][:nsegs[0][0]]``.

  ``/weights`` [iteration][bin][segment]
    *(Floating-point)* Weights for each matching segment in ``/seg_ids``.


-----------------------------------------------------------------------------
Command-line arguments
-----------------------------------------------------------------------------
'''

    def __init__(self):
        super(WNTopTool,self).__init__()

        self.data_reader = WESTDataReader()
        self.iter_range = IterRangeSelection()
        self.progress = ProgressIndicatorComponent()
        self.output_file = None
        self.assignments_filename = None
        self.output_filename = None
        self.what = None
        self.timepoint = None
        self.count = None

    def add_args(self, parser):
        self.data_reader.add_args(parser)
        self.iter_range.add_args(parser)
        
        igroup = parser.add_argument_group('input options')
        igroup.add_argument('-a', '--assignments', default='assign.h5',
                            help='''Use assignments from the given ASSIGNMENTS file (default: %(default)s).''')

        sgroup = parser.add_argument_group('selection options')
        sgroup.add_argument('-n', '--count', type=int, default=1,
                            help='''Select COUNT walkers from each iteration for each bin (default: %(default)s).''')
        sgroup.add_argument('-t', '--timepoint', type=int, default=-1,
                            help='''Base selection on the given TIMEPOINT within each iteration. Default (-1)
                            corresponds to the last timepoint.''')
        cgroup = parser.add_mutually_exclusive_group()
        cgroup.add_argument('--highweight', dest='select_what', action='store_const', const='highweight',
                            help='''Select COUNT highest-weight walkers from each bin.''')
        cgroup.add_argument('--lowweight', dest='select_what', action='store_const', const='lowweight',
                            help='''Select COUNT lowest-weight walkers from each bin.''')
        cgroup.add_argument('--random', dest='select_what', action='store_const', const='random',
                            help='''Select COUNT walkers randomly from each bin.''')
        parser.set_defaults(select_what='highweight')

        ogroup = parser.add_argument_group('output options')
        ogroup.add_argument('-o', '--output', default='ntop.h5',
                            help='''Write output to OUTPUT (default: %(default)s).''')
        self.progress.add_args(parser)

    def process_args(self, args):
        self.progress.process_args(args)
        self.data_reader.process_args(args)
        with self.data_reader:
            self.iter_range.process_args(args)
        self.what = args.select_what
        self.output_filename = args.output
        self.assignments_filename = args.assignments
        self.count = args.count
        self.timepoint = args.timepoint

    def go(self):
        self.data_reader.open('r')
        assignments_file = h5py.File(self.assignments_filename, mode='r')
        output_file = h5io.WESTPAH5File(self.output_filename, mode='w')
        pi = self.progress.indicator
        count = self.count
        timepoint = self.timepoint

        nbins = assignments_file.attrs['nbins']+1
        assignments_ds = assignments_file['assignments']

        iter_start, iter_stop = self.iter_range.iter_start, self.iter_range.iter_stop
        iter_count = iter_stop - iter_start
        h5io.check_iter_range_least(assignments_ds, iter_start, iter_stop)
        nsegs = assignments_file['nsegs'][h5io.get_iteration_slice(assignments_file['nsegs'], iter_start,iter_stop)]

        output_file.create_dataset('n_iter', dtype=n_iter_dtype, data=range(iter_start,iter_stop))

        seg_count_ds = output_file.create_dataset('nsegs', dtype=numpy.uint, shape=(iter_count,nbins))
        matching_segs_ds = output_file.create_dataset('seg_ids', shape=(iter_count,nbins,count),
                                                      dtype=seg_id_dtype,
                                                      chunks=h5io.calc_chunksize((iter_count,nbins,count), seg_id_dtype),
                                                      shuffle=True, compression=9)
        weights_ds = output_file.create_dataset('weights', shape=(iter_count,nbins,count),
                                                dtype=weight_dtype,
                                                chunks=h5io.calc_chunksize((iter_count,nbins,count), weight_dtype),
                                                shuffle=True,compression=9)
        what = self.what

        with pi:
            pi.new_operation('Finding matching segments', extent=iter_count)
            for iiter, n_iter in enumerate(range(iter_start, iter_stop)):
                assignments = numpy.require(assignments_ds[h5io.get_iteration_entry(assignments_ds, n_iter)
                                                           + numpy.index_exp[:,timepoint]], dtype=westpa.binning.index_dtype)
                all_weights = self.data_reader.get_iter_group(n_iter)['seg_index']['weight']

                # the following Cython function just executes this loop:
                #for iseg in xrange(nsegs[iiter]):
                #    segs_by_bin[iseg,assignments[iseg]] = True
                segs_by_bin = assignments_list_to_table(nsegs[iiter],nbins,assignments)
                for ibin in range(nbins):
                    segs = numpy.nonzero(segs_by_bin[:,ibin])[0]

                    seg_count_ds[iiter,ibin] = min(len(segs),count)

                    if len(segs):
                        weights = all_weights.take(segs)

                        if what == 'lowweight':
                            indices = numpy.argsort(weights)[:count]
                        elif what == 'highweight':
                            indices = numpy.argsort(weights)[::-1][:count]
                        else:
                            assert what == 'random'
                            # random selection must still honor --count
                            indices = numpy.random.permutation(len(weights))[:count]

                        matching_segs_ds[iiter,ibin,:len(segs)] = segs.take(indices)
                        weights_ds[iiter,ibin,:len(segs)] = weights.take(indices)
                        del segs, weights

                del assignments, segs_by_bin, all_weights
                pi.progress += 1
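
The w_ntop output described above reserves --count slots along the segment axis, of
which only the first ``nsegs`` entries per iteration and bin are filled. A short sketch
of reading the output with h5py (the file name and indices are examples):

import h5py

with h5py.File('ntop.h5', 'r') as ntop:
    n_iters = ntop['n_iter'][...]      # iteration numbers
    nsegs = ntop['nsegs'][...]         # [iteration][bin] count of valid entries
    seg_ids = ntop['seg_ids'][...]     # [iteration][bin][segment]
    weights = ntop['weights'][...]

    # matching segments in bin 0 of the first stored iteration
    nvalid = nsegs[0, 0]
    print(n_iters[0], seg_ids[0, 0, :nvalid], weights[0, 0, :nvalid])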
Example #52
0
class KinAvgSubcommands(WESTSubcommand):
    '''Common argument processing for w_kinavg subcommands'''
    def __init__(self, parent):
        super(KinAvgSubcommands, self).__init__(parent)

        self.data_reader = WESTDataReader()
        self.iter_range = IterRangeSelection()
        self.progress = ProgressIndicatorComponent()

        self.output_filename = None
        self.kinetics_filename = None
        self.assignment_filename = None

        self.output_file = None
        self.assignments_file = None
        self.kinetics_file = None

        self.evolution_mode = None

        self.mcbs_alpha = None
        self.mcbs_acalpha = None
        self.mcbs_nsets = None

    def stamp_mcbs_info(self, dataset):
        dataset.attrs['mcbs_alpha'] = self.mcbs_alpha
        dataset.attrs['mcbs_acalpha'] = self.mcbs_acalpha
        dataset.attrs['mcbs_nsets'] = self.mcbs_nsets

    def add_args(self, parser):
        self.progress.add_args(parser)
        self.data_reader.add_args(parser)
        self.iter_range.include_args['iter_step'] = True
        self.iter_range.add_args(parser)

        iogroup = parser.add_argument_group('input/output options')
        iogroup.add_argument(
            '-a',
            '--assignments',
            default='assign.h5',
            help='''Bin assignments and macrostate definitions are in ASSIGNMENTS
                            (default: %(default)s).''')

        # self.default_kinetics_file will be picked up as a class attribute from the appropriate subclass
        iogroup.add_argument(
            '-k',
            '--kinetics',
            default=self.default_kinetics_file,
            help='''Populations and transition rates are stored in KINETICS
                            (default: %(default)s).''')
        iogroup.add_argument(
            '-o',
            '--output',
            dest='output',
            default='kinavg.h5',
            help='''Store results in OUTPUT (default: %(default)s).''')

        cgroup = parser.add_argument_group(
            'confidence interval calculation options')
        cgroup.add_argument('--alpha',
                            type=float,
                            default=0.05,
                            help='''Calculate a (1-ALPHA) confidence interval
                             (default: %(default)s)''')
        cgroup.add_argument(
            '--autocorrel-alpha',
            type=float,
            dest='acalpha',
            metavar='ACALPHA',
            help='''Evaluate autocorrelation to (1-ACALPHA) significance.
                             Note that too small an ACALPHA will result in failure to detect autocorrelation
                             in a noisy flux signal. (Default: same as ALPHA.)'''
        )
        cgroup.add_argument(
            '--nsets',
            type=int,
            help=
            '''Use NSETS samples for bootstrapping (default: chosen based on ALPHA)'''
        )

        cogroup = parser.add_argument_group('calculation options')
        cogroup.add_argument(
            '-e',
            '--evolution-mode',
            choices=['cumulative', 'blocked', 'none'],
            default='none',
            help='''How to calculate time evolution of rate estimates.
                             ``cumulative`` evaluates rates over windows starting with --start-iter and getting progressively
                             wider to --stop-iter by steps of --step-iter.
                             ``blocked`` evaluates rates over windows of width --step-iter, the first of which begins at
                             --start-iter.
                             ``none`` (the default) disables calculation of the time evolution of rate estimates.'''
        )
        cogroup.add_argument(
            '--window-frac',
            type=float,
            default=1.0,
            help=
            '''Fraction of iterations to use in each window when running in ``cumulative`` mode.
                             The (1 - frac) fraction of iterations will be discarded from the start of each window.'''
        )

    def open_files(self):
        self.output_file = h5io.WESTPAH5File(self.output_filename,
                                             'w',
                                             creating_program=True)
        h5io.stamp_creator_data(self.output_file)
        self.assignments_file = h5io.WESTPAH5File(
            self.assignments_filename,
            'r')  #, driver='core', backing_store=False)
        self.kinetics_file = h5io.WESTPAH5File(
            self.kinetics_filename,
            'r')  #, driver='core', backing_store=False)
        if not self.iter_range.check_data_iter_range_least(
                self.assignments_file):
            raise ValueError(
                'assignments data do not span the requested iterations')

        if not self.iter_range.check_data_iter_range_least(self.kinetics_file):
            raise ValueError(
                'kinetics data do not span the requested iterations')

    def process_args(self, args):
        self.progress.process_args(args)
        self.data_reader.process_args(args)
        with self.data_reader:
            self.iter_range.process_args(args, default_iter_step=None)
        if self.iter_range.iter_step is None:
            #use about 10 blocks by default
            self.iter_range.iter_step = max(
                1,
                (self.iter_range.iter_stop - self.iter_range.iter_start) // 10)

        self.output_filename = args.output
        self.assignments_filename = args.assignments
        self.kinetics_filename = args.kinetics

        self.mcbs_alpha = args.alpha
        self.mcbs_acalpha = args.acalpha if args.acalpha else self.mcbs_alpha
        self.mcbs_nsets = args.nsets if args.nsets else mclib.get_bssize(
            self.mcbs_alpha)

        self.evolution_mode = args.evolution_mode
        self.evol_window_frac = args.window_frac
        if self.evol_window_frac <= 0 or self.evol_window_frac > 1:
            raise ValueError(
                'Parameter error -- fractional window defined by --window-frac must be in (0,1]'
            )
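
The ``cumulative`` and ``blocked`` evolution modes described in the help text above
differ only in where each averaging window starts; ``--window-frac`` optionally keeps
only the most recent fraction of a cumulative window. A small sketch of that window
bookkeeping (it mirrors the logic used in the reweighting tool below, with example
values; it is not the tool's actual implementation):

def evolution_windows(start_iter, stop_iter, step_iter,
                      mode='cumulative', window_frac=1.0):
    '''Yield (block_start, stop) iteration pairs for each evolution block.'''
    for start in range(start_iter, stop_iter, step_iter):
        stop = min(start + step_iter, stop_iter)
        if mode == 'cumulative':
            # window grows from start_iter toward stop, optionally truncated
            # to the most recent window_frac of its iterations
            windowsize = max(1, int(window_frac * (stop - start_iter)))
            block_start = max(start_iter, stop - windowsize)
        else:  # 'blocked': fixed-width, non-overlapping windows
            block_start = start
        yield block_start, stop

# e.g. list(evolution_windows(1, 101, 10)) ->
#      [(1, 11), (1, 21), ..., (1, 101)]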
Example #53
0
class WPostAnalysisReweightTool(WESTTool):
    prog = 'w_postanalysis_reweight'
    description = '''\
Calculate average rates from weighted ensemble data using the postanalysis
reweighting scheme. A bin assignment file (usually "assign.h5", generated by
w_assign) and pre-calculated per-iteration flux matrices (usually
"flux_matrices.h5", generated by w_postanalysis_matrix) are required as input
(see "w_assign --help" and "w_postanalysis_matrix --help" for information on
generating these files).


-----------------------------------------------------------------------------
Output format
-----------------------------------------------------------------------------

The output file (-o/--output, usually "kinrw.h5") contains the following
datasets:

  /state_prob_evolution [window,state]
    The reweighted state populations based on windows

  /color_prob_evolution [window,state]
    The reweighted populations last assigned to each state based on windows

  /bin_prob_evolution [window, bin]
    The reweighted populations of each bin based on windows. Bins contain
    one color each, so to recover the original un-colored spatial bins,
    one must sum over all states.

  /conditional_flux_evolution [window,state,state]
    (Structured -- see below). State-to-state fluxes based on windows of
    varying width
    
The structure of the final dataset is as follows:

  iter_start
    (Integer) Iteration at which the averaging window begins (inclusive).
    
  iter_stop
    (Integer) Iteration at which the averaging window ends (exclusive).
    
  expected
    (Floating-point) Expected (mean) value of the rate as evaluated within
    this window, in units of inverse tau.


-----------------------------------------------------------------------------
Command-line options
-----------------------------------------------------------------------------
'''

    def __init__(self):
        super(WPostAnalysisReweightTool, self).__init__()

        self.data_reader = WESTDataReader()
        self.iter_range = IterRangeSelection()
        self.progress = ProgressIndicatorComponent()

        self.output_filename = None
        self.kinetics_filename = None
        self.assignment_filename = None

        self.output_file = None
        self.assignments_file = None
        self.kinetics_file = None

        self.evolution_mode = None

    def add_args(self, parser):
        self.progress.add_args(parser)
        self.data_reader.add_args(parser)
        self.iter_range.include_args['iter_step'] = True
        self.iter_range.add_args(parser)

        iogroup = parser.add_argument_group('input/output options')
        iogroup.add_argument(
            '-a',
            '--assignments',
            default='assign.h5',
            help='''Bin assignments and macrostate definitions are in ASSIGNMENTS
                            (default: %(default)s).''')

        iogroup.add_argument(
            '-k',
            '--kinetics',
            default='flux_matrices.h5',
            help=
            '''Per-iteration flux matrices calculated by w_postanalysis_matrix 
                            (default: %(default)s).''')
        iogroup.add_argument(
            '-o',
            '--output',
            dest='output',
            default='kinrw.h5',
            help='''Store results in OUTPUT (default: %(default)s).''')

        cogroup = parser.add_argument_group('calculation options')
        cogroup.add_argument(
            '-e',
            '--evolution-mode',
            choices=['cumulative', 'blocked'],
            default='cumulative',
            help='''How to calculate time evolution of rate estimates.
                             ``cumulative`` evaluates rates over windows starting with --start-iter and getting progressively
                             wider to --stop-iter by steps of --step-iter.
                             ``blocked`` evaluates rates over windows of width --step-iter, the first of which begins at
                             --start-iter.''')
        cogroup.add_argument(
            '--window-frac',
            type=float,
            default=1.0,
            help=
            '''Fraction of iterations to use in each window when running in ``cumulative`` mode.
                             The (1 - frac) fraction of iterations will be discarded from the start of each window.'''
        )

        cogroup.add_argument(
            '--obs-threshold',
            type=int,
            default=1,
            help=
            '''The minimum number of observed transitions between two states i and j necessary to include
                             fluxes in the reweighting estimate''')

    def open_files(self):
        self.output_file = h5io.WESTPAH5File(self.output_filename,
                                             'w',
                                             creating_program=True)
        h5io.stamp_creator_data(self.output_file)
        self.assignments_file = h5io.WESTPAH5File(
            self.assignments_filename,
            'r')  #, driver='core', backing_store=False)
        self.kinetics_file = h5io.WESTPAH5File(
            self.kinetics_filename,
            'r')  #, driver='core', backing_store=False)
        if not self.iter_range.check_data_iter_range_least(
                self.assignments_file):
            raise ValueError(
                'assignments data do not span the requested iterations')

        if not self.iter_range.check_data_iter_range_least(self.kinetics_file):
            raise ValueError(
                'kinetics data do not span the requested iterations')

    def process_args(self, args):
        self.progress.process_args(args)
        self.data_reader.process_args(args)
        with self.data_reader:
            self.iter_range.process_args(args, default_iter_step=None)
        if self.iter_range.iter_step is None:
            #use about 10 blocks by default
            self.iter_range.iter_step = max(
                1,
                (self.iter_range.iter_stop - self.iter_range.iter_start) // 10)

        self.output_filename = args.output
        self.assignments_filename = args.assignments
        self.kinetics_filename = args.kinetics

        self.evolution_mode = args.evolution_mode
        self.evol_window_frac = args.window_frac
        if self.evol_window_frac <= 0 or self.evol_window_frac > 1:
            raise ValueError(
                'Parameter error -- fractional window defined by --window-frac must be in (0,1]'
            )
        self.obs_threshold = args.obs_threshold

    def go(self):
        pi = self.progress.indicator
        with pi:
            pi.new_operation('Initializing')
            self.open_files()
            nstates = self.assignments_file.attrs['nstates']
            nbins = self.assignments_file.attrs['nbins']
            state_labels = self.assignments_file['state_labels'][...]
            state_map = self.assignments_file['state_map'][...]
            nfbins = self.kinetics_file.attrs['nrows']
            npts = self.kinetics_file.attrs['npts']

            assert nstates == len(state_labels)
            assert nfbins == nbins * nstates

            start_iter, stop_iter, step_iter = self.iter_range.iter_start, self.iter_range.iter_stop, self.iter_range.iter_step

            start_pts = range(start_iter, stop_iter, step_iter)
            flux_evol = np.zeros((len(start_pts), nstates, nstates),
                                 dtype=ci_dtype)
            color_prob_evol = np.zeros((len(start_pts), nstates))
            state_prob_evol = np.zeros((len(start_pts), nstates))
            bin_prob_evol = np.zeros((len(start_pts), nfbins))
            pi.new_operation('Calculating flux evolution', len(start_pts))

            if self.evolution_mode == 'cumulative' and self.evol_window_frac == 1.0:
                print('Using fast streaming accumulation')

                total_fluxes = np.zeros((nfbins, nfbins), weight_dtype)
                total_obs = np.zeros((nfbins, nfbins), np.int64)

                for iblock, start in enumerate(start_pts):
                    pi.progress += 1
                    stop = min(start + step_iter, stop_iter)

                    params = dict(start=start,
                                  stop=stop,
                                  nstates=nstates,
                                  nbins=nbins,
                                  state_labels=state_labels,
                                  state_map=state_map,
                                  nfbins=nfbins,
                                  total_fluxes=total_fluxes,
                                  total_obs=total_obs,
                                  h5file=self.kinetics_file,
                                  obs_threshold=self.obs_threshold)

                    rw_state_flux, rw_color_probs, rw_state_probs, rw_bin_probs, rw_bin_flux = reweight(
                        **params)
                    for k in range(nstates):
                        for j in range(nstates):
                            # Normalize such that we report the flux per tau (tau being the weighted ensemble iteration)
                            # npts always includes a 0th time point
                            flux_evol[iblock]['expected'][
                                k, j] = rw_state_flux[k, j] * (npts - 1)
                            flux_evol[iblock]['iter_start'][k, j] = start
                            flux_evol[iblock]['iter_stop'][k, j] = stop

                    color_prob_evol[iblock] = rw_color_probs
                    state_prob_evol[iblock] = rw_state_probs[:-1]
                    bin_prob_evol[iblock] = rw_bin_probs

            else:
                for iblock, start in enumerate(start_pts):
                    pi.progress += 1

                    stop = min(start + step_iter, stop_iter)
                    if self.evolution_mode == 'cumulative':
                        windowsize = max(
                            1,
                            int(self.evol_window_frac * (stop - start_iter)))
                        block_start = max(start_iter, stop - windowsize)
                    else:  # self.evolution_mode == 'blocked'
                        block_start = start

                    params = dict(start=block_start,
                                  stop=stop,
                                  nstates=nstates,
                                  nbins=nbins,
                                  state_labels=state_labels,
                                  state_map=state_map,
                                  nfbins=nfbins,
                                  total_fluxes=None,
                                  total_obs=None,
                                  h5file=self.kinetics_file)

                    rw_state_flux, rw_color_probs, rw_state_probs, rw_bin_probs, rw_bin_flux = reweight(
                        **params)
                    for k in range(nstates):
                        for j in range(nstates):
                            # Normalize such that we report the flux per tau (tau being the weighted ensemble iteration)
                            # npts always includes a 0th time point
                            flux_evol[iblock]['expected'][
                                k, j] = rw_state_flux[k, j] * (npts - 1)
                            flux_evol[iblock]['iter_start'][k, j] = start
                            flux_evol[iblock]['iter_stop'][k, j] = stop

                    color_prob_evol[iblock] = rw_color_probs
                    state_prob_evol[iblock] = rw_state_probs[:-1]
                    bin_prob_evol[iblock] = rw_bin_probs

            ds_flux_evol = self.output_file.create_dataset(
                'conditional_flux_evolution',
                data=flux_evol,
                shuffle=True,
                compression=9)
            ds_state_prob_evol = self.output_file.create_dataset(
                'state_prob_evolution', data=state_prob_evol, compression=9)
            ds_color_prob_evol = self.output_file.create_dataset(
                'color_prob_evolution', data=color_prob_evol, compression=9)
            ds_bin_prob_evol = self.output_file.create_dataset(
                'bin_prob_evolution', data=bin_prob_evol, compression=9)
            ds_state_labels = self.output_file.create_dataset(
                'state_labels', data=state_labels)
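
The ``conditional_flux_evolution`` dataset written above is a structured array indexed
as [window, state, state], with the fields described in the output-format section
(``iter_start``, ``iter_stop``, ``expected``, ...). A brief sketch of pulling one
state-to-state flux out of the output file (the file name and state indices are
examples):

import h5py

with h5py.File('kinrw.h5', 'r') as kinrw:
    cfe = kinrw['conditional_flux_evolution'][...]
    labels = [lbl.decode() if isinstance(lbl, bytes) else lbl
              for lbl in kinrw['state_labels'][...]]

    i, j = 0, 1  # example state pair
    for window in cfe:
        print('iterations {}-{}: flux {} -> {} = {:.3e} per tau'.format(
            window['iter_start'][i, j], window['iter_stop'][i, j],
            labels[i], labels[j], window['expected'][i, j]))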
Example #54
0
class KinAvgSubcommands(WESTSubcommand):
    '''Common argument processing for w_kinavg subcommands'''
    
    def __init__(self, parent):
        super(KinAvgSubcommands,self).__init__(parent)
        
        self.data_reader = WESTDataReader()
        self.iter_range = IterRangeSelection()
        self.progress = ProgressIndicatorComponent()
        
        self.output_filename = None
        self.kinetics_filename = None
        self.assignment_filename = None
        
        self.output_file = None
        self.assignments_file = None
        self.kinetics_file = None
        
        self.evolution_mode = None
        
        self.mcbs_alpha = None
        self.mcbs_acalpha = None
        self.mcbs_nsets = None
        
    def stamp_mcbs_info(self, dataset):
        dataset.attrs['mcbs_alpha'] = self.mcbs_alpha
        dataset.attrs['mcbs_acalpha'] = self.mcbs_acalpha
        dataset.attrs['mcbs_nsets'] = self.mcbs_nsets
        
            
    def add_args(self, parser):
        self.progress.add_args(parser)
        self.data_reader.add_args(parser)
        self.iter_range.include_args['iter_step'] = True
        self.iter_range.add_args(parser)

        iogroup = parser.add_argument_group('input/output options')
        iogroup.add_argument('-a', '--assignments', default='assign.h5',
                            help='''Bin assignments and macrostate definitions are in ASSIGNMENTS
                            (default: %(default)s).''')
        
        # self.default_kinetics_file will be picked up as a class attribute from the appropriate subclass        
        iogroup.add_argument('-k', '--kinetics', default=self.default_kinetics_file,
                            help='''Populations and transition rates are stored in KINETICS
                            (default: %(default)s).''')
        iogroup.add_argument('-o', '--output', dest='output', default='kinavg.h5',
                            help='''Store results in OUTPUT (default: %(default)s).''')

        
        cgroup = parser.add_argument_group('confidence interval calculation options')
        cgroup.add_argument('--alpha', type=float, default=0.05, 
                             help='''Calculate a (1-ALPHA) confidence interval
                             (default: %(default)s)''')
        cgroup.add_argument('--autocorrel-alpha', type=float, dest='acalpha', metavar='ACALPHA',
                             help='''Evaluate autocorrelation to (1-ACALPHA) significance.
                             Note that too small an ACALPHA will result in failure to detect autocorrelation
                             in a noisy flux signal. (Default: same as ALPHA.)''')
        cgroup.add_argument('--nsets', type=int,
                             help='''Use NSETS samples for bootstrapping (default: chosen based on ALPHA)''')
        
        cogroup = parser.add_argument_group('calculation options')
        cogroup.add_argument('-e', '--evolution-mode', choices=['cumulative', 'blocked', 'none'], default='none',
                             help='''How to calculate time evolution of rate estimates.
                             ``cumulative`` evaluates rates over windows starting with --start-iter and getting progressively
                             wider to --stop-iter by steps of --step-iter.
                             ``blocked`` evaluates rates over windows of width --step-iter, the first of which begins at
                             --start-iter.
                             ``none`` (the default) disables calculation of the time evolution of rate estimates.''')
        cogroup.add_argument('--window-frac', type=float, default=1.0,
                             help='''Fraction of iterations to use in each window when running in ``cumulative`` mode.
                             The (1 - frac) fraction of iterations will be discarded from the start of each window.''')
        
    def open_files(self):
        self.output_file = h5io.WESTPAH5File(self.output_filename, 'w', creating_program=True)
        h5io.stamp_creator_data(self.output_file)
        self.assignments_file = h5io.WESTPAH5File(self.assignments_filename, 'r')#, driver='core', backing_store=False)
        self.kinetics_file = h5io.WESTPAH5File(self.kinetics_filename, 'r')#, driver='core', backing_store=False)
        if not self.iter_range.check_data_iter_range_least(self.assignments_file):
            raise ValueError('assignments data do not span the requested iterations')

        if not self.iter_range.check_data_iter_range_least(self.kinetics_file):
            raise ValueError('kinetics data do not span the requested iterations')

    
    def process_args(self, args):
        self.progress.process_args(args)
        self.data_reader.process_args(args)
        with self.data_reader:
            self.iter_range.process_args(args, default_iter_step=None)
        if self.iter_range.iter_step is None:
            #use about 10 blocks by default
            self.iter_range.iter_step = max(1, (self.iter_range.iter_stop - self.iter_range.iter_start) // 10)
        
        self.output_filename = args.output
        self.assignments_filename = args.assignments
        self.kinetics_filename = args.kinetics
                
        self.mcbs_alpha = args.alpha
        self.mcbs_acalpha = args.acalpha if args.acalpha else self.mcbs_alpha
        self.mcbs_nsets = args.nsets if args.nsets else mclib.get_bssize(self.mcbs_alpha)
        
        self.evolution_mode = args.evolution_mode
        self.evol_window_frac = args.window_frac
        if self.evol_window_frac <= 0 or self.evol_window_frac > 1:
            raise ValueError('Parameter error -- fractional window defined by --window-frac must be in (0,1]')
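For reference, a minimal sketch of the window arithmetic described by the
-e/--evolution-mode and --window-frac options above (this is an illustration of
the help text, not the actual w_kinavg implementation; the function name and
the example numbers are made up):

def evolution_windows(iter_start, iter_stop, iter_step, mode='cumulative', window_frac=1.0):
    '''Yield (start, stop) iteration windows (stop exclusive) for rate-evolution estimates.'''
    for stop in range(iter_start + iter_step, iter_stop + 1, iter_step):
        if mode == 'cumulative':
            # Windows grow from iter_start toward iter_stop; --window-frac
            # discards the leading (1 - frac) portion of each window.
            start = stop - max(1, int(window_frac * (stop - iter_start)))
        else:  # 'blocked'
            start = stop - iter_step
        yield start, stop

# With iter_start=1, iter_stop=41, iter_step=10:
#   cumulative -> (1, 11), (1, 21), (1, 31), (1, 41)
#   blocked    -> (1, 11), (11, 21), (21, 31), (31, 41)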
Example #55
class WSelectTool(WESTParallelTool):
    prog='w_select'
    description = '''\
Select dynamics segments matching various criteria. This requires a
user-provided predicate function (an illustrative predicate module is sketched
after this class). By default, only matching segments are
stored. If the -a/--include-ancestors option is given, then matching segments
and their ancestors will be stored.


-----------------------------------------------------------------------------
Predicate function
-----------------------------------------------------------------------------

Segments are selected based on a predicate function, which must be callable
as ``predicate(n_iter, iter_group)`` and return a collection of segment IDs
matching the predicate in that iteration.

The predicate may be inverted by specifying the -v/--invert command-line
argument.


-----------------------------------------------------------------------------
Output format
-----------------------------------------------------------------------------

The output file (-o/--output, by default "select.h5") contains the following
datasets:

  ``/n_iter`` [iteration]
    *(Integer)* Iteration numbers for each entry in other datasets.

  ``/n_segs`` [iteration]
    *(Integer)* Number of segment IDs matching the predicate (or inverted
    predicate, if -v/--invert is specified) in the given iteration.

  ``/seg_ids`` [iteration][segment]
    *(Integer)* Matching segments in each iteration. For the iteration stored
    at row ``i``, only the first ``n_segs[i]`` entries are valid. For example,
    the full list of matching seg_ids in the first stored iteration is
    ``seg_ids[0][:n_segs[0]]``.

  ``/weights`` [iteration][segment]
    *(Floating-point)* Weights for each matching segment in ``/seg_ids``.


-----------------------------------------------------------------------------
Command-line arguments
-----------------------------------------------------------------------------
'''

    def __init__(self):
        super(WSelectTool,self).__init__()

        self.data_reader = WESTDataReader()
        self.iter_range = IterRangeSelection()
        self.progress = ProgressIndicatorComponent()
        self.output_file = None
        self.output_filename = None
        self.predicate = None
        self.invert = False
        self.include_ancestors = False

    def add_args(self, parser):
        self.data_reader.add_args(parser)
        self.iter_range.add_args(parser)

        sgroup = parser.add_argument_group('selection options')
        sgroup.add_argument('-p', '--predicate-function', metavar='MODULE.FUNCTION',
                             help='''Use the given predicate function to match segments. This function
                             should take an iteration number and the HDF5 group corresponding to that
                             iteration and return a sequence of seg_ids matching the predicate, as in
                             ``match_predicate(n_iter, iter_group)``.''')
        sgroup.add_argument('-v', '--invert', dest='invert', action='store_true',
                            help='''Invert the match predicate.''')
        sgroup.add_argument('-a', '--include-ancestors', action ='store_true',
                            help='''Include ancestors of matched segments in output.''')

        ogroup = parser.add_argument_group('output options')
        ogroup.add_argument('-o', '--output', default='select.h5',
                            help='''Write output to OUTPUT (default: %(default)s).''')
        self.progress.add_args(parser)

    def process_args(self, args):
        self.progress.process_args(args)
        self.data_reader.process_args(args)
        with self.data_reader:
            self.iter_range.process_args(args)

        predicate = get_object(args.predicate_function,path=['.'])
        if not callable(predicate):
            raise TypeError('predicate object {!r} is not callable'.format(predicate))
        self.predicate = predicate
        self.invert = bool(args.invert)
        self.include_ancestors = bool(args.include_ancestors)
        self.output_filename = args.output

    def go(self):
        self.data_reader.open('r')
        output_file = h5io.WESTPAH5File(self.output_filename, mode='w')
        pi = self.progress.indicator

        iter_start, iter_stop = self.iter_range.iter_start, self.iter_range.iter_stop
        iter_count = iter_stop - iter_start

        output_file.create_dataset('n_iter', dtype=n_iter_dtype, data=range(iter_start,iter_stop))
        current_seg_count = 0
        seg_count_ds = output_file.create_dataset('n_segs', dtype=numpy.uint, shape=(iter_count,))
        matching_segs_ds = output_file.create_dataset('seg_ids', shape=(iter_count,0), maxshape=(iter_count,None),
                                                      dtype=seg_id_dtype,
                                                      chunks=h5io.calc_chunksize((iter_count,1000000), seg_id_dtype),
                                                      shuffle=True, compression=9)
        weights_ds = output_file.create_dataset('weights', shape=(iter_count,0), maxshape=(iter_count,None),
                                                dtype=weight_dtype,
                                                chunks=h5io.calc_chunksize((iter_count,1000000), weight_dtype),
                                                shuffle=True,compression=9)

        with pi:
            pi.new_operation('Finding matching segments', extent=iter_count)
#             futures = set()
#             for n_iter in xrange(iter_start,iter_stop):
#                 futures.add(self.work_manager.submit(_find_matching_segments, 
#                                                      args=(self.data_reader.we_h5filename,n_iter,self.predicate,self.invert)))

#             for future in self.work_manager.as_completed(futures):
            for future in self.work_manager.submit_as_completed(((_find_matching_segments,
                                                                  (self.data_reader.we_h5filename,n_iter,self.predicate,self.invert),
                                                                  {}) for n_iter in xrange(iter_start,iter_stop)),
                                                                self.max_queue_len):
                n_iter, matching_ids = future.get_result()
                n_matches = len(matching_ids)

                if n_matches:
                    if n_matches > current_seg_count:
                        matching_segs_ds.resize((iter_count,n_matches))
                        weights_ds.resize((iter_count,n_matches))
                        current_seg_count = n_matches

                    seg_count_ds[n_iter-iter_start] = n_matches
                    matching_segs_ds[n_iter-iter_start,:n_matches] = matching_ids
                    weights_ds[n_iter-iter_start,:n_matches] = self.data_reader.get_iter_group(n_iter)['seg_index']['weight'][sorted(matching_ids)]
                del matching_ids
                pi.progress += 1

            if self.include_ancestors:
                pi.new_operation('Tracing ancestors of matching segments', extent=iter_count)
                from_previous = set()
                current_seg_count = matching_segs_ds.shape[1]
                for n_iter in xrange(iter_stop-1, iter_start-1, -1):
                    iiter = n_iter - iter_start
                    n_matches = seg_count_ds[iiter]
                    matching_ids = set(from_previous)
                    if n_matches:
                        matching_ids.update(matching_segs_ds[iiter, :seg_count_ds[iiter]])
                    from_previous.clear()

                    n_matches = len(matching_ids)
                    if n_matches > current_seg_count:
                        matching_segs_ds.resize((iter_count,n_matches))
                        weights_ds.resize((iter_count,n_matches))
                        current_seg_count = n_matches

                    if n_matches > 0:
                        seg_count_ds[iiter] = n_matches
                        matching_ids = sorted(matching_ids)
                        matching_segs_ds[iiter,:n_matches] = matching_ids
                        weights_ds[iiter,:n_matches] = self.data_reader.get_iter_group(n_iter)['seg_index']['weight'][matching_ids]
                        parent_ids = self.data_reader.get_iter_group(n_iter)['seg_index']['parent_id'][matching_ids]
                        from_previous.update(parent_id for parent_id in parent_ids if parent_id >= 0) # filter initial states
                        del parent_ids
                    del matching_ids
                    pi.progress += 1
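As a companion to the w_select docstring above, here is a minimal sketch of a
user-supplied predicate module (the module and function names and the weight
threshold are made up; only the ``predicate(n_iter, iter_group)`` signature and
the ``seg_index['weight']`` layout follow the conventions documented above):

# my_predicates.py -- used as, e.g., ``w_select -p my_predicates.low_weight``
import numpy

def low_weight(n_iter, iter_group, threshold=1e-12):
    '''Return seg_ids of walkers in iteration ``n_iter`` whose weight is below
    ``threshold``; ``iter_group`` is that iteration's HDF5 group.'''
    weights = iter_group['seg_index']['weight']
    return numpy.flatnonzero(weights < threshold)

# Reading the output back (dataset names as documented above):
#   with h5py.File('select.h5', 'r') as f:
#       first_iter_ids = f['seg_ids'][0][:f['n_segs'][0]]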
Example #56
class WBinTool(WESTTool):
    prog = 'w_bins'
    description = '''\
Display information and statistics about binning in a WEST simulation, or
modify the binning for the current iteration of a WEST simulation.        
-------------------------------------------------------------------------------
'''

    def __init__(self):
        super(WBinTool, self).__init__()
        self.subcommand = None
        self.data_reader = WESTDataReader()
        self.binning = BinMappingComponent()
        self.args = None
        self.n_iter = None

    # Interface for command-line tools
    def add_args(self, parser):
        self.data_reader.add_args(parser)

        subparsers = parser.add_subparsers(help='available commands')

        info_parser = subparsers.add_parser(
            'info', help='Display information about binning.')
        info_parser.add_argument(
            '-n',
            '--n-iter',
            type=int,
            help=
            '''Consider initial points of segments in iteration N_ITER (default: current iteration).'''
        )
        info_parser.add_argument(
            '--detail',
            action='store_true',
            help='''Display detailed per-bin information in addition to summary
                                 information.''')
        self.binning.add_args(info_parser)
        info_parser.set_defaults(func=self.cmd_info)

        rebin_parser = subparsers.add_parser(
            'rebin', help='Rebuild current iteration with new binning.')
        rebin_parser.add_argument(
            '--confirm',
            action='store_true',
            help=
            '''Commit the revised iteration to HDF5; without this option, the effects of the
                                  new binning are only calculated and printed.'''
        )
        rebin_parser.add_argument(
            '--detail',
            action='store_true',
            help='''Display detailed per-bin information in addition to summary
                                     information.''')
        self.binning.add_args(rebin_parser, suppress=['--bins-from-file'])
        self.binning.add_target_count_args(rebin_parser)
        rebin_parser.set_defaults(func=self.cmd_rebin)

    def process_args(self, args):

        self.data_reader.process_args(args)
        self.data_reader.open(mode='r+')
        self.n_iter = getattr(args, 'n_iter',
                              None) or self.data_reader.current_iteration

        # we cannot read bin information during rebins
        # interesting note: '==' is required here; 'is' fails
        if args.func == self.cmd_rebin:
            self.binning.target_counts_required = True
        else:
            self.binning.set_we_h5file_info(self.n_iter, self.data_reader)

        self.binning.process_args(args)

        self.args = args
        self.subcommand = args.func

    def go(self):
        self.subcommand()

    def cmd_info(self):
        mapper = self.binning.mapper

        # Get target states and their assignments
        target_states = self.data_reader.get_target_states(self.n_iter)
        n_target_states = len(target_states)

        iter_group = self.data_reader.get_iter_group(self.n_iter)

        # bin initial pcoords for iteration n_iter
        initial_pcoords = iter_group['pcoord'][:, 0, :]
        assignments = mapper.assign(initial_pcoords)
        del initial_pcoords

        print('Bin information for iteration {:d}'.format(self.n_iter))

        # Get bin counts and weights
        weights = iter_group['seg_index']['weight']

        write_bin_info(mapper,
                       assignments,
                       weights,
                       n_target_states,
                       detailed=self.args.detail)

    def cmd_rebin(self):
        mapper = self.binning.mapper
        assert mapper is not None
        if self.n_iter == 1:
            sys.stderr.write(
                'rebin is not supported for the first iteration; reinitialize with w_init instead\n'
            )
            sys.exit(1)
        n_target_states = len(self.data_reader.get_target_states(self.n_iter))
        we_driver = westpa.rc.get_we_driver()
        data_manager = self.data_reader.data_manager

        segments = data_manager.get_segments(self.n_iter, load_pcoords=True)
        last_iter_segments = data_manager.get_segments(self.n_iter - 1,
                                                       load_pcoords=False,
                                                       load_auxdata=False)

        # Bin on this iteration's initial points
        # We don't have to worry about recycling because we are binning on
        # initial points rather than final points, so recycling has already
        # occurred for this iteration.
        # We do need initial states, in case we merge a newly-created walker out of existence
        #avail_initial_states = {state.state_id: state
        #                        for state in data_manager.get_unused_initial_states(n_iter = self.n_iter)}
        avail_initial_states = data_manager.get_unused_initial_states(
            n_iter=self.n_iter)
        used_initial_states = data_manager.get_segment_initial_states(segments)
        we_driver.new_iteration(
            initial_states=avail_initial_states,
            bin_mapper=mapper,
            bin_target_counts=self.binning.bin_target_counts)
        we_driver.used_initial_states = {
            state.state_id: state
            for state in used_initial_states
        }
        we_driver.assign(segments, initializing=True)
        we_driver.rebin_current(parent_segments=last_iter_segments)

        weights = numpy.array(
            [segment.weight for segment in we_driver.next_iter_segments])
        assignments = numpy.fromiter(we_driver.next_iter_assignments,
                                     dtype=int,
                                     count=len(weights))
        write_bin_info(mapper,
                       assignments,
                       weights,
                       n_target_states,
                       detailed=self.args.detail)

        if self.args.confirm:
            data_manager.prepare_iteration(self.n_iter,
                                           list(we_driver.next_iter_segments))

            # manually update endpoint statuses only
            endpoint_types = sorted([(segment.seg_id, segment.endpoint_type)
                                     for segment in last_iter_segments])
            last_iter_group = data_manager.get_iter_group(self.n_iter - 1)
            last_iter_index = last_iter_group['seg_index'][...]
            last_iter_index['endpoint_type'] = [
                pair[1] for pair in endpoint_types
            ]
            last_iter_group['seg_index'][...] = last_iter_index

            data_manager.save_iter_binning(self.n_iter,
                                           self.binning.mapper_hash,
                                           self.binning.mapper_pickle,
                                           we_driver.bin_target_counts)
            data_manager.update_initial_states(we_driver.all_initial_states)
            data_manager.flush_backing()
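# ---------------------------------------------------------------------------
# Illustrative sketch only (not part of w_bins): toy binning of initial pcoords
# and per-bin count/weight tallies, mirroring what cmd_info reports through
# write_bin_info.  Plain numpy (digitize/bincount) stands in for a WESTPA bin
# mapper; the boundaries and data below are invented.
# ---------------------------------------------------------------------------
import numpy

# Toy stand-ins for what cmd_info reads from the HDF5 file: one initial pcoord
# value and one weight per walker.
initial_pcoords = numpy.array([0.3, 1.2, 2.7, 0.9, 1.8])
weights = numpy.array([0.2, 0.2, 0.2, 0.2, 0.2])

# Hypothetical 1-D rectilinear bin boundaries; a real run would instead get
# assignments from self.binning.mapper.assign(initial_pcoords).
boundaries = numpy.array([0.0, 1.0, 2.0, 3.0])
assignments = numpy.digitize(initial_pcoords, boundaries) - 1

n_bins = len(boundaries) - 1
bin_counts = numpy.bincount(assignments, minlength=n_bins)                    # -> [2 2 1]
bin_weights = numpy.bincount(assignments, weights=weights, minlength=n_bins)  # -> [0.4 0.4 0.2]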
class WPostanalysisPush(WESTTool):
    prog ='w_postanalysis_push'
    description = '''\
Apply weights calculated using the postanalysis reweighting scheme (see 
"w_postanalysis_reweight --help" or "w_multi_reweight --help") to a WESTPA data
file (usually "west.h5") by scaling the probability in each bin.  Bin
assignments (usually "assign.h5") and a corresponding postanalysis reweighting
output file (usually "kinrw.h5" or "multi_rw.h5") must be supplied, in addition
to the iteration from which to pull weights.

WARNING: Output from this script may not be compatible with some tools. Due to
the nature of the postanalysis reweighting process, some walkers may be assigned
zero weight. Additionally, the total weight may not sum to one for some
iterations, as a bin predicted to have nonzero weight by the postanalysis
reweighting scheme may depopulate during the course of a simulation. (A toy
numeric sketch of the per-bin rescaling appears after this class.)

--------------------------------------------------------------------------------
Output format
--------------------------------------------------------------------------------

The output file (-o/--output, usually "west_rw.h5") is of the same format as
the original WESTPA data file. New weights are found in place of the original 
weights, located in:

/iterations/iter_{N_ITER:08d}/seg_index/ 

--------------------------------------------------------------------------------
Command-line options
--------------------------------------------------------------------------------
'''

    def __init__(self):
        super(WPostanalysisPush, self).__init__()
        
        self.data_reader = WESTDataReader()
        self.progress = ProgressIndicatorComponent()
        
        self.output_filename = None
        self.rw_filename = None
        self.assignment_filename = None
        
        self.output_file = None
        self.rw_file = None
        self.assignments_file = None

        self.weights_attributes_initialized = False
        self.weights_already_calculated = False
        self.time_average_scaling_vector_calculated = False

        
    def add_args(self, parser):
        self.progress.add_args(parser)
        self.data_reader.add_args(parser)

        iogroup = parser.add_argument_group('input/output options')
        iogroup.add_argument('-W', '--west', dest='westH5_path', 
                             default='west.h5', metavar='WESTH5',
                             help='''Apply weights to data from WESTH5, creating
                             a new file that either links to or duplicates data
                             in WESTH5 (WESTH5 will not be altered).''')

        iogroup.add_argument('-rw', dest='rw_H5_path', metavar='RW_FILE',
                             default='kinrw.h5',
                             help='''Pull weights from RW_FILE. This should be
                             the output file from either w_postanalysis_reweight 
                             or w_multi_reweight.''')

        iogroup.add_argument('-a', '--assignments', dest='assignH5_path',
                             default='assign.h5', metavar='ASSIGNH5',
                             help='''Rescale weights based on bin assignments
                             in ASSIGNH5. This file should be consistent with 
                             RW_FILE and WESTH5.''')

        iogroup.add_argument('-o', '--output', dest='output', 
                             default='west_rw.h5',
                             help='''Store results in OUTPUT 
                             (default: %(default)s).''')

        iogroup.add_argument('-c', '--copy', dest='copy', action='store_true', 
                             help='''If specified, copy all data from WESTH5
                             to OUTPUT.  Otherwise (default), link to data in
                             WESTH5.''')

        cogroup = parser.add_argument_group('calculation options')
        cogroup.add_argument('-n', '--n-iter', dest='n_iter', default=None,
                             type=int,
                             help='''Pull weights from N_ITER and push to data
                             from all iterations in WESTH5.  By default, use
                             the final iteration available in RW_FILE.
                             Alternatively, the weights from each iteration may
                             be pushed to the data from the corresponding
                             iteration in WESTH5 (see "-e/--evolution-mode"). 
                             ''')

        cogroup.add_argument('-e', '--evolution-mode', action='store_true', 
                             help='''If specified, push weights from each
                             iteration available in RW_FILE to the data from the
                             corresponding iteration in WESTH5. For iterations
                             not available in RW_FILE, copy weights directly 
                             from WESTH5 without rescaling.  By default, pull 
                             weights from a single iteration, specified using 
                             "-n/--n-iter".''')
        
        cogroup.add_argument('-nc', '--no-color', action='store_true', 
                             dest='no_color',
                             help='''If specified, do not use colored bins for
                             rescaling weights. By default, use colored bins.
                             ''')

        cogroup.add_argument('--time-average', action='store_true',
                             dest='time_average',
                             help='''If specified, scale the weights of walkers
                             so that the total weight of all walkers in a given
                             bin, summed over all iterations, matches the weight
                             given by the postanalysis reweighting output.
                             Weights in a given iteration will likely no longer
                             sum to one.  This option is not compatible with
                             evolution mode (see -e/--evolution-mode).''')

        cogroup.add_argument('--iter-range', dest='iter_range', type=str,
                             default=None,
                             help='''This option may be used only if 
                             --time-average is specified.  ITER_RANGE should be
                             a tuple of the form (first_iter, last_iter).  If
                             this option is specified, use only information on 
                             walkers between first_iter and last_iter 
                             (inclusive) when calculating scaling coefficients.
                             However, all iterations in the WESTPA data file
                             will be rescaled according to these weights.''')
 
                              
                                                         
    def process_args(self, args):
        '''Process the arguments defined in ``add_arguments``, making them 
        available as attributes of the main tool class.'''
        self.progress.process_args(args)
        self.data_reader.process_args(args)
       
        # I/O arguments
        self.westH5_path = args.westH5_path
        self.rwH5_path = args.rw_H5_path
        self.assignH5_path = args.assignH5_path
        self.output_path = args.output

        # Calculation arguments
        self.copy = args.copy
        self.n_iter = args.n_iter
        self.evolution_mode = args.evolution_mode

        if args.no_color:
            self.i_use_color = False
        else:
            self.i_use_color = True

        if args.time_average:
            self.i_time_average = True
        else:
            self.i_time_average = False

        if self.i_time_average and self.evolution_mode:
            raise ArgumentError("Error. Time averaging and evolution modes are "
                                "not compatible! See options "
                                "-e/--evolution-mode and --time-average for "
                                "more information.")  
        self.first_iter = None
        self.last_iter = None
        if args.iter_range is not None:
            if not self.i_time_average:
                raise ArgumentError("Error. Specifying an iteration range is "
                                    "only compatible with time-averaging! See "
                                    "--time-average for more information.")
            iter_tuple = eval(args.iter_range)
            try:
                self.first_iter = int(iter_tuple[0]) 
                self.last_iter = int(iter_tuple[1]) 
            except (ValueError, TypeError):
                raise ArgumentError("An error occurred while parsing the "
                                    "supplied iteration range. The iteration "
                                    "range should evaluate to a Python tuple "
                                    "of integers.  Input: ({:s}). Please see "
                                    "the documentation for the --iter-range "
                                    "option for more information."
                                    .format(args.iter_range)                  )
                                    
            print("Using data between iterations {:d} and {:d} for calculation "
                  "of rescaling vector."
                  .format(self.first_iter, self.last_iter)
                  )


    def open_files(self):
        '''Open the WESTPA data file, the reweighting output file, the
        assignments file, and the output file.''' 
        self.westH5 = h5py.File(self.westH5_path, 'r') 
        self.rwH5 = h5py.File(self.rwH5_path, 'r')
        self.assignments = h5py.File(self.assignH5_path, 'r')
        self.output = h5py.File(self.output_path, 'w')
 

    def check_consistency_of_input_files(self):
        '''Check that the assignment file and west.h5 file have the same number
        of walkers for each iteration, and check that the reweighting output 
        file and the assignment file use the same number of bins.'''
        ## First check that the assignment and west.h5 file have the same ##
        ## number of walkers.                                             ##
        # Assume that the assignments file includes data for ALL iterations.
        # At the time of this code's writing, it must, but this may change
        # later.
        last_iter = self.assignments['assignments'].shape[0] # Zero-indexed 
        # Get the total number of bins.  Indices corresponding to walkers not 
        # present in a given iteration will be assigned this integer.
        n_bins = self.assignments['bin_labels'].shape[0] 
        self.pi.new_operation('Checking input files for consistency',
                              last_iter)

        for iiter in xrange(1, last_iter+1): #iiter is one-indexed
            try:
                iter_group = self.westH5['iterations/iter_{:08d}'.format(iiter)]
            except KeyError:
                raise ConsistencyError("Iteration {:d} exists in {:s} but not "
                                       " in {:s}!".format(iiter,
                                                          self.assignH5_path,
                                                          self.westH5_path)    ) 
            # Number of walkers in this iteration, for the westh5 file.
            westh5_n_walkers = iter_group['seg_index'].shape[0]
            # Number of walkers in this iteration, for the assignh5 file. The
            # size of the assignments array is the same for all iterations, and
            # equals the maximum observed number of walkers.  For iterations
            # with fewer than the maximum number of walkers, the rest of the
            # indices are filled with the integer ``n_bins``. 
            # Switch to zero-indexing, and only look at the first time point.
            assign_n_walkers = np.count_nonzero(
              np.array(self.assignments['assignments'][iiter-1][:,0]) != n_bins 
                                                )
            if not westh5_n_walkers == assign_n_walkers:
                raise ConsistencyError("The number of walkers in the WESTPA "
                                       "data file ({:s}, {:d}) and the number "
                                       "of walkers in the assignments file "
                                       "({:s}, {:d}) for iteration {:d} do not "
                                       "match!".format(self.westH5_path,
                                                       westh5_n_walkers,
                                                       self.assignH5_path,
                                                       assign_n_walkers,
                                                       iiter)                  ) 
            self.pi.progress += 1
        ## Now check that the reweighting output file and the assignment file ##
        ## use the same number of bins.                                       ##
        rw_n_bins = self.rwH5['bin_prob_evolution'].shape[1]
        # rw_n_bins is colored, but n_bins (from the assignments file) is not.
        assign_nstates = self.assignments['state_labels'].shape[0]
        rw_nstates = self.rwH5['state_labels'].shape[0]
        if not assign_nstates == rw_nstates:
            raise ConsistencyError("The number of states used in the "
                                   "assignments file ({:d}) does not match the "
                                   "number of states used in the reweighting "
                                   "file ({:d})!".format(assign_nstates, 
                                                         rw_nstates)           )
        if not assign_nstates*n_bins == rw_n_bins:
            raise ConsistencyError("The number of bins used in the assignments "
                                   "file ({:d}) does not match the number of "
                                   "bins used in the reweighting file ({:d})!."
                                   .format(nbins, rw_n_bins/rw_nstates)        )   
        self.pi.clear()

    def initialize_output(self):
        '''Copy or link datasets besides the seg_index datasets from the input 
        WESTPA data file to the output (reweighted) data file. '''
        self.pi.new_operation('Initializing output file',
                              len(self.westH5['iterations'].keys()))
        for key in self.westH5.keys():
            if key != 'iterations':
                if self.copy:
                    self.westH5.copy(key, self.output)
                else:
                    self.output[key] = h5py.ExternalLink(self.westH5_path, key)
        for name, val in self.westH5.attrs.items():
            self.output.attrs.create(name, val)
         
        self.output.create_group('iterations')
        for key1 in self.westH5['iterations']:
            self.output.create_group('iterations/{:s}'.format(key1))
            for key2 in self.westH5['iterations/{:s}'.format(key1)]:
                if key2 != 'seg_index':
                    key = 'iterations/'+key1+'/'+key2 
                    if self.copy:
                        self.westH5.copy(key, self.output['iterations/'+key1])  
                    else:
                        self.output[key] = h5py.ExternalLink(self.westH5_path,
                                                             key)
            for name, val in self.westH5['iterations/{:s}'.format(key1)].attrs.items():
                self.output['iterations/{:s}'.format(key1)].attrs.create(name, val) 
            self.pi.progress += 1
        self.pi.clear()


    def get_new_weights(self, n_iter):
        '''Generate and return a length-nbins numpy array representing a vector
        of weights, where weights[i] represents the total weight that should be
        in bin i, based on results from the postanalysis reweighting scheme.'''
        # Build map between indexing of 'conditional_flux_evolution' or
        # 'bin_prob_evolution' (the indexing is the same for both) and 
        # weighted ensemble iteration indices
        if not self.weights_attributes_initialized:
            cfe = self.rwH5['conditional_flux_evolution']
            self.idx_map = np.empty(cfe.shape[0], 
                                    dtype=self.assignments['assignments'].dtype)
            for i in xrange(cfe.shape[0]):
                # Axes are (timepoint index, beginning state, ending state)
                # and final index gets the "iter_stop" data
                # stop_iter is exclusive, so subtract one
                self.idx_map[i] = cfe[i,0,0][1]-1 
            self.weights_attributes_initialized = True

        if (not self.evolution_mode) and (not self.weights_already_calculated):
            idx = np.where(self.idx_map == self.n_iter)
            self.new_weights = np.array(self.rwH5['bin_prob_evolution'])[idx]\
                               .squeeze() 
            self.weights_already_calculated = True
            if self.i_use_color:
                return self.new_weights
            else:
                # Convert colored to non-colored vector. self.nstates should
                # have been set by self.go()
                colored_new_weights = np.copy(self.new_weights)
                self.new_weights = np.zeros(
                        int(self.new_weights.shape[0]/self.nstates)
                                            )
                for i in xrange(self.new_weights.shape[0]):
                    self.new_weights[i] = np.sum(
                        colored_new_weights[self.nstates*i:self.nstates*(i+1)]
                                                 )
                return self.new_weights

        elif (not self.evolution_mode) and self.weights_already_calculated:
            return self.new_weights

        else: # if self.evolution mode:
            idx = np.where(self.idx_map == n_iter) 
            new_weights = np.array(self.rwH5['bin_prob_evolution'][idx])\
                          .squeeze()
            if self.i_use_color:
                return new_weights
            else:
                # Convert colored to non-colored vector. self.nstates should
                # have been set by self.go()
                colored_new_weights = new_weights
                new_weights = np.zeros(new_weights.shape[0]//self.nstates)
                for i in xrange(new_weights.shape[0]):
                    new_weights[i] = np.sum(
                      colored_new_weights[self.nstates*i:self.nstates*(i+1)]
                                            )
                return new_weights

    def get_input_assignments(self, iiter):
        '''Get the bin assignments for the first timepoint of iteration
        ``iiter``, for all walkers in the input assignments file. Return a
        one-dimensional numpy array ``assignments``, where ``assignments[i]``
        gives the bin index assigned to walker i.  Bin indices take into
        account whether or not the colored scheme is used.'''
        # Get the total weight in each bin for the input assignments file
        # Look only at the first time point! -----------------------------V
        assignments = np.array(self.assignments['assignments'][iiter-1][:,0])
        if self.i_use_color:
            traj_labels = np.array(self.assignments['trajlabels'][iiter-1][:,0])
            assignments = assignments*self.nstates + traj_labels
        return assignments
               

    def calculate_scaling_coefficients(self, iiter):
        '''Calculate and return a vector of scaling coefficients for 
        the weighted ensemble iteration ``iiter``.  scaling_coefficients[i] 
        gives the value by which to scale (multiply) the weight of any walker 
        in bin i.  

        This method first calls self.get_new_weights(iiter) to find what the
        weights output by the postanalysis reweighting scheme are.  Next, it 
        considers whether time averaging is enabled, and whether or not the
        colored scheme is used.  Finally, it calculates the input assignments
        if necessary.'''
        if not self.i_time_average:
            # Get length-n_bins vector, where new_weights[i] is the total
            # weight in bin i according to the postanalysis reweighting 
            # scheme.
            new_weights = self.get_new_weights(iiter)
            input_assignments = self.get_input_assignments(iiter)
            input_iter_group = self.westH5['iterations/iter_{:08d}'
                                           .format(iiter)]
            seg_weights = np.array(input_iter_group['seg_index']['weight'])
            
            # Calculate the weight in each bin.  The assignments file should
            # already have similar information in 'labeled_populations', but
            # those weights include ALL time points in each iteration.  In this
            # tool, we must scale the weight of each segment uniformly across
            # any given weighted ensemble iteration, as the weight is only
            # specified once per iteration.  Somewhat arbitrarily, we scale the
            # weights according to the first timepoint only (we could also
            # choose another timepoint, or average them).  For this reason, we
            # need to re-calculate the labeled populations, looking only at the
            # first time point.
            # New weights will already be the correct length (ie, adjusted 
            # for color/no color).
            input_weights = np.zeros(new_weights.shape)
            # Calculate the weight in each bin, in the input WESTPA data
            # file.  This is necessary because w_assign does not calculate
            # populations with color labels.
            for i in xrange(input_weights.shape[0]):
                input_weights[i] = np.sum(
                        seg_weights[np.where(input_assignments == i)] 
                                          )
            # Suppress division errors
            with np.errstate(all='ignore'):
                scaling_coefficients = new_weights/input_weights
                # Set nonsensical value to zero; is this really necessary?
                #scaling_coefficients[~np.isfinite(scaling_coefficients)] = 0
        else: # if self.i_time_average
            if self.time_average_scaling_vector_calculated:
                return self.time_average_scaling_vector
            else: # if not self.time_average_scaling_vector_calculated:
                # Calculate the scaling vector  
                # old_weights will contain the total weight observed in each 
                # bin during the FIRST timepoint of all iterations; this must
                # be calculated here rather than using labeled_populations from
                # the assignments file, as labeled_populations includes all
                # timepoints
                new_weights = self.get_new_weights(iiter)
                old_weights = np.zeros(new_weights.shape)
                if self.first_iter is None:
                    iter_strs = sorted(self.westH5['iterations'].keys())
                    self.first_iter = int(iter_strs[0][5:])
                    self.last_iter = int(iter_strs[-1][5:])
                # Iterate over all WE iterations and sum up the weight in each
                # bin
                for iiter in xrange(self.first_iter, self.last_iter+1):
                    weights = np.array(
                            self.westH5['iterations/iter_{:08d}/seg_index'
                                        .format(iiter)]['weight']
                                       )
                    assignments = self.get_input_assignments(iiter) 
                    for bin_idx in xrange(old_weights.shape[0]):
                        where = np.where(assignments == bin_idx)
                        old_weights[bin_idx] += np.sum(weights[where])
                with np.errstate(all='ignore'):
                    self.time_average_scaling_vector = new_weights/old_weights
                    self.time_average_scaling_vector[
                            ~np.isfinite(self.time_average_scaling_vector)
                                                     ] = 0.0
                self.time_average_scaling_vector_calculated = True
                return self.time_average_scaling_vector
                    

    def go(self):
        '''
        Main function. Calls:
          - self.open_files()
          - self.check_consistency_of_input_files()
          - self.initialize_output()
        and then iterates through all weighted ensemble iterations, rescaling 
        weights of segments and saving a new ``seg_index`` dataset in the output
        file using the rescaled weights.
        '''
        pi = self.progress.indicator
        with pi as self.pi:
            
            # Open files
            self.open_files()
            
            # Check files for consistency
            self.check_consistency_of_input_files() 

            # Initialize the output file.
            self.initialize_output()

            last_iter = self.assignments['assignments'].shape[0] 
            self.nstates = len(self.assignments['state_labels'])

            # If weights are to be pulled from a single iteration, get the weights 
            pi.new_operation('Creating new WESTPA data file with scaled '
                             'weights.', last_iter)
            for iiter in xrange(1, last_iter+1): # iiter is one-indexed 
                scaling_coefficients = self.calculate_scaling_coefficients(iiter)
                input_iter_group = self.westH5['iterations/iter_{:08d}'
                                               .format(iiter)]
                input_assignments = self.get_input_assignments(iiter) 
                # Get the HDF5 group for this iteration. It was already created
                # while initializing the output file.
                output_iter_group = self.output['iterations/iter_{:08d}'
                                                .format(iiter)]

                input_seg_index = np.array(input_iter_group['seg_index']) 
                seg_weights = input_seg_index['weight']
                # Build the new seg_index piece by piece.  Start with an empty
                # list and add data for one segment at a time. Then convert to 
                # a numpy array.
                output_seg_index = []
                for iseg in xrange(input_seg_index.shape[0]):
                    # Only look at the first time point for assignments!
                    bin_idx = input_assignments[iseg]
                    coeff = scaling_coefficients[bin_idx] 
                    output_seg_index.append((seg_weights[iseg]*coeff,)
                                             + tuple(input_seg_index[iseg])[1:])
                output_seg_index = np.array(output_seg_index, 
                                            dtype=input_seg_index.dtype)
                # Save the newly created seg_index (with new weights)
                output_iter_group.create_dataset('seg_index',
                                                 data=output_seg_index,
                                                 dtype=output_seg_index.dtype)
                self.pi.progress += 1
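Finally, a toy numeric sketch of the two pieces of arithmetic at the heart of
WPostanalysisPush: collapsing a colored bin-probability vector to plain bins
(as in get_new_weights) and turning per-bin target weights into per-segment
scaling factors (as in calculate_scaling_coefficients). All arrays below are
invented; only the formulas mirror the methods above.

import numpy as np

nstates = 2
# Hypothetical colored bin probabilities (2 bins x 2 states, flattened as in
# 'bin_prob_evolution'): entries for bin i occupy positions nstates*i ... nstates*(i+1)-1.
colored_new_weights = np.array([0.10, 0.30, 0.45, 0.15])
# Collapse colors by summing the states within each bin (equivalent to the
# explicit loop in get_new_weights).
new_weights = colored_new_weights.reshape(-1, nstates).sum(axis=1)        # -> [0.4, 0.6]

# Hypothetical current per-segment weights and first-timepoint bin assignments.
seg_weights = np.array([0.25, 0.25, 0.25, 0.25])
assignments = np.array([0, 0, 1, 1])

# Total input weight per bin, then the per-bin scaling coefficients.
input_weights = np.bincount(assignments, weights=seg_weights, minlength=len(new_weights))
with np.errstate(all='ignore'):
    scaling = new_weights / input_weights                                 # -> [0.8, 1.2]

# Each segment's new weight is its old weight times the coefficient of its bin.
new_seg_weights = seg_weights * scaling[assignments]                      # -> [0.2, 0.2, 0.3, 0.3]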