Example #1
class KineticsSubcommands(WESTSubcommand):
    '''Base class for common options for both kinetics schemes'''
    def __init__(self, parent):
        super(KineticsSubcommands, self).__init__(parent)
        self.progress = ProgressIndicatorComponent()
        self.data_reader = WESTDataReader()
        self.iter_range = IterRangeSelection()
        self.output_file = None
        self.assignments_file = None

        self.do_compression = True

    def add_args(self, parser):
        self.data_reader.add_args(parser)
        self.iter_range.add_args(parser)

        iogroup = parser.add_argument_group('input/output options')
        iogroup.add_argument(
            '-a',
            '--assignments',
            default='assign.h5',
            help='''Bin assignments and macrostate definitions are in ASSIGNMENTS
                                (default: %(default)s).''')
        # default_kinetics_file will be picked up as a class attribute from the appropriate
        # subclass
        iogroup.add_argument(
            '-o',
            '--output',
            dest='output',
            default=self.default_kinetics_file,
            help='''Store results in OUTPUT (default: %(default)s).''')
        iogroup.add_argument(
            '--no-compression',
            dest='compression',
            action='store_false',
            help=
            '''Do not store kinetics results compressed. This can increase disk
                             use about 100-fold, but can dramatically speed up subsequent analysis
                             for "w_kinavg matrix". Default: compress kinetics results.'''
        )
        self.progress.add_args(parser)
        parser.set_defaults(compression=True)

    def process_args(self, args):
        self.progress.process_args(args)
        self.assignments_file = h5io.WESTPAH5File(args.assignments, 'r')
        self.data_reader.process_args(args)
        with self.data_reader:
            self.iter_range.process_args(args)
        self.output_file = h5io.WESTPAH5File(args.output,
                                             'w',
                                             creating_program=True)
        h5io.stamp_creator_data(self.output_file)
        if not self.iter_range.check_data_iter_range_least(
                self.assignments_file):
            raise ValueError(
                'assignments do not span the requested iterations')
        self.do_compression = args.compression
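
The comment above notes that default_kinetics_file is picked up as a class attribute from the appropriate subclass. A minimal sketch of such a subclass (the class name and file name below are hypothetical, not taken from these examples):

# Hypothetical subclass: supplies the class attribute that add_args() above
# reads as self.default_kinetics_file.
class TraceKinetics(KineticsSubcommands):
    default_kinetics_file = 'kintrace.h5'
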
Example #2
    def __init__(self):
        super(WNetworker, self).__init__()

        self.data_reader = WESTDataReader()
        self.iter_range = IterRangeSelection()
        self.progress = ProgressIndicatorComponent()
        self.output_filename = None
        self.tm_filename = None
        self.postprocess_function = None
Example #3
    def __init__(self, parent):
        super(KineticsSubcommands, self).__init__(parent)
        self.progress = ProgressIndicatorComponent()
        self.data_reader = WESTDataReader()
        self.iter_range = IterRangeSelection()
        self.output_file = None
        self.assignments_file = None

        self.do_compression = True
Example #4
    def __init__(self):
        super(WCrawl,self).__init__()

        # These are used throughout
        self.progress = ProgressIndicatorComponent()
        self.data_reader = WESTDataReader()
        self.iter_range = IterRangeSelection(self.data_reader)

        self.crawler = None
        self.task_callable = None
Example #5
class WESTKineticsBase(WESTSubcommand):
    '''
    Common argument processing for w_direct/w_reweight subcommands.
    Mostly limited to handling input and output from w_assign.
    '''
    
    def __init__(self, parent):
        super(WESTKineticsBase,self).__init__(parent)
        
        self.data_reader = WESTDataReader()
        self.iter_range = IterRangeSelection()
        self.progress = ProgressIndicatorComponent()
        
        self.output_filename = None
        # This is actually applicable to both.
        self.assignment_filename = None
        
        self.output_file = None
        self.assignments_file = None
        
        self.evolution_mode = None
        
        self.mcbs_alpha = None
        self.mcbs_acalpha = None
        self.mcbs_nsets = None

        # Now we're adding in things that come from the old w_kinetics
        self.do_compression = True
        
            
    def add_args(self, parser):
        self.progress.add_args(parser)
        self.data_reader.add_args(parser)
        self.iter_range.include_args['iter_step'] = True
        self.iter_range.add_args(parser)

        iogroup = parser.add_argument_group('input/output options')
        iogroup.add_argument('-a', '--assignments', default='assign.h5',
                            help='''Bin assignments and macrostate definitions are in ASSIGNMENTS
                            (default: %(default)s).''')
        
        iogroup.add_argument('-o', '--output', dest='output', default=self.default_output_file,
                            help='''Store results in OUTPUT (default: %(default)s).''')

    def process_args(self, args):
        self.progress.process_args(args)
        self.data_reader.process_args(args)
        with self.data_reader:
            self.iter_range.process_args(args, default_iter_step=None)
        if self.iter_range.iter_step is None:
            #use about 10 blocks by default
            self.iter_range.iter_step = max(1, (self.iter_range.iter_stop - self.iter_range.iter_start) // 10)
        
        self.output_filename = args.output
        self.assignments_filename = args.assignments
Example #6
class WESTKineticsBase(WESTSubcommand):
    '''
    Common argument processing for w_direct/w_reweight subcommands.
    Mostly limited to handling input and output from w_assign.
    '''
    
    def __init__(self, parent):
        super(WESTKineticsBase,self).__init__(parent)
        
        self.data_reader = WESTDataReader()
        self.iter_range = IterRangeSelection()
        self.progress = ProgressIndicatorComponent()
        
        self.output_filename = None
        # This is actually applicable to both.
        self.assignment_filename = None
        
        self.output_file = None
        self.assignments_file = None
        
        self.evolution_mode = None
        
        self.mcbs_alpha = None
        self.mcbs_acalpha = None
        self.mcbs_nsets = None

        # Now we're adding in things that come from the old w_kinetics
        self.do_compression = True
        
            
    def add_args(self, parser):
        self.progress.add_args(parser)
        self.data_reader.add_args(parser)
        self.iter_range.include_args['iter_step'] = True
        self.iter_range.add_args(parser)

        iogroup = parser.add_argument_group('input/output options')
        iogroup.add_argument('-a', '--assignments', default='assign.h5',
                            help='''Bin assignments and macrostate definitions are in ASSIGNMENTS
                            (default: %(default)s).''')
        
        iogroup.add_argument('-o', '--output', dest='output', default=self.default_output_file,
                            help='''Store results in OUTPUT (default: %(default)s).''')

    def process_args(self, args):
        self.progress.process_args(args)
        self.data_reader.process_args(args)
        with self.data_reader:
            self.iter_range.process_args(args, default_iter_step=None)
        if self.iter_range.iter_step is None:
            #use about 10 blocks by default
            self.iter_range.iter_step = max(1, (self.iter_range.iter_stop - self.iter_range.iter_start) // 10)
        
        self.output_filename = args.output
        self.assignments_filename = args.assignments
Example #7
    def __init__(self):
        super(WSelectTool, self).__init__()

        self.data_reader = WESTDataReader()
        self.iter_range = IterRangeSelection()
        self.progress = ProgressIndicatorComponent()
        self.output_file = None
        self.output_filename = None
        self.predicate = None
        self.invert = False
        self.include_ancestors = False
Example #8
    def __init__(self):
        super(WNTopTool, self).__init__()

        self.data_reader = WESTDataReader()
        self.iter_range = IterRangeSelection()
        self.progress = ProgressIndicatorComponent()
        self.output_file = None
        self.assignments_filename = None
        self.output_filename = None
        self.what = None
        self.timepoint = None
        self.count = None
Example #9
    def __init__(self):
        super(WFluxanlTool, self).__init__()
        self.data_reader = WESTDataReader()
        self.iter_range = IterRangeSelection()
        self.output_h5file = None
        self.output_group = None
        self.target_groups = {}

        self.fluxdata = {}

        self.alpha = None
        self.autocorrel_alpha = None
        self.n_sets = None
        self.do_evol = False
        self.evol_step = 1
Example #10
    def __init__(self):
        super(WPDist,self).__init__()

        # Parallel processing by default (this is not actually necessary, but it is
        # informative!)
        self.wm_env.default_work_manager = self.wm_env.default_parallel_work_manager

        # These are used throughout
        self.progress = ProgressIndicatorComponent()
        self.data_reader = WESTDataReader()
        self.input_dssynth = WESTDSSynthesizer(default_dsname='pcoord')
        self.iter_range = IterRangeSelection(self.data_reader)
        self.iter_range.include_args['iter_step'] = False
        self.binspec = None
        self.output_filename = None
        self.output_file = None

        self.dsspec = None
        self.wt_dsspec = None  # dsspec for weights

        # These are used during histogram generation only
        self.iter_start = None
        self.iter_stop = None
        self.ndim = None
        self.ntimepoints = None
        self.dset_dtype = None
        self.binbounds = None  # bin boundaries for each dimension
        self.midpoints = None  # bin midpoints for each dimension
        self.data_range = None  # data range for each dimension, as the pairs (min,max)
        self.ignore_out_of_range = False
        self.compress_output = False
Example #11
    def __init__(self):
        super(WPostAnalysisReweightTool, self).__init__()

        self.data_reader = WESTDataReader()
        self.iter_range = IterRangeSelection()
        self.progress = ProgressIndicatorComponent()

        self.output_filename = None
        self.kinetics_filename = None
        self.assignment_filename = None

        self.output_file = None
        self.assignments_file = None
        self.kinetics_file = None

        self.evolution_mode = None
Example #12
    def __init__(self):
        super(StateProbTool, self).__init__()

        self.data_reader = WESTDataReader()
        self.iter_range = IterRangeSelection()
        self.progress = ProgressIndicatorComponent()

        self.output_filename = None
        self.kinetics_filename = None

        self.output_file = None
        self.assignments_file = None

        self.evolution_mode = None

        self.mcbs_alpha = None
        self.mcbs_acalpha = None
        self.mcbs_nsets = None
Example #13
    def __init__(self, parent):
        super(KineticsSubcommands,self).__init__(parent)
        self.progress = ProgressIndicatorComponent()
        self.data_reader = WESTDataReader()
        self.iter_range = IterRangeSelection()
        self.output_file = None
        self.assignments_file = None

        self.do_compression = True
Example #14
    def __init__(self):
        super(WCrawl,self).__init__()

        # These are used throughout
        self.progress = ProgressIndicatorComponent()
        self.data_reader = WESTDataReader()
        self.iter_range = IterRangeSelection(self.data_reader)

        self.crawler = None
        self.task_callable = None
Example #15
class KineticsSubcommands(WESTSubcommand):
    '''Base class for common options for both kinetics schemes'''
    
    def __init__(self, parent):
        super(KineticsSubcommands,self).__init__(parent)
        self.progress = ProgressIndicatorComponent()
        self.data_reader = WESTDataReader()
        self.iter_range = IterRangeSelection() 
        self.output_file = None
        self.assignments_file = None
        
        self.do_compression = True

    def add_args(self, parser):
        self.data_reader.add_args(parser)
        self.iter_range.add_args(parser)
        
        iogroup = parser.add_argument_group('input/output options')
        iogroup.add_argument('-a', '--assignments', default='assign.h5',
                             help='''Bin assignments and macrostate definitions are in ASSIGNMENTS
                                (default: %(default)s).''')
        # default_kinetics_file will be picked up as a class attribute from the appropriate
        # subclass
        iogroup.add_argument('-o', '--output', dest='output', default=self.default_kinetics_file,
                             help='''Store results in OUTPUT (default: %(default)s).''')
        iogroup.add_argument('--no-compression', dest='compression', action='store_false',
                             help='''Do not store kinetics results compressed. This can increase disk
                             use about 100-fold, but can dramatically speed up subsequent analysis
                             for "w_kinavg matrix". Default: compress kinetics results.''')
        self.progress.add_args(parser)
        parser.set_defaults(compression=True)
        
    def process_args(self, args):
        self.progress.process_args(args)
        self.assignments_file = h5io.WESTPAH5File(args.assignments, 'r')
        self.data_reader.process_args(args)
        with self.data_reader:
            self.iter_range.process_args(args)
        self.output_file = h5io.WESTPAH5File(args.output, 'w', creating_program=True)
        h5io.stamp_creator_data(self.output_file)
        if not self.iter_range.check_data_iter_range_least(self.assignments_file):
            raise ValueError('assignments do not span the requested iterations')
        self.do_compression = args.compression
Example #16
    def __init__(self):
        super(WSelectTool,self).__init__()

        self.data_reader = WESTDataReader()
        self.iter_range = IterRangeSelection()
        self.progress = ProgressIndicatorComponent()
        self.output_file = None
        self.output_filename = None
        self.predicate = None
        self.invert = False
        self.include_ancestors = False
Example #17
    def __init__(self):
        super(WNTopTool,self).__init__()

        self.data_reader = WESTDataReader()
        self.iter_range = IterRangeSelection()
        self.progress = ProgressIndicatorComponent()
        self.output_file = None
        self.assignments_filename = None
        self.output_filename = None
        self.what = None
        self.timepoint = None
        self.count = None
Example #18
    def __init__(self, parent):
        super(WESTKineticsBase,self).__init__(parent)
        
        self.data_reader = WESTDataReader()
        self.iter_range = IterRangeSelection()
        self.progress = ProgressIndicatorComponent()
        
        self.output_filename = None
        # This is actually applicable to both.
        self.assignment_filename = None
        
        self.output_file = None
        self.assignments_file = None
        
        self.evolution_mode = None
        
        self.mcbs_alpha = None
        self.mcbs_acalpha = None
        self.mcbs_nsets = None

        # Now we're adding in things that come from the old w_kinetics
        self.do_compression = True
Example #19
    def __init__(self):
        super(WFluxanlTool,self).__init__()
        self.data_reader = WESTDataReader()
        self.iter_range = IterRangeSelection()
        self.output_h5file = None
        self.output_group = None
        self.target_groups = {}

        self.fluxdata = {}
        
        self.alpha = None
        self.autocorrel_alpha = None
        self.n_sets = None
        self.do_evol = False
        self.evol_step = 1
Example #20
    def __init__(self):
        super(WPostAnalysisReweightTool, self).__init__()

        self.data_reader = WESTDataReader()
        self.iter_range = IterRangeSelection()
        self.progress = ProgressIndicatorComponent()

        self.output_filename = None
        self.kinetics_filename = None
        self.assignment_filename = None

        self.output_file = None
        self.assignments_file = None
        self.kinetics_file = None

        self.evolution_mode = None
Example #21
    def __init__(self):
        super(StateProbTool,self).__init__()

        self.data_reader = WESTDataReader()
        self.iter_range = IterRangeSelection()
        self.progress = ProgressIndicatorComponent()

        self.output_filename = None
        self.kinetics_filename = None

        self.output_file = None
        self.assignments_file = None

        self.evolution_mode = None

        self.mcbs_alpha = None
        self.mcbs_acalpha = None
        self.mcbs_nsets = None
Example #22
    def __init__(self, parent):
        super(WESTKineticsBase,self).__init__(parent)
        
        self.data_reader = WESTDataReader()
        self.iter_range = IterRangeSelection()
        self.progress = ProgressIndicatorComponent()
        
        self.output_filename = None
        # This is actually applicable to both.
        self.assignment_filename = None
        
        self.output_file = None
        self.assignments_file = None
        
        self.evolution_mode = None
        
        self.mcbs_alpha = None
        self.mcbs_acalpha = None
        self.mcbs_nsets = None

        # Now we're adding in things that come from the old w_kinetics
        self.do_compression = True
Example #23
class WPostAnalysisReweightTool(WESTTool):
    prog = 'w_postanalysis_reweight'
    description = '''\
Calculate average rates from weighted ensemble data using the postanalysis
reweighting scheme. Bin assignments (usually "assignments.h5") and pre-calculated 
iteration flux matrices (usually "flux_matrices.h5") data files must have been 
previously generated using w_postanalysis_matrix.py (see "w_assign --help" and 
"w_kinetics --help" for information on generating these files).


-----------------------------------------------------------------------------
Output format
-----------------------------------------------------------------------------

The output file (-o/--output, usually "kinrw.h5") contains the following
dataset:

  /state_prob_evolution [window,state]
    The reweighted state populations based on windows

  /color_prob_evolution [window,state]
    The reweighted populations last assigned to each state based on windows

  /bin_prob_evolution [window, bin]
    The reweighted populations of each bin based on windows. Bins contain
    one color each, so to recover the original un-colored spatial bins,
    one must sum over all states.

  /conditional_flux_evolution [window,state,state]
    (Structured -- see below). State-to-state fluxes based on windows of
    varying width
    
The structure of the final dataset is as follows:

  iter_start
    (Integer) Iteration at which the averaging window begins (inclusive).
    
  iter_stop
    (Integer) Iteration at which the averaging window ends (exclusive).
    
  expected
    (Floating-point) Expected (mean) value of the rate as evaluated within
    this window, in units of inverse tau.


-----------------------------------------------------------------------------
Command-line options
-----------------------------------------------------------------------------
'''

    def __init__(self):
        super(WPostAnalysisReweightTool, self).__init__()

        self.data_reader = WESTDataReader()
        self.iter_range = IterRangeSelection()
        self.progress = ProgressIndicatorComponent()

        self.output_filename = None
        self.kinetics_filename = None
        self.assignment_filename = None

        self.output_file = None
        self.assignments_file = None
        self.kinetics_file = None

        self.evolution_mode = None

    def add_args(self, parser):
        self.progress.add_args(parser)
        self.data_reader.add_args(parser)
        self.iter_range.include_args['iter_step'] = True
        self.iter_range.add_args(parser)

        iogroup = parser.add_argument_group('input/output options')
        iogroup.add_argument(
            '-a',
            '--assignments',
            default='assign.h5',
            help='''Bin assignments and macrostate definitions are in ASSIGNMENTS
                            (default: %(default)s).''')

        iogroup.add_argument(
            '-k',
            '--kinetics',
            default='flux_matrices.h5',
            help=
            '''Per-iteration flux matrices calculated by w_postanalysis_matrix 
                            (default: %(default)s).''')
        iogroup.add_argument(
            '-o',
            '--output',
            dest='output',
            default='kinrw.h5',
            help='''Store results in OUTPUT (default: %(default)s).''')

        cogroup = parser.add_argument_group('calculation options')
        cogroup.add_argument(
            '-e',
            '--evolution-mode',
            choices=['cumulative', 'blocked'],
            default='cumulative',
            help='''How to calculate time evolution of rate estimates.
                             ``cumulative`` evaluates rates over windows starting with --start-iter and getting progressively
                             wider to --stop-iter by steps of --step-iter.
                             ``blocked`` evaluates rates over windows of width --step-iter, the first of which begins at
                             --start-iter.''')
        cogroup.add_argument(
            '--window-frac',
            type=float,
            default=1.0,
            help=
            '''Fraction of iterations to use in each window when running in ``cumulative`` mode.
                             The (1 - frac) fraction of iterations will be discarded from the start of each window.'''
        )

        cogroup.add_argument(
            '--obs-threshold',
            type=int,
            default=1,
            help=
            '''The minimum number of observed transitions between two states i and j necessary to include
                             fluxes in the reweighting estimate''')

    def open_files(self):
        self.output_file = h5io.WESTPAH5File(self.output_filename,
                                             'w',
                                             creating_program=True)
        h5io.stamp_creator_data(self.output_file)
        self.assignments_file = h5io.WESTPAH5File(
            self.assignments_filename,
            'r')  #, driver='core', backing_store=False)
        self.kinetics_file = h5io.WESTPAH5File(
            self.kinetics_filename,
            'r')  #, driver='core', backing_store=False)
        if not self.iter_range.check_data_iter_range_least(
                self.assignments_file):
            raise ValueError(
                'assignments data do not span the requested iterations')

        if not self.iter_range.check_data_iter_range_least(self.kinetics_file):
            raise ValueError(
                'kinetics data do not span the requested iterations')

    def process_args(self, args):
        self.progress.process_args(args)
        self.data_reader.process_args(args)
        with self.data_reader:
            self.iter_range.process_args(args, default_iter_step=None)
        if self.iter_range.iter_step is None:
            #use about 10 blocks by default
            self.iter_range.iter_step = max(
                1,
                (self.iter_range.iter_stop - self.iter_range.iter_start) // 10)

        self.output_filename = args.output
        self.assignments_filename = args.assignments
        self.kinetics_filename = args.kinetics

        self.evolution_mode = args.evolution_mode
        self.evol_window_frac = args.window_frac
        if self.evol_window_frac <= 0 or self.evol_window_frac > 1:
            raise ValueError(
                'Parameter error -- fractional window defined by --window-frac must be in (0,1]'
            )
        self.obs_threshold = args.obs_threshold

    def go(self):
        pi = self.progress.indicator
        with pi:
            pi.new_operation('Initializing')
            self.open_files()
            nstates = self.assignments_file.attrs['nstates']
            nbins = self.assignments_file.attrs['nbins']
            state_labels = self.assignments_file['state_labels'][...]
            state_map = self.assignments_file['state_map'][...]
            nfbins = self.kinetics_file.attrs['nrows']
            npts = self.kinetics_file.attrs['npts']

            assert nstates == len(state_labels)
            assert nfbins == nbins * nstates

            start_iter, stop_iter, step_iter = self.iter_range.iter_start, self.iter_range.iter_stop, self.iter_range.iter_step

            start_pts = range(start_iter, stop_iter, step_iter)
            flux_evol = np.zeros((len(start_pts), nstates, nstates),
                                 dtype=ci_dtype)
            color_prob_evol = np.zeros((len(start_pts), nstates))
            state_prob_evol = np.zeros((len(start_pts), nstates))
            bin_prob_evol = np.zeros((len(start_pts), nfbins))
            pi.new_operation('Calculating flux evolution', len(start_pts))

            if self.evolution_mode == 'cumulative' and self.evol_window_frac == 1.0:
                print('Using fast streaming accumulation')

                total_fluxes = np.zeros((nfbins, nfbins), weight_dtype)
                total_obs = np.zeros((nfbins, nfbins), np.int64)

                for iblock, start in enumerate(start_pts):
                    pi.progress += 1
                    stop = min(start + step_iter, stop_iter)

                    params = dict(start=start,
                                  stop=stop,
                                  nstates=nstates,
                                  nbins=nbins,
                                  state_labels=state_labels,
                                  state_map=state_map,
                                  nfbins=nfbins,
                                  total_fluxes=total_fluxes,
                                  total_obs=total_obs,
                                  h5file=self.kinetics_file,
                                  obs_threshold=self.obs_threshold)

                    rw_state_flux, rw_color_probs, rw_state_probs, rw_bin_probs, rw_bin_flux = reweight(
                        **params)
                    for k in xrange(nstates):
                        for j in xrange(nstates):
                            # Normalize such that we report the flux per tau (tau being the weighted ensemble iteration)
                            # npts always includes a 0th time point
                            flux_evol[iblock]['expected'][
                                k, j] = rw_state_flux[k, j] * (npts - 1)
                            flux_evol[iblock]['iter_start'][k, j] = start
                            flux_evol[iblock]['iter_stop'][k, j] = stop

                    color_prob_evol[iblock] = rw_color_probs
                    state_prob_evol[iblock] = rw_state_probs[:-1]
                    bin_prob_evol[iblock] = rw_bin_probs

            else:
                for iblock, start in enumerate(start_pts):
                    pi.progress += 1

                    stop = min(start + step_iter, stop_iter)
                    if self.evolution_mode == 'cumulative':
                        windowsize = max(
                            1,
                            int(self.evol_window_frac * (stop - start_iter)))
                        block_start = max(start_iter, stop - windowsize)
                    else:  # self.evolution_mode == 'blocked'
                        block_start = start

                    params = dict(start=block_start,
                                  stop=stop,
                                  nstates=nstates,
                                  nbins=nbins,
                                  state_labels=state_labels,
                                  state_map=state_map,
                                  nfbins=nfbins,
                                  total_fluxes=None,
                                  total_obs=None,
                                  h5file=self.kinetics_file)

                    rw_state_flux, rw_color_probs, rw_state_probs, rw_bin_probs, rw_bin_flux = reweight(
                        **params)
                    for k in xrange(nstates):
                        for j in xrange(nstates):
                            # Normalize such that we report the flux per tau (tau being the weighted ensemble iteration)
                            # npts always includes a 0th time point
                            flux_evol[iblock]['expected'][
                                k, j] = rw_state_flux[k, j] * (npts - 1)
                            flux_evol[iblock]['iter_start'][k, j] = start
                            flux_evol[iblock]['iter_stop'][k, j] = stop

                    color_prob_evol[iblock] = rw_color_probs
                    state_prob_evol[iblock] = rw_state_probs[:-1]
                    bin_prob_evol[iblock] = rw_bin_probs

            ds_flux_evol = self.output_file.create_dataset(
                'conditional_flux_evolution',
                data=flux_evol,
                shuffle=True,
                compression=9)
            ds_state_prob_evol = self.output_file.create_dataset(
                'state_prob_evolution', data=state_prob_evol, compression=9)
            ds_color_prob_evol = self.output_file.create_dataset(
                'color_prob_evolution', data=color_prob_evol, compression=9)
            ds_bin_prob_evol = self.output_file.create_dataset(
                'bin_prob_evolution', data=bin_prob_evol, compression=9)
            ds_state_labels = self.output_file.create_dataset(
                'state_labels', data=state_labels)
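
The output layout described in the docstring above can be inspected directly with h5py. A minimal sketch, assuming the default output name kinrw.h5 and the dataset names listed in the docstring:

# Minimal sketch: read w_postanalysis_reweight output with h5py (assumes the
# file kinrw.h5 exists and contains the datasets named in the docstring above).
import h5py

with h5py.File('kinrw.h5', 'r') as f:
    labels = [s.decode() if isinstance(s, bytes) else s for s in f['state_labels'][...]]
    flux_evol = f['conditional_flux_evolution'][...]  # structured, [window, state, state]
    last = flux_evol[-1]                              # widest window in cumulative mode
    for i, src in enumerate(labels):
        for j, dst in enumerate(labels):
            if i != j:
                print('{} -> {}: {:.6e} per tau (iters {}-{})'.format(
                    src, dst, last['expected'][i, j],
                    last['iter_start'][i, j], last['iter_stop'][i, j]))
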
Example #24
class WFluxanlTool(WESTTool):
    prog = 'w_fluxanl'
    description = '''\
Extract fluxes into pre-defined target states from WEST data,
average, and construct confidence intervals. Monte Carlo bootstrapping
is used to account for the correlated and possibly non-Gaussian statistical
error in flux measurements.

All non-graphical output (including that to the terminal and HDF5) assumes that
the propagation/resampling period ``tau`` is equal to unity; to obtain results
in familiar units, divide all fluxes and multiply all correlation lengths by
the true value of ``tau``.
'''

    output_format_version = 2

    def __init__(self):
        super(WFluxanlTool, self).__init__()
        self.data_reader = WESTDataReader()
        self.iter_range = IterRangeSelection()
        self.output_h5file = None
        self.output_group = None
        self.target_groups = {}

        self.fluxdata = {}

        self.alpha = None
        self.autocorrel_alpha = None
        self.n_sets = None
        self.do_evol = False
        self.evol_step = 1

    def add_args(self, parser):
        self.data_reader.add_args(parser)
        self.iter_range.add_args(parser)
        ogroup = parser.add_argument_group('output options')
        ogroup.add_argument(
            '-o',
            '--output',
            default='fluxanl.h5',
            help=
            'Store intermediate data and analysis results to OUTPUT (default: %(default)s).'
        )
        cgroup = parser.add_argument_group('calculation options')
        cgroup.add_argument(
            '--disable-bootstrap',
            '-db',
            dest='bootstrap',
            action='store_const',
            const=False,
            help='''Disable the use of Monte Carlo block bootstrapping.''')
        cgroup.add_argument('--disable-correl',
                            '-dc',
                            dest='correl',
                            action='store_const',
                            const=False,
                            help='''Disable the correlation analysis.''')
        cgroup.add_argument(
            '-a',
            '--alpha',
            type=float,
            default=0.05,
            help=
            '''Calculate a (1-ALPHA) confidence interval on the average flux
                             (default: %(default)s)''')
        cgroup.add_argument(
            '--autocorrel-alpha',
            type=float,
            dest='acalpha',
            metavar='ACALPHA',
            help='''Evaluate autocorrelation of flux to (1-ACALPHA) significance.
                             Note that too small an ACALPHA will result in failure to detect autocorrelation
                             in a noisy flux signal. (Default: same as ALPHA.)'''
        )
        cgroup.add_argument(
            '-N',
            '--nsets',
            type=int,
            help=
            '''Use NSETS samples for bootstrapping (default: chosen based on ALPHA)'''
        )
        cgroup.add_argument(
            '--evol',
            action='store_true',
            dest='do_evol',
            help=
            '''Calculate time evolution of flux confidence intervals (expensive).'''
        )
        cgroup.add_argument(
            '--evol-step',
            type=int,
            default=1,
            metavar='ESTEP',
            help=
            '''Calculate time evolution of flux confidence intervals every ESTEP
                            iterations (default: %(default)s)''')

    def process_args(self, args):
        self.data_reader.process_args(args)
        self.data_reader.open()
        self.iter_range.data_manager = self.data_reader
        self.iter_range.process_args(args)

        self.output_h5file = h5py.File(args.output, 'w')

        self.alpha = args.alpha
        # Disable the bootstrap or the correlation analysis.
        self.mcbs_enable = args.bootstrap if args.bootstrap is not None else True
        self.do_correl = args.correl if args.correl is not None else True
        self.autocorrel_alpha = args.acalpha or self.alpha
        self.n_sets = args.nsets or mclib.get_bssize(self.alpha)

        self.do_evol = args.do_evol
        self.evol_step = args.evol_step or 1

    def calc_store_flux_data(self):
        westpa.rc.pstatus(
            'Calculating mean flux and confidence intervals for iterations [{},{})'
            .format(self.iter_range.iter_start, self.iter_range.iter_stop))

        fluxdata = extract_fluxes(self.iter_range.iter_start,
                                  self.iter_range.iter_stop, self.data_reader)

        # Create a group to store data in
        output_group = h5io.create_hdf5_group(self.output_h5file,
                                              'target_flux',
                                              replace=False,
                                              creating_program=self.prog)
        self.output_group = output_group
        output_group.attrs['version_code'] = self.output_format_version
        self.iter_range.record_data_iter_range(output_group)

        n_targets = len(fluxdata)
        index = numpy.empty((len(fluxdata), ), dtype=target_index_dtype)
        avg_fluxdata = numpy.empty((n_targets, ), dtype=ci_dtype)

        for itarget, (target_label,
                      target_fluxdata) in enumerate(fluxdata.items()):
            # Create group and index entry
            index[itarget]['target_label'] = str(target_label)
            target_group = output_group.create_group(
                'target_{}'.format(itarget))

            self.target_groups[target_label] = target_group

            # Store per-iteration values
            target_group['n_iter'] = target_fluxdata['n_iter']
            target_group['count'] = target_fluxdata['count']
            target_group['flux'] = target_fluxdata['flux']
            h5io.label_axes(target_group['flux'], ['n_iter'], units=['tau^-1'])

            # Calculate flux autocorrelation
            fluxes = target_fluxdata['flux']
            mean_flux = fluxes.mean()
            fmm = fluxes - mean_flux
            acorr = fftconvolve(fmm, fmm[::-1])
            acorr = acorr[len(acorr) // 2:]
            acorr /= acorr[0]
            acorr_ds = target_group.create_dataset('flux_autocorrel',
                                                   data=acorr)
            h5io.label_axes(acorr_ds, ['lag'], ['tau'])

            # Calculate overall averages and CIs
            #avg, lb_ci, ub_ci, correl_len = mclib.mcbs_ci_correl(fluxes, numpy.mean, self.alpha, self.n_sets,
            #                                                     autocorrel_alpha=self.autocorrel_alpha, subsample=numpy.mean)
            avg, lb_ci, ub_ci, sterr, correl_len = mclib.mcbs_ci_correl(
                {'dataset': fluxes},
                estimator=(lambda stride, dataset: numpy.mean(dataset)),
                alpha=self.alpha,
                n_sets=self.n_sets,
                autocorrel_alpha=self.autocorrel_alpha,
                subsample=numpy.mean,
                do_correl=self.do_correl,
                mcbs_enable=self.mcbs_enable)
            avg_fluxdata[itarget] = (self.iter_range.iter_start,
                                     self.iter_range.iter_stop, avg, lb_ci,
                                     ub_ci, sterr, correl_len)
            westpa.rc.pstatus('target {!r}:'.format(target_label))
            westpa.rc.pstatus(
                '  correlation length = {} tau'.format(correl_len))
            westpa.rc.pstatus(
                '  mean flux and CI   = {:e} ({:e},{:e}) tau^(-1)'.format(
                    avg, lb_ci, ub_ci))
            index[itarget]['mean_flux'] = avg
            index[itarget]['mean_flux_ci_lb'] = lb_ci
            index[itarget]['mean_flux_ci_ub'] = ub_ci
            index[itarget]['mean_flux_correl_len'] = correl_len

        # Write index and summary
        index_ds = output_group.create_dataset('index', data=index)
        index_ds.attrs['mcbs_alpha'] = self.alpha
        index_ds.attrs['mcbs_autocorrel_alpha'] = self.autocorrel_alpha
        index_ds.attrs['mcbs_n_sets'] = self.n_sets

        self.fluxdata = fluxdata
        self.output_h5file['avg_flux'] = avg_fluxdata

    def calc_evol_flux(self):
        westpa.rc.pstatus(
            'Calculating cumulative evolution of flux confidence intervals every {} iteration(s)'
            .format(self.evol_step))

        for itarget, (target_label,
                      target_fluxdata) in enumerate(self.fluxdata.items()):
            fluxes = target_fluxdata['flux']
            target_group = self.target_groups[target_label]
            iter_start = target_group['n_iter'][0]
            iter_stop = target_group['n_iter'][-1]
            iter_count = iter_stop - iter_start
            n_blocks = iter_count // self.evol_step
            if iter_count % self.evol_step > 0: n_blocks += 1

            cis = numpy.empty((n_blocks, ), dtype=ci_dtype)

            for iblock in range(n_blocks):
                block_iter_stop = min(
                    iter_start + (iblock + 1) * self.evol_step, iter_stop)
                istop = min((iblock + 1) * self.evol_step,
                            len(target_fluxdata['flux']))
                fluxes = target_fluxdata['flux'][:istop]

                #avg, ci_lb, ci_ub, correl_len = mclib.mcbs_ci_correl(fluxes, numpy.mean, self.alpha, self.n_sets,
                #                                                     autocorrel_alpha = self.autocorrel_alpha,
                #                                                     subsample=numpy.mean)
                avg, ci_lb, ci_ub, sterr, correl_len = mclib.mcbs_ci_correl(
                    {'dataset': fluxes},
                    estimator=(lambda stride, dataset: numpy.mean(dataset)),
                    alpha=self.alpha,
                    n_sets=self.n_sets,
                    autocorrel_alpha=self.autocorrel_alpha,
                    subsample=numpy.mean,
                    do_correl=self.do_correl,
                    mcbs_enable=self.mcbs_enable)
                cis[iblock]['iter_start'] = iter_start
                cis[iblock]['iter_stop'] = block_iter_stop
                cis[iblock]['expected'], cis[iblock]['ci_lbound'], cis[iblock][
                    'ci_ubound'] = avg, ci_lb, ci_ub
                cis[iblock]['corr_len'] = correl_len
                cis[iblock]['sterr'] = sterr

                del fluxes

            cis_ds = target_group.create_dataset('flux_evolution', data=cis)
            cis_ds.attrs['iter_step'] = self.evol_step
            cis_ds.attrs['mcbs_alpha'] = self.alpha
            cis_ds.attrs['mcbs_autocorrel_alpha'] = self.autocorrel_alpha
            cis_ds.attrs['mcbs_n_sets'] = self.n_sets

    def go(self):
        self.calc_store_flux_data()
        if self.do_evol:
            self.calc_evol_flux()
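
The docstring above notes that all output assumes tau = 1. A minimal sketch of rescaling the stored averages to physical units (fluxanl.h5 is the tool's default output name; the value of tau below is a placeholder assumption):

# Minimal sketch: rescale w_fluxanl averages by the true tau (placeholder value).
import h5py

tau_ps = 100.0  # assumed propagation/resampling period, in picoseconds

with h5py.File('fluxanl.h5', 'r') as f:
    index = f['target_flux/index'][...]
    for entry in index:
        flux_per_ps = entry['mean_flux'] / tau_ps                # tau^-1 -> ps^-1
        correl_len_ps = entry['mean_flux_correl_len'] * tau_ps   # tau -> ps
        print(entry['target_label'], flux_per_ps, correl_len_ps)
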
Example #25
class StateProbTool(WESTParallelTool):
    prog='w_stateprobs'
    description = '''\
Calculate average populations and associated errors in state populations from
weighted ensemble data. Bin assignments, including macrostate definitions,
are required. (See "w_assign --help" for more information).

-----------------------------------------------------------------------------
Output format
-----------------------------------------------------------------------------

The output file (-o/--output, usually "stateprobs.h5") contains the following
dataset:

  /avg_state_pops [state]
    (Structured -- see below) Population of each state across entire
    range specified.

If --evolution-mode is specified, then the following additional dataset is
available:

  /state_pop_evolution [window][state]
    (Structured -- see below). State populations based on windows of
    iterations of varying width.  If --evolution-mode=cumulative, then
    these windows all begin at the iteration specified with
    --start-iter and grow in length by --step-iter for each successive 
    element. If --evolution-mode=blocked, then these windows are all of
    width --step-iter (excluding the last, which may be shorter), the first
    of which begins at iteration --start-iter.
    
The structure of these datasets is as follows:

  iter_start
    (Integer) Iteration at which the averaging window begins (inclusive).
    
  iter_stop
    (Integer) Iteration at which the averaging window ends (exclusive).
    
  expected
    (Floating-point) Expected (mean) value of the state population as
    evaluated within this window.

  ci_lbound
    (Floating-point) Lower bound of the confidence interval on the state
    population within this window.

  ci_ubound
    (Floating-point) Upper bound of the confidence interval on the state
    population within this window.

  corr_len
    (Integer) Correlation length of the state population within this window,
    in units of tau.

Each of these datasets is also stamped with a number of attributes:

  mcbs_alpha
    (Floating-point) Alpha value of confidence intervals. (For example, 
    *alpha=0.05* corresponds to a 95% confidence interval.)

  mcbs_nsets
    (Integer) Number of bootstrap data sets used in generating confidence
    intervals.
    
  mcbs_acalpha
    (Floating-point) Alpha value for determining correlation lengths.
   

-----------------------------------------------------------------------------
Command-line options
-----------------------------------------------------------------------------
'''    
    
    def __init__(self):
        super(StateProbTool,self).__init__()
        
        self.data_reader = WESTDataReader()
        self.iter_range = IterRangeSelection()
        self.progress = ProgressIndicatorComponent()
        
        self.output_filename = None
        self.kinetics_filename = None
        
        self.output_file = None
        self.assignments_file = None
        
        self.evolution_mode = None
        
        self.mcbs_alpha = None
        self.mcbs_acalpha = None
        self.mcbs_nsets = None
        
    def stamp_mcbs_info(self, dataset):
        dataset.attrs['mcbs_alpha'] = self.mcbs_alpha
        dataset.attrs['mcbs_acalpha'] = self.mcbs_acalpha
        dataset.attrs['mcbs_nsets'] = self.mcbs_nsets
        
            
    def add_args(self, parser):
        self.progress.add_args(parser)
        self.data_reader.add_args(parser)
        self.iter_range.include_args['iter_step'] = True
        self.iter_range.add_args(parser)

        iogroup = parser.add_argument_group('input/output options')
        iogroup.add_argument('-a', '--assignments', default='assign.h5',
                            help='''Bin assignments and macrostate definitions are in ASSIGNMENTS
                            (default: %(default)s).''')
        iogroup.add_argument('-o', '--output', dest='output', default='stateprobs.h5',
                            help='''Store results in OUTPUT (default: %(default)s).''')

        
        cgroup = parser.add_argument_group('confidence interval calculation options')
        cgroup.add_argument('--alpha', type=float, default=0.05, 
                             help='''Calculate a (1-ALPHA) confidence interval
                             (default: %(default)s)''')
        cgroup.add_argument('--autocorrel-alpha', type=float, dest='acalpha', metavar='ACALPHA',
                             help='''Evaluate autocorrelation to (1-ACALPHA) significance.
                             Note that too small an ACALPHA will result in failure to detect autocorrelation
                             in a noisy flux signal. (Default: same as ALPHA.)''')
        cgroup.add_argument('--nsets', type=int,
                             help='''Use NSETS samples for bootstrapping (default: chosen based on ALPHA)''')
        
        cogroup = parser.add_argument_group('calculation options')
        cogroup.add_argument('-e', '--evolution-mode', choices=['cumulative', 'blocked', 'none'], default='none',
                             help='''How to calculate time evolution of rate estimates.
                             ``cumulative`` evaluates rates over windows starting with --start-iter and getting progressively
                             wider to --stop-iter by steps of --step-iter.
                             ``blocked`` evaluates rates over windows of width --step-iter, the first of which begins at
                             --start-iter.
                             ``none`` (the default) disables calculation of the time evolution of rate estimates.''')
        
    def open_files(self):
        self.output_file = h5io.WESTPAH5File(self.output_filename, 'w', creating_program=True)
        h5io.stamp_creator_data(self.output_file)
        self.assignments_file = h5io.WESTPAH5File(self.assignments_filename, 'r')#, driver='core', backing_store=False)
        if not self.iter_range.check_data_iter_range_least(self.assignments_file):
            raise ValueError('assignments data do not span the requested iterations')

    
    def process_args(self, args):
        self.progress.process_args(args)
        self.data_reader.process_args(args)
        with self.data_reader:
            self.iter_range.process_args(args, default_iter_step=None)
        if self.iter_range.iter_step is None:
            #use about 10 blocks by default
            self.iter_range.iter_step = max(1, (self.iter_range.iter_stop - self.iter_range.iter_start) // 10)
        
        self.output_filename = args.output
        self.assignments_filename = args.assignments

        self.mcbs_alpha = args.alpha
        self.mcbs_acalpha = args.acalpha if args.acalpha else self.mcbs_alpha
        self.mcbs_nsets = args.nsets if args.nsets else mclib.get_bssize(self.mcbs_alpha)
        
        self.evolution_mode = args.evolution_mode
        
    def calc_state_pops(self):
        start_iter, stop_iter = self.iter_range.iter_start, self.iter_range.iter_stop
        nstates = self.nstates
        state_map = self.state_map
        iter_count = stop_iter-start_iter
        
        pi = self.progress.indicator
        pi.new_operation('Calculating state populations')
        pops = h5io.IterBlockedDataset(self.assignments_file['labeled_populations'])
        
        iter_state_pops = numpy.empty((nstates+1,), weight_dtype)
        all_state_pops = numpy.empty((iter_count,nstates+1), weight_dtype)
        avg_state_pops = numpy.zeros((nstates+1,), weight_dtype)
        pops.cache_data(max_size='available')
        try:
            for iiter,n_iter in enumerate(xrange(start_iter,stop_iter)):
                iter_state_pops.fill(0)
                labeled_pops = pops.iter_entry(n_iter)
                accumulate_state_populations_from_labeled(labeled_pops, state_map, iter_state_pops, check_state_map=False)
                all_state_pops[iiter] = iter_state_pops
                avg_state_pops += iter_state_pops
                del labeled_pops
                pi.progress += 1
        finally:
            pops.drop_cache()
        self.output_file.create_dataset('state_pops', data=all_state_pops, compression=9, shuffle=True)
        h5io.stamp_iter_range(self.output_file['state_pops'], start_iter, stop_iter)
        
        self.all_state_pops = all_state_pops
        avg_state_pops = numpy.zeros((nstates+1,), ci_dtype)
        pi.new_operation('Calculating overall average populations and CIs', nstates)
#        futures = []
#         for istate in xrange(nstates):
#             futures.append(self.work_manager.submit(_eval_block,kwargs=dict(iblock=None,istate=istate,
#                                                                             start=start_iter,stop=stop_iter,
#                                                                             state_pops=all_state_pops[:,istate],
#                                                                             mcbs_alpha=self.mcbs_alpha, mcbs_nsets=self.mcbs_nsets,
#                                                                             mcbs_acalpha = self.mcbs_acalpha)))
#         for future in self.work_manager.as_completed(futures):
        def taskgen():
            for istate in xrange(nstates):
                yield (_eval_block, (), dict(iblock=None,istate=istate,
                                             start=start_iter,stop=stop_iter,
                                             state_pops=all_state_pops[:,istate],
                                             mcbs_alpha=self.mcbs_alpha, mcbs_nsets=self.mcbs_nsets,
                                             mcbs_acalpha = self.mcbs_acalpha))
        for future in self.work_manager.submit_as_completed(taskgen(), self.max_queue_len):
            (_iblock,istate,ci_res) = future.get_result(discard=True)
            avg_state_pops[istate] = ci_res
            pi.progress += 1
        self.output_file['avg_state_pops'] = avg_state_pops
        self.stamp_mcbs_info(self.output_file['avg_state_pops'])
        pi.clear()
        
        maxlabellen = max(map(len,self.state_labels))
        print('average state populations:')
        for istate in xrange(nstates):
            print('{:{maxlabellen}s}: mean={:21.15e} CI=({:21.15e}, {:21.15e})'
                  .format(self.state_labels[istate],
                          avg_state_pops['expected'][istate],
                          avg_state_pops['ci_lbound'][istate],
                          avg_state_pops['ci_ubound'][istate],
                          maxlabellen=maxlabellen))
        
    def calc_evolution(self):
        nstates = self.nstates
        start_iter, stop_iter, step_iter = self.iter_range.iter_start, self.iter_range.iter_stop, self.iter_range.iter_step
        start_pts = range(start_iter, stop_iter, step_iter)

        pop_evol = numpy.zeros((len(start_pts), nstates), dtype=ci_dtype)

        pi = self.progress.indicator
        pi.new_operation('Calculating population evolution', len(start_pts)*nstates)
#         futures = []
#         for iblock, start in enumerate(start_pts):
#             if self.evolution_mode == 'cumulative':
#                 block_start = start_iter
#             else: # self.evolution_mode == 'blocked'
#                 block_start = start
#             stop = min(start+step_iter, stop_iter)
# 
#             for istate in xrange(nstates):
#                 future = self.work_manager.submit(_eval_block,kwargs=dict(iblock=iblock,istate=istate,
#                                                                           start=block_start,stop=stop,
#                                                                           state_pops=self.all_state_pops[block_start-start_iter:stop-start_iter,istate],
#                                                                           mcbs_alpha=self.mcbs_alpha, mcbs_nsets=self.mcbs_nsets,
#                                                                           mcbs_acalpha = self.mcbs_acalpha))
#                 futures.append(future)
        def taskgen():
            for iblock, start in enumerate(start_pts):
                if self.evolution_mode == 'cumulative':
                    block_start = start_iter
                else: # self.evolution_mode == 'blocked'
                    block_start = start
                stop = min(start+step_iter, stop_iter)
     
                for istate in xrange(nstates):
                    yield (_eval_block,(),dict(iblock=iblock,istate=istate,
                                               start=block_start,stop=stop,
                                               state_pops=self.all_state_pops[block_start-start_iter:stop-start_iter,istate],
                                               mcbs_alpha=self.mcbs_alpha, mcbs_nsets=self.mcbs_nsets,
                                               mcbs_acalpha = self.mcbs_acalpha))
        #for future in self.work_manager.as_completed(futures):
        for future in self.work_manager.submit_as_completed(taskgen(), self.max_queue_len):
            (iblock,istate,ci_res) = future.get_result(discard=True)
            pop_evol[iblock,istate] =  ci_res
            pi.progress += 1

        self.output_file.create_dataset('state_pop_evolution', data=pop_evol, shuffle=True, compression=9)
        pi.clear()

    def go(self):
        pi = self.progress.indicator
        with pi:
            pi.new_operation('Initializing')
            self.open_files()
            nstates = self.nstates = self.assignments_file.attrs['nstates']

            state_labels = self.state_labels = self.assignments_file['state_labels'][...]
            state_map = self.state_map = self.assignments_file['state_map'][...]
            if (state_map > nstates).any():
                raise ValueError('invalid state mapping')

            # copy metadata to output
            self.output_file.attrs['nstates'] = nstates
            self.output_file['state_labels'] = state_labels

            # calculate overall averages
            self.calc_state_pops()

            # calculate evolution, if requested
            if self.evolution_mode != 'none' and self.iter_range.iter_step:
                self.calc_evolution()
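
The datasets documented above can be read back with h5py. A minimal sketch, assuming the default output name stateprobs.h5:

# Minimal sketch: read w_stateprobs output (assumes stateprobs.h5 exists; the
# state_pop_evolution dataset is only present if --evolution-mode was not 'none').
import h5py

with h5py.File('stateprobs.h5', 'r') as f:
    labels = f['state_labels'][...]
    avg = f['avg_state_pops'][...]  # structured: expected, ci_lbound, ci_ubound, ...
    for istate, label in enumerate(labels):
        print(label, avg['expected'][istate],
              avg['ci_lbound'][istate], avg['ci_ubound'][istate])
    if 'state_pop_evolution' in f:
        evol = f['state_pop_evolution'][...]  # [window, state] structured array
        print('evolution windows:', evol.shape[0])
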
Example #26
class WCrawl(WESTParallelTool):
    prog='w_crawl'
    description = '''\
Crawl a weighted ensemble dataset, executing a function for each iteration.
This can be used for postprocessing of trajectories, cleanup of datasets,
or anything else that can be expressed as "do X for iteration N, then do
something with the result". Tasks are parallelized by iteration, and 
no guarantees are made about evaluation order.


-----------------------------------------------------------------------------
Command-line options
-----------------------------------------------------------------------------
    
'''

    def __init__(self):
        super(WCrawl,self).__init__()

        # These are used throughout
        self.progress = ProgressIndicatorComponent()
        self.data_reader = WESTDataReader()
        self.iter_range = IterRangeSelection(self.data_reader)

        self.crawler = None
        self.task_callable = None

    def add_args(self, parser):
        self.data_reader.add_args(parser)
        self.iter_range.add_args(parser)

        tgroup = parser.add_argument_group('task options')
        tgroup.add_argument('-c', '--crawler-instance',
                            help='''Use CRAWLER_INSTANCE (specified as module.instance) as an instance of
                            WESTPACrawler to coordinate the calculation. Required only if initialization,
                            finalization, or task result processing is required.''')
        tgroup.add_argument('task_callable',
                            help='''Run TASK_CALLABLE (specified as module.function) on each iteration.
                            Required.''')
        self.progress.add_args(parser)

    def process_args(self, args):
        self.progress.process_args(args)
        self.data_reader.process_args(args)
        with self.data_reader:
            self.iter_range.process_args(args)

        self.task_callable = get_object(args.task_callable, path=['.'])
        if args.crawler_instance is not None:
            self.crawler = get_object(args.crawler_instance, path=['.'])
        else:
            self.crawler = WESTPACrawler()

    def go(self):
        iter_start = self.iter_range.iter_start
        iter_stop = self.iter_range.iter_stop
        iter_count = iter_stop - iter_start
        self.data_reader.open('r')
        pi = self.progress.indicator
        with pi:
            pi.operation = 'Initializing'
            self.crawler.initialize(iter_start, iter_stop)

            try:
                pi.new_operation('Dispatching tasks & processing results', iter_count)
                task_gen = ((_remote_task, (n_iter, self.task_callable), {}) for n_iter in range(iter_start,iter_stop))
                for future in self.work_manager.submit_as_completed(task_gen, self.max_queue_len):
                    n_iter, result = future.get_result(discard=True)
                    if self.crawler is not None:
                        self.crawler.process_iter_result(n_iter,result)
                    pi.progress += 1
            finally:
                pi.new_operation('Finalizing')
                self.crawler.finalize()
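
# --- Hedged sketch (not part of the tool above) --------------------------------
# A hypothetical analysis module ("crawl_tasks.py") that w_crawl could load, e.g.
#     w_crawl crawl_tasks.average_pcoord -c crawl_tasks.crawler
# The per-iteration callable is assumed here to be invoked as fn(n_iter, iter_group),
# by analogy with the --construct-dataset convention documented elsewhere in this
# listing; check the w_crawl documentation for the exact contract. A real crawler
# would subclass WESTPACrawler; this stand-in only shows the three hooks used above.
import numpy

def average_pcoord(n_iter, iter_group):
    # Return something small and picklable: the mean progress coordinate value.
    return float(numpy.mean(iter_group['pcoord'][...]))

class _CollectingCrawler:
    def initialize(self, iter_start, iter_stop):
        self.results = {}

    def process_iter_result(self, n_iter, result):
        self.results[n_iter] = result

    def finalize(self):
        for n_iter in sorted(self.results):
            print(n_iter, self.results[n_iter])

crawler = _CollectingCrawler()
# --------------------------------------------------------------------------------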
Example #27
class WPDist(WESTParallelTool):
    prog = 'w_pdist'
    description = '''\
Calculate time-resolved, multi-dimensional probability distributions of WE 
datasets.


-----------------------------------------------------------------------------
Source data
-----------------------------------------------------------------------------

Source data is provided either by a user-specified function
(--construct-dataset) or a list of "data set specifications" (--dsspecs).
If neither is provided, the progress coordinate dataset ``pcoord`` is used.

To use a custom function to extract or calculate data whose probability
distribution will be calculated, specify the function in standard Python
MODULE.FUNCTION syntax as the argument to --construct-dataset. This function
will be called as function(n_iter,iter_group), where n_iter is the iteration
whose data are being considered and iter_group is the corresponding group
in the main WEST HDF5 file (west.h5). The function must return data which can
be indexed as [segment][timepoint][dimension].

To use a list of data set specifications, specify --dsspecs and then list the
desired datasets one-by-one (space-separated in most shells). These data set
specifications are formatted as NAME[,file=FILENAME,slice=SLICE], which will
use the dataset called NAME in the HDF5 file FILENAME (defaulting to the main
WEST HDF5 file west.h5), and slice it with the Python slice expression SLICE
(as in [0:2] to select the first two elements of the first axis of the
dataset). The ``slice`` option is most useful for selecting one column (or
more) from a multi-column dataset, such as arises when using a progress
coordinate of multiple dimensions.


-----------------------------------------------------------------------------
Histogram binning
-----------------------------------------------------------------------------

By default, histograms are constructed with 100 bins in each dimension. This
can be overridden by specifying -b/--bins, which accepts a number of different
kinds of arguments:

  a single integer N
    N uniformly spaced bins will be used in each dimension.
    
  a sequence of integers N1,N2,... (comma-separated)
    N1 uniformly spaced bins will be used for the first dimension, N2 for the
    second, and so on.
    
  a list of lists [[B11, B12, B13, ...], [B21, B22, B23, ...], ...]
    The bin boundaries B11, B12, B13, ... will be used for the first dimension,
    B21, B22, B23, ... for the second dimension, and so on. These bin
    boundaries need not be uniformly spaced. These expressions will be
    evaluated with Python's ``eval`` construct, with ``numpy`` available for
    use [e.g. to specify bins using numpy.arange()].

The first two forms (integer, list of integers) will trigger a scan of all
data in each dimension in order to determine the minimum and maximum values,
which may be very expensive for large datasets. This can be avoided by
explicitly providing bin boundaries using the list-of-lists form.

Note that these bins are *NOT* at all related to the bins used to drive WE
sampling.


-----------------------------------------------------------------------------
Output format
-----------------------------------------------------------------------------

The output file produced (specified by -o/--output, defaulting to "pdist.h5")
may be fed to plothist to generate plots (or appropriately processed text or
HDF5 files) from this data. In short, the following datasets are created:

  ``histograms``
    Normalized histograms. The first axis corresponds to iteration, and
    remaining axes correspond to dimensions of the input dataset.
    
  ``/binbounds_0``
    Vector of bin boundaries for the first (index 0) dimension. Additional
    datasets similarly named (/binbounds_1, /binbounds_2, ...) are created
    for additional dimensions.
    
  ``/midpoints_0``
    Vector of bin midpoints for the first (index 0) dimension. Additional
    datasets similarly named are created for additional dimensions.
    
  ``n_iter``
    Vector of iteration numbers corresponding to the stored histograms (i.e.
    the first axis of the ``histograms`` dataset).


-----------------------------------------------------------------------------
Subsequent processing
-----------------------------------------------------------------------------

The output generated by this program (-o/--output, default "pdist.h5") may be
plotted by the ``plothist`` program. See ``plothist --help`` for more
information.

    
-----------------------------------------------------------------------------
Parallelization
-----------------------------------------------------------------------------

This tool supports parallelized binning, including reading of input data.
Parallel processing is the default. For simple cases (reading pre-computed
input data, modest numbers of segments), serial processing (--serial) may be
more efficient.


-----------------------------------------------------------------------------
Command-line options
-----------------------------------------------------------------------------
    
'''

    def __init__(self):
        super(WPDist, self).__init__()

        # Parallel processing by default (this is not actually necessary, but it is
        # informative!)
        self.wm_env.default_work_manager = self.wm_env.default_parallel_work_manager

        # These are used throughout
        self.progress = ProgressIndicatorComponent()
        self.data_reader = WESTDataReader()
        self.input_dssynth = WESTDSSynthesizer(default_dsname='pcoord')
        self.iter_range = IterRangeSelection(self.data_reader)
        self.iter_range.include_args['iter_step'] = False
        self.binspec = None
        self.output_filename = None
        self.output_file = None

        self.dsspec = None
        self.wt_dsspec = None  # dsspec for weights

        # These are used during histogram generation only
        self.iter_start = None
        self.iter_stop = None
        self.ndim = None
        self.ntimepoints = None
        self.dset_dtype = None
        self.binbounds = None  # bin boundaries for each dimension
        self.midpoints = None  # bin midpoints for each dimension
        self.data_range = None  # data range for each dimension, as the pairs (min,max)
        self.ignore_out_of_range = False
        self.compress_output = False

    def add_args(self, parser):
        self.data_reader.add_args(parser)

        self.iter_range.add_args(parser)

        parser.add_argument(
            '-b',
            '--bins',
            dest='bins',
            metavar='BINEXPR',
            default='100',
            help=
            '''Use BINEXPR for bins. This may be an integer, which will be used for each
                            dimension of the progress coordinate; a list of integers (formatted as [n1,n2,...])
                            which will use n1 bins for the first dimension, n2 for the second dimension, and so on;
                            or a list of lists of boundaries (formatted as [[a1, a2, ...], [b1, b2, ...], ... ]), which
                            will use [a1, a2, ...] as bin boundaries for the first dimension, [b1, b2, ...] as bin boundaries
                            for the second dimension, and so on. (Default: 100 bins in each dimension.)'''
        )

        parser.add_argument(
            '-o',
            '--output',
            dest='output',
            default='pdist.h5',
            help='''Store results in OUTPUT (default: %(default)s).''')
        parser.add_argument(
            '-C',
            '--compress',
            action='store_true',
            help=
            '''Compress histograms. May make storage of higher-dimensional histograms
                            more tractable, at the (possible extreme) expense of increased analysis time.
                            (Default: no compression.)''')

        parser.add_argument(
            '--loose',
            dest='ignore_out_of_range',
            action='store_true',
            help=
            '''Ignore values that do not fall within bins. (Risky, as this can make buggy bin
                            boundaries appear as reasonable data. Only use if you are
                            sure of your bin boundary specification.)''')

        igroup = parser.add_argument_group(
            'input dataset options').add_mutually_exclusive_group(
                required=False)

        igroup.add_argument(
            '--construct-dataset',
            help=
            '''Use the given function (as in module.function) to extract source data.
                            This function will be called once per iteration as function(n_iter, iter_group)
                            to construct data for one iteration. Data returned must be indexable as
                            [seg_id][timepoint][dimension]''')

        igroup.add_argument(
            '--dsspecs',
            nargs='+',
            metavar='DSSPEC',
            help=
            '''Construct probability distribution from one or more DSSPECs.''')

        self.progress.add_args(parser)

    def process_args(self, args):
        self.progress.process_args(args)
        self.data_reader.process_args(args)
        self.input_dssynth.h5filename = self.data_reader.we_h5filename
        self.input_dssynth.process_args(args)
        self.dsspec = self.input_dssynth.dsspec

        # Carrying an open HDF5 file across a fork() seems to corrupt the entire HDF5 library
        # Open the WEST HDF5 file just long enough to process our iteration range, then close
        # and reopen in go() [which executes after the fork]
        with self.data_reader:
            self.iter_range.process_args(args)

        self.wt_dsspec = SingleIterDSSpec(self.data_reader.we_h5filename,
                                          'seg_index',
                                          slice=numpy.index_exp['weight'])

        self.binspec = args.bins
        self.output_filename = args.output
        self.ignore_out_of_range = bool(args.ignore_out_of_range)
        self.compress_output = args.compress or False

    def go(self):
        self.data_reader.open('r')
        pi = self.progress.indicator
        pi.operation = 'Initializing'
        with pi:
            self.output_file = h5py.File(self.output_filename, 'w')
            h5io.stamp_creator_data(self.output_file)

            self.iter_start = self.iter_range.iter_start
            self.iter_stop = self.iter_range.iter_stop

            # Construct bin boundaries
            self.construct_bins(self.parse_binspec(self.binspec))
            for idim, (binbounds, midpoints) in enumerate(
                    zip(self.binbounds, self.midpoints)):
                self.output_file['binbounds_{}'.format(idim)] = binbounds
                self.output_file['midpoints_{}'.format(idim)] = midpoints

            # construct histogram
            self.construct_histogram()

            # Record iteration range
            iter_range = self.iter_range.iter_range()
            self.output_file['n_iter'] = iter_range
            self.iter_range.record_data_iter_range(
                self.output_file['histograms'])

            self.output_file.close()

    @staticmethod
    def parse_binspec(binspec):
        namespace = {'numpy': numpy, 'inf': float('inf')}

        try:
            binspec_compiled = eval(binspec, namespace)
        except Exception as e:
            raise ValueError('invalid bin specification: {!r}'.format(e))
        else:
            if log.isEnabledFor(logging.DEBUG):
                log.debug('bin specs: {!r}'.format(binspec_compiled))
        return binspec_compiled

    def construct_bins(self, bins):
        '''
        Construct bins according to ``bins``, which may be:
        
          1) A scalar integer (for that number of bins in each dimension)
          2) A sequence of integers (specifying number of bins for each dimension)
          3) A sequence of sequences of bin boundaries (specifying boundaries for each dimension)
          
        Sets ``self.binbounds`` to a list of arrays of bin boundaries appropriate for passing to 
        fasthist.histnd, along with ``self.midpoints`` to the midpoints of the bins.
        '''

        if not isiterable(bins):
            self._construct_bins_from_scalar(bins)
        elif not isiterable(bins[0]):
            self._construct_bins_from_int_seq(bins)
        else:
            self._construct_bins_from_bound_seqs(bins)

        if log.isEnabledFor(logging.DEBUG):
            log.debug('binbounds: {!r}'.format(self.binbounds))

    def scan_data_shape(self):
        if self.ndim is None:
            dset = self.dsspec.get_iter_data(self.iter_start)
            self.ntimepoints = dset.shape[1]
            self.ndim = dset.shape[2]
            self.dset_dtype = dset.dtype

    def scan_data_range(self):
        '''Scan input data for range in each dimension. The number of dimensions is determined
        from the shape of the progress coordinate as of self.iter_start.'''

        self.progress.indicator.new_operation('Scanning for data range',
                                              self.iter_stop - self.iter_start)
        self.scan_data_shape()

        dset_dtype = self.dset_dtype
        ndim = self.ndim
        dsspec = self.dsspec

        try:
            minval = numpy.finfo(dset_dtype).min
            maxval = numpy.finfo(dset_dtype).max
        except ValueError:
            minval = numpy.iinfo(dset_dtype).min
            maxval = numpy.iinfo(dset_dtype).max

        data_range = self.data_range = [(maxval, minval)
                                        for _i in range(self.ndim)]

        #futures = []
        #for n_iter in xrange(self.iter_start, self.iter_stop):
        #_remote_min_max(ndim, dset_dtype, n_iter, dsspec)
        #    futures.append(self.work_manager.submit(_remote_min_max, args=(ndim, dset_dtype, n_iter, dsspec)))

        #for future in self.work_manager.as_completed(futures):
        for future in self.work_manager.submit_as_completed(
            ((_remote_min_max, (ndim, dset_dtype, n_iter, dsspec), {})
             for n_iter in range(self.iter_start, self.iter_stop)),
                self.max_queue_len):
            bounds = future.get_result(discard=True)
            for idim in range(ndim):
                current_min, current_max = data_range[idim]
                current_min = min(current_min, bounds[idim][0])
                current_max = max(current_max, bounds[idim][1])
                data_range[idim] = (current_min, current_max)
            self.progress.indicator.progress += 1

    def _construct_bins_from_scalar(self, bins):
        if self.data_range is None:
            self.scan_data_range()

        self.binbounds = []
        self.midpoints = []
        for idim in range(self.ndim):
            lb, ub = self.data_range[idim]
            # Advance just beyond the upper bound of the range, so that we catch
            # the maximum in the histogram
            ub *= 1.01

            boundset = numpy.linspace(lb, ub, bins + 1)
            midpoints = (boundset[:-1] + boundset[1:]) / 2.0
            self.binbounds.append(boundset)
            self.midpoints.append(midpoints)

    def _construct_bins_from_int_seq(self, bins):
        if self.data_range is None:
            self.scan_data_range()

        self.binbounds = []
        self.midpoints = []
        for idim in range(self.ndim):
            lb, ub = self.data_range[idim]
            # Advance just beyond the upper bound of the range, so that we catch
            # the maximum in the histogram
            ub *= 1.01

            boundset = numpy.linspace(lb, ub, bins[idim] + 1)
            midpoints = (boundset[:-1] + boundset[1:]) / 2.0
            self.binbounds.append(boundset)
            self.midpoints.append(midpoints)

    def _construct_bins_from_bound_seqs(self, bins):
        self.binbounds = []
        self.midpoints = []
        for boundset in bins:
            boundset = numpy.asarray(boundset)
            if (numpy.diff(boundset) <= 0).any():
                raise ValueError(
                    'boundary set {!r} is not strictly monotonically increasing'
                    .format(boundset))
            self.binbounds.append(boundset)
            self.midpoints.append((boundset[:-1] + boundset[1:]) / 2.0)

    def construct_histogram(self):
        '''Construct a histogram using bins previously constructed with ``construct_bins()``.
        The time series of histogram values is stored in ``histograms``.
        Each histogram in the time series is normalized.'''

        self.scan_data_shape()

        iter_count = self.iter_stop - self.iter_start
        histograms_ds = self.output_file.create_dataset(
            'histograms',
            dtype=numpy.float64,
            shape=((iter_count, ) +
                   tuple(len(bounds) - 1 for bounds in self.binbounds)),
            compression=9 if self.compress_output else None)
        binbounds = [
            numpy.require(boundset, self.dset_dtype, 'C')
            for boundset in self.binbounds
        ]

        self.progress.indicator.new_operation('Constructing histograms',
                                              self.iter_stop - self.iter_start)
        task_gen = (
            (_remote_bin_iter,
             (iiter, n_iter, self.dsspec, self.wt_dsspec,
              1 if iiter > 0 else 0, binbounds, self.ignore_out_of_range), {})
            for (iiter,
                 n_iter) in enumerate(range(self.iter_start, self.iter_stop)))
        #futures = set()
        #for iiter, n_iter in enumerate(xrange(self.iter_start, self.iter_stop)):
        #    initpoint = 1 if iiter > 0 else 0
        #    futures.add(self.work_manager.submit(_remote_bin_iter,
        #                                            args=(iiter, n_iter, self.dsspec, self.wt_dsspec, initpoint, binbounds)))

        #for future in self.work_manager.as_completed(futures):
        #future = self.work_manager.wait_any(futures)
        #for future in self.work_manager.submit_as_completed(task_gen, self.queue_size):
        log.debug('max queue length: {!r}'.format(self.max_queue_len))
        for future in self.work_manager.submit_as_completed(
                task_gen, self.max_queue_len):
            iiter, n_iter, iter_hist = future.get_result(discard=True)
            self.progress.indicator.progress += 1

            # store histogram
            histograms_ds[iiter] = iter_hist
            del iter_hist, future
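
# --- Hedged sketch (not part of the tool above) --------------------------------
# A hypothetical input module ("pdist_input.py") for the --construct-dataset option
# documented above. The returned array must be indexable as
# [segment][timepoint][dimension]; here only the first progress-coordinate dimension
# is kept.
import numpy

def first_pcoord_dim(n_iter, iter_group):
    pcoord = numpy.asarray(iter_group['pcoord'][...])   # (segments, timepoints, ndim)
    return pcoord[:, :, 0:1]                            # keep a trailing dimension axis

# Possible invocations (shell), matching the -b forms documented above:
#   w_pdist --construct-dataset pdist_input.first_pcoord_dim -b 50
#   w_pdist -b '[100, 20]'
#   w_pdist -b '[numpy.arange(0.0, 10.0, 0.1)]'
# --------------------------------------------------------------------------------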
Example #28
class WNTopTool(WESTTool):
    prog = 'w_ntop'
    description = '''\
Select walkers from bins. An assignment file mapping walkers to
bins at each timepoint is required (see ``w_assign --help`` for further
information on generating this file). By default, high-weight walkers are
selected (hence the name ``w_ntop``: select the N top-weighted walkers from
each bin); however, minimum-weight walkers and randomly selected walkers
may be selected instead.


-----------------------------------------------------------------------------
Output format
-----------------------------------------------------------------------------

The output file (-o/--output, by default "ntop.h5") contains the following
datasets:

  ``/n_iter`` [iteration]
    *(Integer)* Iteration numbers for each entry in other datasets.

  ``/n_segs`` [iteration][bin]
    *(Integer)* Number of segments in each bin/state in the given iteration.
    This will generally be the same as the number requested with
    ``-n/--count`` but may be smaller if the requested number of walkers
    does not exist.

  ``/seg_ids`` [iteration][bin][segment]
    *(Integer)* Matching segments in each iteration for each bin.
    For each iteration and bin, only the first ``n_segs`` entries are
    valid. For example, the full list of matching seg_ids in bin 0 in the
    first stored iteration is ``seg_ids[0][0][:n_segs[0][0]]``.

  ``/weights`` [iteration][bin][segment]
    *(Floating-point)* Weights for each matching segment in ``/seg_ids``.


-----------------------------------------------------------------------------
Command-line arguments
-----------------------------------------------------------------------------
'''

    def __init__(self):
        super(WNTopTool, self).__init__()

        self.data_reader = WESTDataReader()
        self.iter_range = IterRangeSelection()
        self.progress = ProgressIndicatorComponent()
        self.output_file = None
        self.assignments_filename = None
        self.output_filename = None
        self.what = None
        self.timepoint = None
        self.count = None

    def add_args(self, parser):
        self.data_reader.add_args(parser)
        self.iter_range.add_args(parser)

        igroup = parser.add_argument_group('input options')
        igroup.add_argument(
            '-a',
            '--assignments',
            default='assign.h5',
            help=
            '''Use assignments from the given ASSIGNMENTS file (default: %(default)s).'''
        )

        sgroup = parser.add_argument_group('selection options')
        sgroup.add_argument(
            '-n',
            '--count',
            type=int,
            default=1,
            help=
            '''Select COUNT walkers from each iteration for each bin (default: %(default)s).'''
        )
        sgroup.add_argument(
            '-t',
            '--timepoint',
            type=int,
            default=-1,
            help=
            '''Base selection on the given TIMEPOINT within each iteration. Default (-1)
                            corresponds to the last timepoint.''')
        cgroup = parser.add_mutually_exclusive_group()
        cgroup.add_argument(
            '--highweight',
            dest='select_what',
            action='store_const',
            const='highweight',
            help='''Select COUNT highest-weight walkers from each bin.''')
        cgroup.add_argument(
            '--lowweight',
            dest='select_what',
            action='store_const',
            const='lowweight',
            help='''Select COUNT lowest-weight walkers from each bin.''')
        cgroup.add_argument(
            '--random',
            dest='select_what',
            action='store_const',
            const='random',
            help='''Select COUNT walkers randomly from each bin.''')
        parser.set_defaults(select_what='highweight')

        ogroup = parser.add_argument_group('output options')
        ogroup.add_argument(
            '-o',
            '--output',
            default='ntop.h5',
            help='''Write output to OUTPUT (default: %(default)s).''')
        self.progress.add_args(parser)

    def process_args(self, args):
        self.progress.process_args(args)
        self.data_reader.process_args(args)
        with self.data_reader:
            self.iter_range.process_args(args)
        self.what = args.select_what
        self.output_filename = args.output
        self.assignments_filename = args.assignments
        self.count = args.count
        self.timepoint = args.timepoint

    def go(self):
        self.data_reader.open('r')
        assignments_file = h5py.File(self.assignments_filename, mode='r')
        output_file = h5io.WESTPAH5File(self.output_filename, mode='w')
        pi = self.progress.indicator
        count = self.count
        timepoint = self.timepoint

        nbins = assignments_file.attrs['nbins'] + 1
        assignments_ds = assignments_file['assignments']

        iter_start, iter_stop = self.iter_range.iter_start, self.iter_range.iter_stop
        iter_count = iter_stop - iter_start
        h5io.check_iter_range_least(assignments_ds, iter_start, iter_stop)
        nsegs = assignments_file['nsegs'][h5io.get_iteration_slice(
            assignments_file['nsegs'], iter_start, iter_stop)]

        output_file.create_dataset('n_iter',
                                   dtype=n_iter_dtype,
                                   data=list(range(iter_start, iter_stop)))

        seg_count_ds = output_file.create_dataset('nsegs',
                                                  dtype=numpy.uint,
                                                  shape=(iter_count, nbins))
        matching_segs_ds = output_file.create_dataset(
            'seg_ids',
            shape=(iter_count, nbins, count),
            dtype=seg_id_dtype,
            chunks=h5io.calc_chunksize((iter_count, nbins, count),
                                       seg_id_dtype),
            shuffle=True,
            compression=9)
        weights_ds = output_file.create_dataset('weights',
                                                shape=(iter_count, nbins,
                                                       count),
                                                dtype=weight_dtype,
                                                chunks=h5io.calc_chunksize(
                                                    (iter_count, nbins, count),
                                                    weight_dtype),
                                                shuffle=True,
                                                compression=9)
        what = self.what

        with pi:
            pi.new_operation('Finding matching segments', extent=iter_count)
            for iiter, n_iter in enumerate(range(iter_start, iter_stop)):
                assignments = numpy.require(assignments_ds[
                    h5io.get_iteration_entry(assignments_ds, n_iter) +
                    numpy.index_exp[:, timepoint]],
                                            dtype=westpa.binning.index_dtype)
                all_weights = self.data_reader.get_iter_group(
                    n_iter)['seg_index']['weight']

                # the following Cython function just executes this loop:
                #for iseg in xrange(nsegs[iiter]):
                #    segs_by_bin[iseg,assignments[iseg]] = True
                segs_by_bin = assignments_list_to_table(
                    nsegs[iiter], nbins, assignments)
                for ibin in range(nbins):
                    segs = numpy.nonzero(segs_by_bin[:, ibin])[0]

                    seg_count_ds[iiter, ibin] = min(len(segs), count)

                    if len(segs):
                        weights = all_weights.take(segs)

                        if what == 'lowweight':
                            indices = numpy.argsort(weights)[:count]
                        elif what == 'highweight':
                            indices = numpy.argsort(weights)[::-1][:count]
                        else:
                            assert what == 'random'
                            indices = numpy.random.permutation(len(weights))[:count]

                        matching_segs_ds[iiter,
                                         ibin, :len(segs)] = segs.take(indices)
                        weights_ds[iiter,
                                   ibin, :len(segs)] = weights.take(indices)
                        del segs, weights

                del assignments, segs_by_bin, all_weights
                pi.progress += 1
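
# --- Hedged sketch (not part of the tool above) --------------------------------
# Reading the output written by the tool above. Note that go() creates the per-bin
# segment-count dataset under the name 'nsegs', while the docstring calls it
# /n_segs; only the first nsegs[iiter, ibin] entries of seg_ids/weights are
# meaningful. Illustrative only.
import h5py

with h5py.File('ntop.h5', 'r') as f:
    n_iters = f['n_iter'][...]
    nsegs = f['nsegs'][...]
    for iiter, n_iter in enumerate(n_iters):
        for ibin in range(nsegs.shape[1]):
            count = nsegs[iiter, ibin]
            if count:
                ids = f['seg_ids'][iiter, ibin, :count]
                wts = f['weights'][iiter, ibin, :count]
                print(int(n_iter), ibin, list(zip(ids, wts)))
# --------------------------------------------------------------------------------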
Example #29
class WSelectTool(WESTParallelTool):
    prog='w_select'
    description = '''\
Select dynamics segments matching various criteria. This requires a
user-provided predicate function. By default, only matching segments are
stored. If the -a/--include-ancestors option is given, then matching segments
and their ancestors will be stored.


-----------------------------------------------------------------------------
Predicate function
-----------------------------------------------------------------------------

Segments are selected based on a predicate function, which must be callable
as ``predicate(n_iter, iter_group)`` and return a collection of segment IDs
matching the predicate in that iteration.

The predicate may be inverted by specifying the -v/--invert command-line
argument.


-----------------------------------------------------------------------------
Output format
-----------------------------------------------------------------------------

The output file (-o/--output, by default "select.h5") contains the following
datasets:

  ``/n_iter`` [iteration]
    *(Integer)* Iteration numbers for each entry in other datasets.

  ``/n_segs`` [iteration]
    *(Integer)* Number of segment IDs matching the predicate (or inverted
    predicate, if -v/--invert is specified) in the given iteration.

  ``/seg_ids`` [iteration][segment]
    *(Integer)* Matching segments in each iteration. For each iteration,
    only the first ``n_segs`` entries are valid. For example,
    the full list of matching seg_ids in the first stored iteration is
    ``seg_ids[0][:n_segs[0]]``.

  ``/weights`` [iteration][segment]
    *(Floating-point)* Weights for each matching segment in ``/seg_ids``.


-----------------------------------------------------------------------------
Command-line arguments
-----------------------------------------------------------------------------
'''

    def __init__(self):
        super(WSelectTool,self).__init__()

        self.data_reader = WESTDataReader()
        self.iter_range = IterRangeSelection()
        self.progress = ProgressIndicatorComponent()
        self.output_file = None
        self.output_filename = None
        self.predicate = None
        self.invert = False
        self.include_ancestors = False

    def add_args(self, parser):
        self.data_reader.add_args(parser)
        self.iter_range.add_args(parser)

        sgroup = parser.add_argument_group('selection options')
        sgroup.add_argument('-p', '--predicate-function', metavar='MODULE.FUNCTION',
                             help='''Use the given predicate function to match segments. This function
                             should take an iteration number and the HDF5 group corresponding to that
                             iteration and return a sequence of seg_ids matching the predicate, as in
                             ``match_predicate(n_iter, iter_group)``.''')
        sgroup.add_argument('-v', '--invert', dest='invert', action='store_true',
                            help='''Invert the match predicate.''')
        sgroup.add_argument('-a', '--include-ancestors', action ='store_true',
                            help='''Include ancestors of matched segments in output.''')

        ogroup = parser.add_argument_group('output options')
        ogroup.add_argument('-o', '--output', default='select.h5',
                            help='''Write output to OUTPUT (default: %(default)s).''')
        self.progress.add_args(parser)

    def process_args(self, args):
        self.progress.process_args(args)
        self.data_reader.process_args(args)
        with self.data_reader:
            self.iter_range.process_args(args)

        predicate = get_object(args.predicate_function,path=['.'])
        if not callable(predicate):
            raise TypeError('predicate object {!r} is not callable'.format(predicate))
        self.predicate = predicate
        self.invert = bool(args.invert)
        self.include_ancestors = bool(args.include_ancestors)
        self.output_filename = args.output

    def go(self):
        self.data_reader.open('r')
        output_file = h5io.WESTPAH5File(self.output_filename, mode='w')
        pi = self.progress.indicator

        iter_start, iter_stop = self.iter_range.iter_start, self.iter_range.iter_stop
        iter_count = iter_stop - iter_start

        output_file.create_dataset('n_iter', dtype=n_iter_dtype, data=range(iter_start,iter_stop))
        current_seg_count = 0
        seg_count_ds = output_file.create_dataset('n_segs', dtype=numpy.uint, shape=(iter_count,))
        matching_segs_ds = output_file.create_dataset('seg_ids', shape=(iter_count,0), maxshape=(iter_count,None),
                                                      dtype=seg_id_dtype,
                                                      chunks=h5io.calc_chunksize((iter_count,1000000), seg_id_dtype),
                                                      shuffle=True, compression=9)
        weights_ds = output_file.create_dataset('weights', shape=(iter_count,0), maxshape=(iter_count,None),
                                                dtype=weight_dtype,
                                                chunks=h5io.calc_chunksize((iter_count,1000000), weight_dtype),
                                                shuffle=True,compression=9)

        with pi:
            pi.new_operation('Finding matching segments', extent=iter_count)
#             futures = set()
#             for n_iter in xrange(iter_start,iter_stop):
#                 futures.add(self.work_manager.submit(_find_matching_segments, 
#                                                      args=(self.data_reader.we_h5filename,n_iter,self.predicate,self.invert)))

#             for future in self.work_manager.as_completed(futures):
            for future in self.work_manager.submit_as_completed(((_find_matching_segments,
                                                                  (self.data_reader.we_h5filename,n_iter,self.predicate,self.invert),
                                                                  {}) for n_iter in range(iter_start,iter_stop)),
                                                                self.max_queue_len):
                n_iter, matching_ids = future.get_result()
                n_matches = len(matching_ids)

                if n_matches:
                    if n_matches > current_seg_count:
                        matching_segs_ds.resize((iter_count,n_matches))
                        weights_ds.resize((iter_count,n_matches))
                        current_seg_count = n_matches

                    matching_ids = sorted(matching_ids)
                    seg_count_ds[n_iter-iter_start] = n_matches
                    matching_segs_ds[n_iter-iter_start,:n_matches] = matching_ids
                    weights_ds[n_iter-iter_start,:n_matches] = self.data_reader.get_iter_group(n_iter)['seg_index']['weight'][matching_ids]
                del matching_ids
                pi.progress += 1

            if self.include_ancestors:
                pi.new_operation('Tracing ancestors of matching segments', extent=iter_count)
                from_previous = set()
                current_seg_count = matching_segs_ds.shape[1]
                for n_iter in range(iter_stop-1, iter_start-1, -1):
                    iiter = n_iter - iter_start
                    n_matches = seg_count_ds[iiter]
                    matching_ids = set(from_previous)
                    if n_matches:
                        matching_ids.update(matching_segs_ds[iiter, :seg_count_ds[iiter]])
                    from_previous.clear()

                    n_matches = len(matching_ids)
                    if n_matches > current_seg_count:
                        matching_segs_ds.resize((iter_count,n_matches))
                        weights_ds.resize((iter_count,n_matches))
                        current_seg_count = n_matches

                    if n_matches > 0:
                        seg_count_ds[iiter] = n_matches
                        matching_ids = sorted(matching_ids)
                        matching_segs_ds[iiter,:n_matches] = matching_ids
                        weights_ds[iiter,:n_matches] = self.data_reader.get_iter_group(n_iter)['seg_index']['weight'][matching_ids]
                        parent_ids = self.data_reader.get_iter_group(n_iter)['seg_index']['parent_id'][matching_ids]
                        from_previous.update(parent_id for parent_id in parent_ids if parent_id >= 0) # filter initial states
                        del parent_ids
                    del matching_ids
                    pi.progress += 1
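
# --- Hedged sketch (not part of the tool above) --------------------------------
# A hypothetical predicate module ("select_preds.py") for the -p/--predicate-function
# option documented above: return the seg_ids whose final progress-coordinate value
# exceeds a threshold. The threshold and the one-dimensional pcoord layout are
# illustrative assumptions.
import numpy

def pcoord_above(n_iter, iter_group):
    final_pcoord = iter_group['pcoord'][:, -1, 0]
    return numpy.nonzero(final_pcoord > 5.0)[0]

# Shell usage (matching the arguments defined above):
#   w_select -p select_preds.pcoord_above --include-ancestors -o select.h5
# --------------------------------------------------------------------------------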
Example #30
class WPDist(WESTParallelTool):
    prog='w_pdist'
    description = '''\
Calculate time-resolved, multi-dimensional probability distributions of WE 
datasets.


-----------------------------------------------------------------------------
Source data
-----------------------------------------------------------------------------

Source data is provided either by a user-specified function
(--construct-dataset) or a list of "data set specifications" (--dsspecs).
If neither is provided, the progress coordinate dataset ``pcoord`` is used.

To use a custom function to extract or calculate data whose probability
distribution will be calculated, specify the function in standard Python
MODULE.FUNCTION syntax as the argument to --construct-dataset. This function
will be called as function(n_iter,iter_group), where n_iter is the iteration
whose data are being considered and iter_group is the corresponding group
in the main WEST HDF5 file (west.h5). The function must return data which can
be indexed as [segment][timepoint][dimension].

To use a list of data set specifications, specify --dsspecs and then list the
desired datasets one-by-one (space-separated in most shells). These data set
specifications are formatted as NAME[,file=FILENAME,slice=SLICE], which will
use the dataset called NAME in the HDF5 file FILENAME (defaulting to the main
WEST HDF5 file west.h5), and slice it with the Python slice expression SLICE
(as in [0:2] to select the first two elements of the first axis of the
dataset). The ``slice`` option is most useful for selecting one column (or
more) from a multi-column dataset, such as arises when using a progress
coordinate of multiple dimensions.


-----------------------------------------------------------------------------
Histogram binning
-----------------------------------------------------------------------------

By default, histograms are constructed with 100 bins in each dimension. This
can be overridden by specifying -b/--bins, which accepts a number of different
kinds of arguments:

  a single integer N
    N uniformly spaced bins will be used in each dimension.
    
  a sequence of integers N1,N2,... (comma-separated)
    N1 uniformly spaced bins will be used for the first dimension, N2 for the
    second, and so on.
    
  a list of lists [[B11, B12, B13, ...], [B21, B22, B23, ...], ...]
    The bin boundaries B11, B12, B13, ... will be used for the first dimension,
    B21, B22, B23, ... for the second dimension, and so on. These bin
    boundaries need not be uniformly spaced. These expressions will be
    evaluated with Python's ``eval`` construct, with ``numpy`` available for
    use [e.g. to specify bins using numpy.arange()].

The first two forms (integer, list of integers) will trigger a scan of all
data in each dimension in order to determine the minimum and maximum values,
which may be very expensive for large datasets. This can be avoided by
explicitly providing bin boundaries using the list-of-lists form.

Note that these bins are *NOT* at all related to the bins used to drive WE
sampling.


-----------------------------------------------------------------------------
Output format
-----------------------------------------------------------------------------

The output file produced (specified by -o/--output, defaulting to "pdist.h5")
may be fed to plothist to generate plots (or appropriately processed text or
HDF5 files) from this data. In short, the following datasets are created:

  ``histograms``
    Normalized histograms. The first axis corresponds to iteration, and
    remaining axes correspond to dimensions of the input dataset.
    
  ``/binbounds_0``
    Vector of bin boundaries for the first (index 0) dimension. Additional
    datasets similarly named (/binbounds_1, /binbounds_2, ...) are created
    for additional dimensions.
    
  ``/midpoints_0``
    Vector of bin midpoints for the first (index 0) dimension. Additional
    datasets similarly named are created for additional dimensions.
    
  ``n_iter``
    Vector of iteration numbers corresponding to the stored histograms (i.e.
    the first axis of the ``histograms`` dataset).


-----------------------------------------------------------------------------
Subsequent processing
-----------------------------------------------------------------------------

The output generated by this program (-o/--output, default "pdist.h5") may be
plotted by the ``plothist`` program. See ``plothist --help`` for more
information.

    
-----------------------------------------------------------------------------
Parallelization
-----------------------------------------------------------------------------

This tool supports parallelized binning, including reading of input data.
Parallel processing is the default. For simple cases (reading pre-computed
input data, modest numbers of segments), serial processing (--serial) may be
more efficient.


-----------------------------------------------------------------------------
Command-line options
-----------------------------------------------------------------------------
    
'''
    
    def __init__(self):
        super(WPDist,self).__init__()
        
        # Parallel processing by default (this is not actually necessary, but it is
        # informative!)
        self.wm_env.default_work_manager = self.wm_env.default_parallel_work_manager
        
        # These are used throughout
        self.progress = ProgressIndicatorComponent()
        self.data_reader = WESTDataReader()
        self.input_dssynth = WESTDSSynthesizer(default_dsname='pcoord')
        self.iter_range = IterRangeSelection(self.data_reader)
        self.iter_range.include_args['iter_step'] = False
        self.binspec = None
        self.output_filename = None
        self.output_file = None
        
        
        self.dsspec = None
        self.wt_dsspec = None # dsspec for weights
        
        # These are used during histogram generation only
        self.iter_start = None
        self.iter_stop = None
        self.ndim = None
        self.ntimepoints = None
        self.dset_dtype = None
        self.binbounds = None  # bin boundaries for each dimension
        self.midpoints = None  # bin midpoints for each dimension 
        self.data_range = None # data range for each dimension, as the pairs (min,max)
        self.ignore_out_of_range = False
        self.compress_output = False
        
    
    def add_args(self, parser):
        self.data_reader.add_args(parser)
         
        self.iter_range.add_args(parser)
                
        parser.add_argument('-b', '--bins', dest='bins', metavar='BINEXPR', default='100',
                            help='''Use BINEXPR for bins. This may be an integer, which will be used for each
                            dimension of the progress coordinate; a list of integers (formatted as [n1,n2,...])
                            which will use n1 bins for the first dimension, n2 for the second dimension, and so on;
                            or a list of lists of boundaries (formatted as [[a1, a2, ...], [b1, b2, ...], ... ]), which
                            will use [a1, a2, ...] as bin boundaries for the first dimension, [b1, b2, ...] as bin boundaries
                            for the second dimension, and so on. (Default: 100 bins in each dimension.)''')
        
        parser.add_argument('-o', '--output', dest='output', default='pdist.h5',
                            help='''Store results in OUTPUT (default: %(default)s).''')
        parser.add_argument('-C', '--compress', action='store_true', 
                            help='''Compress histograms. May make storage of higher-dimensional histograms
                            more tractable, at the (possible extreme) expense of increased analysis time.
                            (Default: no compression.)''')
        
        parser.add_argument('--loose', dest='ignore_out_of_range', action='store_true',
                            help='''Ignore values that do not fall within bins. (Risky, as this can make buggy bin
                            boundaries appear as reasonable data. Only use if you are
                            sure of your bin boundary specification.)''')
        
        igroup = parser.add_argument_group('input dataset options').add_mutually_exclusive_group(required=False)

        igroup.add_argument('--construct-dataset',
                            help='''Use the given function (as in module.function) to extract source data.
                            This function will be called once per iteration as function(n_iter, iter_group)
                            to construct data for one iteration. Data returned must be indexable as
                            [seg_id][timepoint][dimension]''')
        
        igroup.add_argument('--dsspecs', nargs='+', metavar='DSSPEC',
                            help='''Construct probability distribution from one or more DSSPECs.''')
        
        self.progress.add_args(parser)
        
    def process_args(self, args):
        self.progress.process_args(args)
        self.data_reader.process_args(args)
        self.input_dssynth.h5filename = self.data_reader.we_h5filename
        self.input_dssynth.process_args(args)
        self.dsspec = self.input_dssynth.dsspec
        
        # Carrying an open HDF5 file across a fork() seems to corrupt the entire HDF5 library
        # Open the WEST HDF5 file just long enough to process our iteration range, then close
        # and reopen in go() [which executes after the fork]
        with self.data_reader:
            self.iter_range.process_args(args)
        
        self.wt_dsspec = SingleIterDSSpec(self.data_reader.we_h5filename, 'seg_index', slice=numpy.index_exp['weight'])
        
        self.binspec = args.bins
        self.output_filename = args.output
        self.ignore_out_of_range = bool(args.ignore_out_of_range)
        self.compress_output = args.compress or False
        
    
    def go(self):
        self.data_reader.open('r')
        pi = self.progress.indicator
        pi.operation = 'Initializing'
        with pi:
            self.output_file = h5py.File(self.output_filename, 'w')
            h5io.stamp_creator_data(self.output_file)
            
            self.iter_start = self.iter_range.iter_start
            self.iter_stop = self.iter_range.iter_stop
    
            # Construct bin boundaries
            self.construct_bins(self.parse_binspec(self.binspec))
            for idim, (binbounds, midpoints) in enumerate(izip(self.binbounds, self.midpoints)):
                self.output_file['binbounds_{}'.format(idim)] = binbounds
                self.output_file['midpoints_{}'.format(idim)] = midpoints
    
            # construct histogram
            self.construct_histogram()
    
            # Record iteration range        
            iter_range = self.iter_range.iter_range()
            self.output_file['n_iter'] = iter_range
            self.iter_range.record_data_iter_range(self.output_file['histograms'])
            
            self.output_file.close()

    @staticmethod    
    def parse_binspec(binspec):
        namespace = {'numpy': numpy,
                     'inf': float('inf')}
                     
        try:
            binspec_compiled = eval(binspec,namespace)
        except Exception as e:
            raise ValueError('invalid bin specification: {!r}'.format(e))
        else:
            if log.isEnabledFor(logging.DEBUG):
                log.debug('bin specs: {!r}'.format(binspec_compiled))
        return binspec_compiled
    
        
    def construct_bins(self, bins):
        '''
        Construct bins according to ``bins``, which may be:
        
          1) A scalar integer (for that number of bins in each dimension)
          2) A sequence of integers (specifying number of bins for each dimension)
          3) A sequence of sequences of bin boundaries (specifying boundaries for each dimension)
          
        Sets ``self.binbounds`` to a list of arrays of bin boundaries appropriate for passing to 
        fasthist.histnd, along with ``self.midpoints`` to the midpoints of the bins.
        '''
        
        if not isiterable(bins):
            self._construct_bins_from_scalar(bins)
        elif not isiterable(bins[0]):
            self._construct_bins_from_int_seq(bins)
        else:
            self._construct_bins_from_bound_seqs(bins)
            
        if log.isEnabledFor(logging.DEBUG):
            log.debug('binbounds: {!r}'.format(self.binbounds))
            
    def scan_data_shape(self):
        if self.ndim is None:
            dset = self.dsspec.get_iter_data(self.iter_start)
            self.ntimepoints = dset.shape[1]
            self.ndim = dset.shape[2]
            self.dset_dtype = dset.dtype
        
            
    def scan_data_range(self):
        '''Scan input data for range in each dimension. The number of dimensions is determined
        from the shape of the progress coordinate as of self.iter_start.'''
        
        self.progress.indicator.new_operation('Scanning for data range', self.iter_stop-self.iter_start)
        self.scan_data_shape()
        
                
        dset_dtype = self.dset_dtype
        ndim = self.ndim
        dsspec = self.dsspec
        
        try:
            minval = numpy.finfo(dset_dtype).min
            maxval = numpy.finfo(dset_dtype).max
        except ValueError:
            minval = numpy.iinfo(dset_dtype).min
            maxval = numpy.iinfo(dset_dtype).max
        
        data_range = self.data_range = [(maxval,minval) for _i in xrange(self.ndim)]

        #futures = []
        #for n_iter in xrange(self.iter_start, self.iter_stop):
            #_remote_min_max(ndim, dset_dtype, n_iter, dsspec)
        #    futures.append(self.work_manager.submit(_remote_min_max, args=(ndim, dset_dtype, n_iter, dsspec)))
        
        #for future in self.work_manager.as_completed(futures):
        for future in self.work_manager.submit_as_completed(((_remote_min_max, (ndim, dset_dtype, n_iter, dsspec), {})
                                                             for n_iter in xrange(self.iter_start, self.iter_stop)),
                                                            self.max_queue_len):
            bounds = future.get_result(discard=True)
            for idim in xrange(ndim):
                current_min, current_max = data_range[idim]
                current_min = min(current_min, bounds[idim][0])
                current_max = max(current_max, bounds[idim][1])
                data_range[idim] = (current_min, current_max)
            self.progress.indicator.progress += 1

    def _construct_bins_from_scalar(self, bins):
        if self.data_range is None:
            self.scan_data_range()        

        self.binbounds = []
        self.midpoints = []        
        for idim in xrange(self.ndim):
            lb, ub = self.data_range[idim]
            # Advance just beyond the upper bound of the range, so that we catch 
            # the maximum in the histogram
            ub *= 1.01
            
            boundset = numpy.linspace(lb,ub,bins+1)
            midpoints = (boundset[:-1] + boundset[1:]) / 2.0
            self.binbounds.append(boundset)
            self.midpoints.append(midpoints)
            
    def _construct_bins_from_int_seq(self, bins):
        if self.data_range is None:
            self.scan_data_range()        

        self.binbounds = []
        self.midpoints = []        
        for idim in xrange(self.ndim):
            lb, ub = self.data_range[idim]
            # Advance just beyond the upper bound of the range, so that we catch 
            # the maximum in the histogram
            ub *= 1.01
            
            boundset = numpy.linspace(lb,ub,bins[idim]+1)
            midpoints = (boundset[:-1] + boundset[1:]) / 2.0
            self.binbounds.append(boundset)
            self.midpoints.append(midpoints)
               
    def _construct_bins_from_bound_seqs(self, bins):
        self.binbounds = []
        self.midpoints = []
        for boundset in bins:
            boundset = numpy.asarray(boundset)
            if (numpy.diff(boundset) <= 0).any():
                raise ValueError('boundary set {!r} is not strictly monotonically increasing'.format(boundset))
            self.binbounds.append(boundset)
            self.midpoints.append((boundset[:-1]+boundset[1:])/2.0)
            
    def construct_histogram(self):
        '''Construct a histogram using bins previously constructed with ``construct_bins()``.
        The time series of histogram values is stored in ``histograms``.
        Each histogram in the time series is normalized.'''
        
        self.scan_data_shape()
        
        iter_count = self.iter_stop - self.iter_start
        histograms_ds = self.output_file.create_dataset('histograms', dtype=numpy.float64,
                                                        shape=((iter_count,) + tuple(len(bounds)-1 for bounds in self.binbounds)),
                                                        compression=9 if self.compress_output else None)
        binbounds = [numpy.require(boundset, self.dset_dtype, 'C') for boundset in self.binbounds]
        
        self.progress.indicator.new_operation('Constructing histograms',self.iter_stop-self.iter_start)
        task_gen = ((_remote_bin_iter, (iiter, n_iter, self.dsspec, self.wt_dsspec, 1 if iiter > 0 else 0, binbounds,
                                        self.ignore_out_of_range), {}) 
                    for (iiter,n_iter) in enumerate(xrange(self.iter_start, self.iter_stop)))
        #futures = set()
        #for iiter, n_iter in enumerate(xrange(self.iter_start, self.iter_stop)):
        #    initpoint = 1 if iiter > 0 else 0
        #    futures.add(self.work_manager.submit(_remote_bin_iter,
        #                                            args=(iiter, n_iter, self.dsspec, self.wt_dsspec, initpoint, binbounds)))
        
        #for future in self.work_manager.as_completed(futures):
            #future = self.work_manager.wait_any(futures)
        #for future in self.work_manager.submit_as_completed(task_gen, self.queue_size):
        log.debug('max queue length: {!r}'.format(self.max_queue_len))
        for future in self.work_manager.submit_as_completed(task_gen, self.max_queue_len):
            iiter, n_iter, iter_hist = future.get_result(discard=True)
            self.progress.indicator.progress += 1

            # store histogram
            histograms_ds[iiter] = iter_hist
            del iter_hist, future
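
# --- Hedged sketch (not part of the tool above) --------------------------------
# Reading the pdist.h5 output documented above (``histograms``, ``midpoints_0``,
# ``n_iter``) and averaging the per-iteration histograms of a one-dimensional
# dataset; illustrative only (plothist is the supported plotting route).
import h5py
import numpy

with h5py.File('pdist.h5', 'r') as f:
    hists = f['histograms'][...]      # shape: (iterations, nbins) for 1-D input data
    mids = f['midpoints_0'][...]
    avg = hists.mean(axis=0)
    for x, p in zip(mids, avg):
        print(x, p)
# --------------------------------------------------------------------------------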
Example #31
class WFluxanlTool(WESTTool):
    prog='w_fluxanl'
    description = '''\
Extract fluxes into pre-defined target states from WEST data,
average, and construct confidence intervals. Monte Carlo bootstrapping
is used to account for the correlated and possibly non-Gaussian statistical
error in flux measurements.

All non-graphical output (including that to the terminal and HDF5) assumes that
the propagation/resampling period ``tau`` is equal to unity; to obtain results
in familiar units, divide all fluxes and multiply all correlation lengths by
the true value of ``tau``.
'''
    
    output_format_version = 2

    def __init__(self):
        super(WFluxanlTool,self).__init__()
        self.data_reader = WESTDataReader()
        self.iter_range = IterRangeSelection()
        self.output_h5file = None
        self.output_group = None
        self.target_groups = {}

        self.fluxdata = {}
        
        self.alpha = None
        self.autocorrel_alpha = None
        self.n_sets = None
        self.do_evol = False
        self.evol_step = 1
        
    def add_args(self, parser):
        self.data_reader.add_args(parser)
        self.iter_range.add_args(parser)
        ogroup = parser.add_argument_group('output options')
        ogroup.add_argument('-o', '--output', default='fluxanl.h5',
                            help='Store intermediate data and analysis results to OUTPUT (default: %(default)s).')
        cgroup = parser.add_argument_group('calculation options')
        cgroup.add_argument('--disable-bootstrap', '-db', dest='bootstrap', action='store_const', const=False,
                             help='''Disable the use of Monte Carlo block bootstrapping.''')
        cgroup.add_argument('--disable-correl', '-dc', dest='correl', action='store_const', const=False,
                             help='''Disable the correlation analysis.''')
        cgroup.add_argument('-a', '--alpha', type=float, default=0.05, 
                             help='''Calculate a (1-ALPHA) confidence interval on the average flux
                             (default: %(default)s)''')
        cgroup.add_argument('--autocorrel-alpha', type=float, dest='acalpha', metavar='ACALPHA',
                             help='''Evaluate autocorrelation of flux to (1-ACALPHA) significance.
                             Note that too small an ACALPHA will result in failure to detect autocorrelation
                             in a noisy flux signal. (Default: same as ALPHA.)''')
        cgroup.add_argument('-N', '--nsets', type=int,
                             help='''Use NSETS samples for bootstrapping (default: chosen based on ALPHA)''')
        cgroup.add_argument('--evol', action='store_true', dest='do_evol',
                            help='''Calculate time evolution of flux confidence intervals (expensive).''')
        cgroup.add_argument('--evol-step', type=int, default=1, metavar='ESTEP',
                            help='''Calculate time evolution of flux confidence intervals every ESTEP
                            iterations (default: %(default)s)''')
        
        
    def process_args(self, args):
        self.data_reader.process_args(args)
        self.data_reader.open()
        self.iter_range.data_manager = self.data_reader
        self.iter_range.process_args(args)
        
        self.output_h5file = h5py.File(args.output, 'w')
        
        self.alpha = args.alpha
        # Disable the bootstrap or the correlation analysis.
        self.mcbs_enable = args.bootstrap if args.bootstrap is not None else True
        self.do_correl = args.correl if args.correl is not None else True
        self.autocorrel_alpha = args.acalpha or self.alpha
        self.n_sets = args.nsets or mclib.get_bssize(self.alpha)
        
        self.do_evol = args.do_evol
        self.evol_step = args.evol_step or 1
                
    def calc_store_flux_data(self):         
        westpa.rc.pstatus('Calculating mean flux and confidence intervals for iterations [{},{})'
                        .format(self.iter_range.iter_start, self.iter_range.iter_stop))
        
        fluxdata = extract_fluxes(self.iter_range.iter_start, self.iter_range.iter_stop, self.data_reader)
        
        # Create a group to store data in
        output_group = h5io.create_hdf5_group(self.output_h5file, 'target_flux', replace=False, creating_program=self.prog)        
        self.output_group = output_group
        output_group.attrs['version_code'] = self.output_format_version
        self.iter_range.record_data_iter_range(output_group)
        
        n_targets = len(fluxdata)
        index = numpy.empty((len(fluxdata),), dtype=target_index_dtype)
        avg_fluxdata = numpy.empty((n_targets,), dtype=ci_dtype)
        

        for itarget, (target_label, target_fluxdata) in enumerate(fluxdata.iteritems()):
            # Create group and index entry
            index[itarget]['target_label'] = str(target_label)
            target_group = output_group.create_group('target_{}'.format(itarget))

            self.target_groups[target_label] = target_group
            
            # Store per-iteration values
            target_group['n_iter'] = target_fluxdata['n_iter']
            target_group['count'] = target_fluxdata['count']
            target_group['flux'] = target_fluxdata['flux']
            h5io.label_axes(target_group['flux'], ['n_iter'], units=['tau^-1'])
            
            
            # Calculate flux autocorrelation
            fluxes = target_fluxdata['flux']
            mean_flux = fluxes.mean()
            fmm = fluxes - mean_flux
            acorr = fftconvolve(fmm,fmm[::-1])
            acorr = acorr[len(acorr)//2:]
            acorr /= acorr[0]
            acorr_ds = target_group.create_dataset('flux_autocorrel', data=acorr)
            h5io.label_axes(acorr_ds, ['lag'], ['tau'])
            
            # Calculate overall averages and CIs
            #avg, lb_ci, ub_ci, correl_len = mclib.mcbs_ci_correl(fluxes, numpy.mean, self.alpha, self.n_sets,
            #                                                     autocorrel_alpha=self.autocorrel_alpha, subsample=numpy.mean)
            avg, lb_ci, ub_ci, sterr, correl_len = mclib.mcbs_ci_correl({'dataset': fluxes}, estimator=(lambda stride, dataset: numpy.mean(dataset)), alpha=self.alpha, n_sets=self.n_sets,
                                                                 autocorrel_alpha=self.autocorrel_alpha, subsample=numpy.mean, do_correl=self.do_correl, mcbs_enable=self.mcbs_enable )
            avg_fluxdata[itarget] = (self.iter_range.iter_start, self.iter_range.iter_stop, avg, lb_ci, ub_ci, sterr, correl_len)
            westpa.rc.pstatus('target {!r}:'.format(target_label))
            westpa.rc.pstatus('  correlation length = {} tau'.format(correl_len))
            westpa.rc.pstatus('  mean flux and CI   = {:e} ({:e},{:e}) tau^(-1)'.format(avg,lb_ci,ub_ci))
            index[itarget]['mean_flux'] = avg
            index[itarget]['mean_flux_ci_lb'] = lb_ci
            index[itarget]['mean_flux_ci_ub'] = ub_ci
            index[itarget]['mean_flux_correl_len'] = correl_len

        # Write index and summary        
        index_ds = output_group.create_dataset('index', data=index)
        index_ds.attrs['mcbs_alpha'] = self.alpha
        index_ds.attrs['mcbs_autocorrel_alpha'] = self.autocorrel_alpha
        index_ds.attrs['mcbs_n_sets'] = self.n_sets
        
        self.fluxdata = fluxdata
        self.output_h5file['avg_flux'] = avg_fluxdata
        
        
         
    def calc_evol_flux(self):
        westpa.rc.pstatus('Calculating cumulative evolution of flux confidence intervals every {} iteration(s)'
                        .format(self.evol_step))
        
        for itarget, (target_label, target_fluxdata) in enumerate(self.fluxdata.iteritems()):
            fluxes = target_fluxdata['flux']
            target_group = self.target_groups[target_label]
            iter_start = target_group['n_iter'][0]
            iter_stop  = target_group['n_iter'][-1]
            iter_count = iter_stop - iter_start
            n_blocks = iter_count // self.evol_step
            if iter_count % self.evol_step > 0: n_blocks += 1
            
            cis = numpy.empty((n_blocks,), dtype=ci_dtype)
            
            for iblock in xrange(n_blocks):
                block_iter_stop = min(iter_start + (iblock+1)*self.evol_step, iter_stop)
                istop = min((iblock+1)*self.evol_step, len(target_fluxdata['flux']))
                fluxes = target_fluxdata['flux'][:istop]
                
                #avg, ci_lb, ci_ub, correl_len = mclib.mcbs_ci_correl(fluxes, numpy.mean, self.alpha, self.n_sets,
                #                                                     autocorrel_alpha = self.autocorrel_alpha,
                #                                                     subsample=numpy.mean)
                avg, ci_lb, ci_ub, sterr, correl_len = mclib.mcbs_ci_correl({'dataset': fluxes}, estimator=(lambda stride, dataset: numpy.mean(dataset)), alpha=self.alpha, n_sets=self.n_sets,
                                                                     autocorrel_alpha = self.autocorrel_alpha,
                                                                     subsample=numpy.mean, do_correl=self.do_correl, mcbs_enable=self.mcbs_enable )
                cis[iblock]['iter_start'] = iter_start
                cis[iblock]['iter_stop']  = block_iter_stop
                cis[iblock]['expected'], cis[iblock]['ci_lbound'], cis[iblock]['ci_ubound'] = avg, ci_lb, ci_ub
                cis[iblock]['corr_len'] = correl_len
                cis[iblock]['sterr'] = sterr
                
                del fluxes

            cis_ds = target_group.create_dataset('flux_evolution', data=cis)
            cis_ds.attrs['iter_step'] = self.evol_step
            cis_ds.attrs['mcbs_alpha'] = self.alpha
            cis_ds.attrs['mcbs_autocorrel_alpha'] = self.autocorrel_alpha
            cis_ds.attrs['mcbs_n_sets'] = self.n_sets

        
    def go(self):
        self.calc_store_flux_data()
        if self.do_evol:
            self.calc_evol_flux()
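
As noted in the tool description, all stored fluxes assume tau = 1. A small, hypothetical post-processing sketch that rescales the ``avg_flux`` records written by ``calc_store_flux_data()`` to physical units (the value of ``tau`` below is an assumed example, not something read from the data):

import h5py

tau = 100e-12  # assumed resampling period, e.g. 100 ps per WE iteration

with h5py.File('fluxanl.h5', 'r') as f:
    for rec in f['avg_flux'][...]:
        # fluxes were reported in units of 1/tau; correlation lengths in tau
        mean_flux = rec['expected'] / tau
        ci = (rec['ci_lbound'] / tau, rec['ci_ubound'] / tau)
        corr_len = rec['corr_len'] * tau
        print('iterations [{}, {}): flux {:e} CI {}  correlation time {:e} s'.format(
            rec['iter_start'], rec['iter_stop'], mean_flux, ci, corr_len))
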
Example #32
class WNTopTool(WESTTool):
    prog='w_ntop'
    description = '''\
Select walkers from bins. An assignment file mapping walkers to
bins at each timepoint is required (see ``w_assign --help`` for further
information on generating this file). By default, high-weight walkers are
selected (hence the name ``w_ntop``: select the N top-weighted walkers from
each bin); however, minimum weight walkers and randomly-selected walkers
may be selected instead.


-----------------------------------------------------------------------------
Output format
-----------------------------------------------------------------------------

The output file (-o/--output, by default "ntop.h5") contains the following
datasets:

  ``/n_iter`` [iteration]
    *(Integer)* Iteration numbers for each entry in other datasets.

  ``/nsegs`` [iteration][bin]
    *(Integer)* Number of segments in each bin/state in the given iteration.
    This will generally be the same as the number requested with
    ``-n/--count`` but may be smaller if the requested number of walkers
    does not exist.

  ``/seg_ids`` [iteration][bin][segment]
    *(Integer)* Matching segments in each iteration for each bin.
    For each iteration and bin, only the first ``nsegs`` entries are
    valid. For example, the full list of matching seg_ids in bin 0 in the
    first stored iteration is ``seg_ids[0][0][:nsegs[0][0]]``.

  ``/weights`` [iteration][bin][segment]
    *(Floating-point)* Weights for each matching segment in ``/seg_ids``.


-----------------------------------------------------------------------------
Command-line arguments
-----------------------------------------------------------------------------
'''

    def __init__(self):
        super(WNTopTool,self).__init__()

        self.data_reader = WESTDataReader()
        self.iter_range = IterRangeSelection()
        self.progress = ProgressIndicatorComponent()
        self.output_file = None
        self.assignments_filename = None
        self.output_filename = None
        self.what = None
        self.timepoint = None
        self.count = None

    def add_args(self, parser):
        self.data_reader.add_args(parser)
        self.iter_range.add_args(parser)
        
        igroup = parser.add_argument_group('input options')
        igroup.add_argument('-a', '--assignments', default='assign.h5',
                            help='''Use assignments from the given ASSIGNMENTS file (default: %(default)s).''')

        sgroup = parser.add_argument_group('selection options')
        sgroup.add_argument('-n', '--count', type=int, default=1,
                            help='''Select COUNT walkers from each iteration for each bin (default: %(default)s).''')
        sgroup.add_argument('-t', '--timepoint', type=int, default=-1,
                            help='''Base selection on the given TIMEPOINT within each iteration. Default (-1)
                            corresponds to the last timepoint.''')
        cgroup = parser.add_mutually_exclusive_group()
        cgroup.add_argument('--highweight', dest='select_what', action='store_const', const='highweight',
                            help='''Select COUNT highest-weight walkers from each bin.''')
        cgroup.add_argument('--lowweight', dest='select_what', action='store_const', const='lowweight',
                            help='''Select COUNT lowest-weight walkers from each bin.''')
        cgroup.add_argument('--random', dest='select_what', action='store_const', const='random',
                            help='''Select COUNT walkers randomly from each bin.''')
        parser.set_defaults(select_what='highweight')

        ogroup = parser.add_argument_group('output options')
        ogroup.add_argument('-o', '--output', default='ntop.h5',
                            help='''Write output to OUTPUT (default: %(default)s).''')
        self.progress.add_args(parser)

    def process_args(self, args):
        self.progress.process_args(args)
        self.data_reader.process_args(args)
        with self.data_reader:
            self.iter_range.process_args(args)
        self.what = args.select_what
        self.output_filename = args.output
        self.assignments_filename = args.assignments
        self.count = args.count
        self.timepoint = args.timepoint

    def go(self):
        self.data_reader.open('r')
        assignments_file = h5py.File(self.assignments_filename, mode='r')
        output_file = h5io.WESTPAH5File(self.output_filename, mode='w')
        pi = self.progress.indicator
        count = self.count
        timepoint = self.timepoint

        nbins = assignments_file.attrs['nbins']+1
        assignments_ds = assignments_file['assignments']

        iter_start, iter_stop = self.iter_range.iter_start, self.iter_range.iter_stop
        iter_count = iter_stop - iter_start
        h5io.check_iter_range_least(assignments_ds, iter_start, iter_stop)
        nsegs = assignments_file['nsegs'][h5io.get_iteration_slice(assignments_file['nsegs'], iter_start,iter_stop)]

        output_file.create_dataset('n_iter', dtype=n_iter_dtype, data=range(iter_start,iter_stop))

        seg_count_ds = output_file.create_dataset('nsegs', dtype=numpy.uint, shape=(iter_count,nbins))
        matching_segs_ds = output_file.create_dataset('seg_ids', shape=(iter_count,nbins,count),
                                                      dtype=seg_id_dtype,
                                                      chunks=h5io.calc_chunksize((iter_count,nbins,count), seg_id_dtype),
                                                      shuffle=True, compression=9)
        weights_ds = output_file.create_dataset('weights', shape=(iter_count,nbins,count),
                                                dtype=weight_dtype,
                                                chunks=h5io.calc_chunksize((iter_count,nbins,count), weight_dtype),
                                                shuffle=True,compression=9)
        what = self.what

        with pi:
            pi.new_operation('Finding matching segments', extent=iter_count)
            for iiter, n_iter in enumerate(xrange(iter_start, iter_stop)):
                assignments = numpy.require(assignments_ds[h5io.get_iteration_entry(assignments_ds, n_iter)
                                                           + numpy.index_exp[:,timepoint]], dtype=westpa.binning.index_dtype)
                all_weights = self.data_reader.get_iter_group(n_iter)['seg_index']['weight']

                # the following Cython function just executes this loop:
                #for iseg in xrange(nsegs[iiter]):
                #    segs_by_bin[iseg,assignments[iseg]] = True
                segs_by_bin = assignments_list_to_table(nsegs[iiter],nbins,assignments)
                for ibin in xrange(nbins):
                    segs = numpy.nonzero(segs_by_bin[:,ibin])[0]

                    seg_count_ds[iiter,ibin] = min(len(segs),count)

                    if len(segs):
                        weights = all_weights.take(segs)

                        if what == 'lowweight':
                            indices = numpy.argsort(weights)[:count]
                        elif what == 'highweight':
                            indices = numpy.argsort(weights)[::-1][:count]
                        else:
                            assert what == 'random'
                            indices = numpy.random.permutation(len(weights))[:count]

                        matching_segs_ds[iiter,ibin,:len(segs)] = segs.take(indices)
                        weights_ds[iiter,ibin,:len(segs)] = weights.take(indices)
                        del segs, weights

                del assignments, segs_by_bin, all_weights
                pi.progress += 1
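
A brief sketch of reading the output layout documented above (the segment-count dataset is created as ``nsegs`` in ``go()``; the file name is the default ``ntop.h5``):

import h5py

with h5py.File('ntop.h5', 'r') as f:
    n_iter = f['n_iter'][...]     # [iteration]
    nsegs = f['nsegs'][...]       # [iteration][bin]
    seg_ids = f['seg_ids'][...]   # [iteration][bin][segment]
    weights = f['weights'][...]   # [iteration][bin][segment]

# only the first nsegs[i, b] entries of seg_ids[i, b] and weights[i, b] are valid
i, b = 0, 0
nvalid = nsegs[i, b]
print(n_iter[i], seg_ids[i, b, :nvalid], weights[i, b, :nvalid])
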
Example #33
class KinAvgSubcommands(WESTSubcommand):
    '''Common argument processing for w_kinavg subcommands'''
    def __init__(self, parent):
        super(KinAvgSubcommands, self).__init__(parent)

        self.data_reader = WESTDataReader()
        self.iter_range = IterRangeSelection()
        self.progress = ProgressIndicatorComponent()

        self.output_filename = None
        self.kinetics_filename = None
        self.assignment_filename = None

        self.output_file = None
        self.assignments_file = None
        self.kinetics_file = None

        self.evolution_mode = None

        self.mcbs_alpha = None
        self.mcbs_acalpha = None
        self.mcbs_nsets = None

    def stamp_mcbs_info(self, dataset):
        dataset.attrs['mcbs_alpha'] = self.mcbs_alpha
        dataset.attrs['mcbs_acalpha'] = self.mcbs_acalpha
        dataset.attrs['mcbs_nsets'] = self.mcbs_nsets

    def add_args(self, parser):
        self.progress.add_args(parser)
        self.data_reader.add_args(parser)
        self.iter_range.include_args['iter_step'] = True
        self.iter_range.add_args(parser)

        iogroup = parser.add_argument_group('input/output options')
        iogroup.add_argument(
            '-a',
            '--assignments',
            default='assign.h5',
            help='''Bin assignments and macrostate definitions are in ASSIGNMENTS
                            (default: %(default)s).''')

        # self.default_kinetics_file will be picked up as a class attribute from the appropriate subclass
        iogroup.add_argument(
            '-k',
            '--kinetics',
            default=self.default_kinetics_file,
            help='''Populations and transition rates are stored in KINETICS
                            (default: %(default)s).''')
        iogroup.add_argument(
            '-o',
            '--output',
            dest='output',
            default='kinavg.h5',
            help='''Store results in OUTPUT (default: %(default)s).''')

        cgroup = parser.add_argument_group(
            'confidence interval calculation options')
        cgroup.add_argument('--alpha',
                            type=float,
                            default=0.05,
                            help='''Calculate a (1-ALPHA) confidence interval
                             (default: %(default)s)''')
        cgroup.add_argument(
            '--autocorrel-alpha',
            type=float,
            dest='acalpha',
            metavar='ACALPHA',
            help='''Evaluate autocorrelation to (1-ACALPHA) significance.
                             Note that too small an ACALPHA will result in failure to detect autocorrelation
                             in a noisy flux signal. (Default: same as ALPHA.)'''
        )
        cgroup.add_argument(
            '--nsets',
            type=int,
            help=
            '''Use NSETS samples for bootstrapping (default: chosen based on ALPHA)'''
        )

        cogroup = parser.add_argument_group('calculation options')
        cogroup.add_argument(
            '-e',
            '--evolution-mode',
            choices=['cumulative', 'blocked', 'none'],
            default='none',
            help='''How to calculate time evolution of rate estimates.
                             ``cumulative`` evaluates rates over windows starting with --start-iter and getting progressively
                             wider to --stop-iter by steps of --step-iter.
                             ``blocked`` evaluates rates over windows of width --step-iter, the first of which begins at
                             --start-iter.
                             ``none`` (the default) disables calculation of the time evolution of rate estimates.'''
        )
        cogroup.add_argument(
            '--window-frac',
            type=float,
            default=1.0,
            help=
            '''Fraction of iterations to use in each window when running in ``cumulative`` mode.
                             The (1 - frac) fraction of iterations will be discarded from the start of each window.'''
        )

    def open_files(self):
        self.output_file = h5io.WESTPAH5File(self.output_filename,
                                             'w',
                                             creating_program=True)
        h5io.stamp_creator_data(self.output_file)
        self.assignments_file = h5io.WESTPAH5File(
            self.assignments_filename,
            'r')  #, driver='core', backing_store=False)
        self.kinetics_file = h5io.WESTPAH5File(
            self.kinetics_filename,
            'r')  #, driver='core', backing_store=False)
        if not self.iter_range.check_data_iter_range_least(
                self.assignments_file):
            raise ValueError(
                'assignments data do not span the requested iterations')

        if not self.iter_range.check_data_iter_range_least(self.kinetics_file):
            raise ValueError(
                'kinetics data do not span the requested iterations')

    def process_args(self, args):
        self.progress.process_args(args)
        self.data_reader.process_args(args)
        with self.data_reader:
            self.iter_range.process_args(args, default_iter_step=None)
        if self.iter_range.iter_step is None:
            #use about 10 blocks by default
            self.iter_range.iter_step = max(
                1,
                (self.iter_range.iter_stop - self.iter_range.iter_start) // 10)

        self.output_filename = args.output
        self.assignments_filename = args.assignments
        self.kinetics_filename = args.kinetics

        self.mcbs_alpha = args.alpha
        self.mcbs_acalpha = args.acalpha if args.acalpha else self.mcbs_alpha
        self.mcbs_nsets = args.nsets if args.nsets else mclib.get_bssize(
            self.mcbs_alpha)

        self.evolution_mode = args.evolution_mode
        self.evol_window_frac = args.window_frac
        if self.evol_window_frac <= 0 or self.evol_window_frac > 1:
            raise ValueError(
                'Parameter error -- fractional window defined by --window-frac must be in (0,1]'
            )
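
The ``--evolution-mode`` help above distinguishes cumulative windows (all anchored at --start-iter and growing by --step-iter) from blocked windows (disjoint blocks of width --step-iter). A small illustrative sketch of how those window boundaries are generated (the function and variable names are local to this sketch):

def evolution_windows(start_iter, stop_iter, step_iter, mode):
    # Yields (window_start, window_stop) pairs as described in the
    # --evolution-mode help text: cumulative windows all begin at
    # start_iter and grow by step_iter; blocked windows are disjoint
    # blocks of width step_iter.
    for block_start in range(start_iter, stop_iter, step_iter):
        stop = min(block_start + step_iter, stop_iter)
        if mode == 'cumulative':
            yield start_iter, stop
        else:  # 'blocked'
            yield block_start, stop

# e.g. iterations [1, 11) with step 5:
# cumulative -> (1, 6), (1, 11); blocked -> (1, 6), (6, 11)
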
Example #34
class WSelectTool(WESTParallelTool):
    prog = 'w_select'
    description = '''\
Select dynamics segments matching various criteria. This requires a
user-provided predicate function. By default, only matching segments are
stored. If the -a/--include-ancestors option is given, then matching segments
and their ancestors will be stored.


-----------------------------------------------------------------------------
Predicate function
-----------------------------------------------------------------------------

Segments are selected based on a predicate function, which must be callable
as ``predicate(n_iter, iter_group)`` and return a collection of segment IDs
matching the predicate in that iteration.

The predicate may be inverted by specifying the -v/--invert command-line
argument.


-----------------------------------------------------------------------------
Output format
-----------------------------------------------------------------------------

The output file (-o/--output, by default "select.h5") contains the following
datasets:

  ``/n_iter`` [iteration]
    *(Integer)* Iteration numbers for each entry in other datasets.

  ``/n_segs`` [iteration]
    *(Integer)* Number of segment IDs matching the predicate (or inverted
    predicate, if -v/--invert is specified) in the given iteration.

  ``/seg_ids`` [iteration][segment]
    *(Integer)* Matching segments in each iteration. For each iteration,
    only the first ``n_segs`` entries are valid. For example,
    the full list of matching seg_ids in the first stored iteration is
    ``seg_ids[0][:n_segs[0]]``.

  ``/weights`` [iteration][segment]
    *(Floating-point)* Weights for each matching segment in ``/seg_ids``.


-----------------------------------------------------------------------------
Command-line arguments
-----------------------------------------------------------------------------
'''

    def __init__(self):
        super(WSelectTool, self).__init__()

        self.data_reader = WESTDataReader()
        self.iter_range = IterRangeSelection()
        self.progress = ProgressIndicatorComponent()
        self.output_file = None
        self.output_filename = None
        self.predicate = None
        self.invert = False
        self.include_ancestors = False

    def add_args(self, parser):
        self.data_reader.add_args(parser)
        self.iter_range.add_args(parser)

        sgroup = parser.add_argument_group('selection options')
        sgroup.add_argument(
            '-p',
            '--predicate-function',
            metavar='MODULE.FUNCTION',
            help=
            '''Use the given predicate function to match segments. This function
                             should take an iteration number and the HDF5 group corresponding to that
                             iteration and return a sequence of seg_ids matching the predicate, as in
                             ``match_predicate(n_iter, iter_group)``.''')
        sgroup.add_argument('-v',
                            '--invert',
                            dest='invert',
                            action='store_true',
                            help='''Invert the match predicate.''')
        sgroup.add_argument(
            '-a',
            '--include-ancestors',
            action='store_true',
            help='''Include ancestors of matched segments in output.''')

        ogroup = parser.add_argument_group('output options')
        ogroup.add_argument(
            '-o',
            '--output',
            default='select.h5',
            help='''Write output to OUTPUT (default: %(default)s).''')
        self.progress.add_args(parser)

    def process_args(self, args):
        self.progress.process_args(args)
        self.data_reader.process_args(args)
        with self.data_reader:
            self.iter_range.process_args(args)

        predicate = get_object(args.predicate_function, path=['.'])
        if not callable(predicate):
            raise TypeError(
                'predicate object {!r} is not callable'.format(predicate))
        self.predicate = predicate
        self.invert = bool(args.invert)
        self.include_ancestors = bool(args.include_ancestors)
        self.output_filename = args.output

    def go(self):
        self.data_reader.open('r')
        output_file = h5io.WESTPAH5File(self.output_filename, mode='w')
        pi = self.progress.indicator

        iter_start, iter_stop = self.iter_range.iter_start, self.iter_range.iter_stop
        iter_count = iter_stop - iter_start

        output_file.create_dataset('n_iter',
                                   dtype=n_iter_dtype,
                                   data=list(range(iter_start, iter_stop)))
        current_seg_count = 0
        seg_count_ds = output_file.create_dataset('n_segs',
                                                  dtype=numpy.uint,
                                                  shape=(iter_count, ))
        matching_segs_ds = output_file.create_dataset(
            'seg_ids',
            shape=(iter_count, 0),
            maxshape=(iter_count, None),
            dtype=seg_id_dtype,
            chunks=h5io.calc_chunksize((iter_count, 1000000), seg_id_dtype),
            shuffle=True,
            compression=9)
        weights_ds = output_file.create_dataset('weights',
                                                shape=(iter_count, 0),
                                                maxshape=(iter_count, None),
                                                dtype=weight_dtype,
                                                chunks=h5io.calc_chunksize(
                                                    (iter_count, 1000000),
                                                    weight_dtype),
                                                shuffle=True,
                                                compression=9)

        with pi:
            pi.new_operation('Finding matching segments', extent=iter_count)
            #             futures = set()
            #             for n_iter in xrange(iter_start,iter_stop):
            #                 futures.add(self.work_manager.submit(_find_matching_segments,
            #                                                      args=(self.data_reader.we_h5filename,n_iter,self.predicate,self.invert)))

            #             for future in self.work_manager.as_completed(futures):
            for future in self.work_manager.submit_as_completed(
                ((_find_matching_segments,
                  (self.data_reader.we_h5filename, n_iter, self.predicate,
                   self.invert), {})
                 for n_iter in range(iter_start, iter_stop)),
                    self.max_queue_len):
                n_iter, matching_ids = future.get_result()
                n_matches = len(matching_ids)

                if n_matches:
                    if n_matches > current_seg_count:
                        matching_segs_ds.resize((iter_count, n_matches))
                        weights_ds.resize((iter_count, n_matches))
                        current_seg_count = n_matches

                    seg_count_ds[n_iter - iter_start] = n_matches
                    matching_segs_ds[n_iter -
                                     iter_start, :n_matches] = matching_ids
                    weights_ds[n_iter - iter_start, :
                               n_matches] = self.data_reader.get_iter_group(
                                   n_iter)['seg_index']['weight'][sorted(
                                       matching_ids)]
                del matching_ids
                pi.progress += 1

            if self.include_ancestors:
                pi.new_operation('Tracing ancestors of matching segments',
                                 extent=iter_count)
                from_previous = set()
                current_seg_count = matching_segs_ds.shape[1]
                for n_iter in range(iter_stop - 1, iter_start - 1, -1):
                    iiter = n_iter - iter_start
                    n_matches = seg_count_ds[iiter]
                    matching_ids = set(from_previous)
                    if n_matches:
                        matching_ids.update(
                            matching_segs_ds[iiter, :seg_count_ds[iiter]])
                    from_previous.clear()

                    n_matches = len(matching_ids)
                    if n_matches > current_seg_count:
                        matching_segs_ds.resize((iter_count, n_matches))
                        weights_ds.resize((iter_count, n_matches))
                        current_seg_count = n_matches

                    if n_matches > 0:
                        seg_count_ds[iiter] = n_matches
                        matching_ids = sorted(matching_ids)
                        matching_segs_ds[iiter, :n_matches] = matching_ids
                        weights_ds[
                            iiter, :
                            n_matches] = self.data_reader.get_iter_group(
                                n_iter)['seg_index']['weight'][sorted(
                                    matching_ids)]
                        parent_ids = self.data_reader.get_iter_group(n_iter)[
                            'seg_index']['parent_id'][sorted(matching_ids)]
                        from_previous.update(
                            parent_id for parent_id in parent_ids
                            if parent_id >= 0)  # filter initial states
                        del parent_ids
                    del matching_ids
                    pi.progress += 1
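
A minimal example of the user-supplied predicate described above: it must be callable as ``predicate(n_iter, iter_group)`` and return a sequence of matching seg_ids. This sketch selects segments above a weight threshold; the threshold value and the module name in the usage comment are illustrative only.

import numpy

def heavy_walkers(n_iter, iter_group):
    # iter_group is the per-iteration HDF5 group; its 'seg_index' table
    # carries per-segment weights, as used elsewhere in these examples
    weights = iter_group['seg_index']['weight'][...]
    return numpy.nonzero(weights > 1e-6)[0]

# used as:  w_select -p mymodule.heavy_walkers -o select.h5
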
Example #35
class StateProbTool(WESTParallelTool):
    prog = 'w_stateprobs'
    description = '''\
Calculate average state populations and associated errors from weighted
ensemble data. Bin assignments, including macrostate definitions, are
required. (See "w_assign --help" for more information.)

-----------------------------------------------------------------------------
Output format
-----------------------------------------------------------------------------

The output file (-o/--output, usually "stateprobs.h5") contains the following
dataset:

  /avg_state_pops [state]
    (Structured -- see below) Population of each state across entire
    range specified.

If --evolution-mode is specified, then the following additional dataset is
available:

  /state_pop_evolution [window][state]
    (Structured -- see below). State populations based on windows of
    iterations of varying width.  If --evolution-mode=cumulative, then
    these windows all begin at the iteration specified with
    --start-iter and grow in length by --step-iter for each successive 
    element. If --evolution-mode=blocked, then these windows are all of
    width --step-iter (excluding the last, which may be shorter), the first
    of which begins at iteration --start-iter.
    
The structure of these datasets is as follows:

  iter_start
    (Integer) Iteration at which the averaging window begins (inclusive).
    
  iter_stop
    (Integer) Iteration at which the averaging window ends (exclusive).
    
  expected
    (Floating-point) Expected (mean) value of the state population as
    evaluated within this window.

  ci_lbound
    (Floating-point) Lower bound of the confidence interval on the state
    population within this window.

  ci_ubound
    (Floating-point) Upper bound of the confidence interval on the state
    population within this window.

  corr_len
    (Integer) Correlation length of the state population within this
    window, in units of tau.

Each of these datasets is also stamped with a number of attributes:

  mcbs_alpha
    (Floating-point) Alpha value of confidence intervals. (For example, 
    *alpha=0.05* corresponds to a 95% confidence interval.)

  mcbs_nsets
    (Integer) Number of bootstrap data sets used in generating confidence
    intervals.
    
  mcbs_acalpha
    (Floating-point) Alpha value for determining correlation lengths.
   

-----------------------------------------------------------------------------
Command-line options
-----------------------------------------------------------------------------
'''

    def __init__(self):
        super(StateProbTool, self).__init__()

        self.data_reader = WESTDataReader()
        self.iter_range = IterRangeSelection()
        self.progress = ProgressIndicatorComponent()

        self.output_filename = None
        self.kinetics_filename = None

        self.output_file = None
        self.assignments_file = None

        self.evolution_mode = None

        self.mcbs_alpha = None
        self.mcbs_acalpha = None
        self.mcbs_nsets = None

    def stamp_mcbs_info(self, dataset):
        dataset.attrs['mcbs_alpha'] = self.mcbs_alpha
        dataset.attrs['mcbs_acalpha'] = self.mcbs_acalpha
        dataset.attrs['mcbs_nsets'] = self.mcbs_nsets

    def add_args(self, parser):
        self.progress.add_args(parser)
        self.data_reader.add_args(parser)
        self.iter_range.include_args['iter_step'] = True
        self.iter_range.add_args(parser)

        iogroup = parser.add_argument_group('input/output options')
        iogroup.add_argument(
            '-a',
            '--assignments',
            default='assign.h5',
            help='''Bin assignments and macrostate definitions are in ASSIGNMENTS
                            (default: %(default)s).''')
        iogroup.add_argument(
            '-o',
            '--output',
            dest='output',
            default='stateprobs.h5',
            help='''Store results in OUTPUT (default: %(default)s).''')

        cgroup = parser.add_argument_group(
            'confidence interval calculation options')
        cgroup.add_argument('--alpha',
                            type=float,
                            default=0.05,
                            help='''Calculate a (1-ALPHA) confidence interval
                             (default: %(default)s)''')
        cgroup.add_argument(
            '--autocorrel-alpha',
            type=float,
            dest='acalpha',
            metavar='ACALPHA',
            help='''Evaluate autocorrelation to (1-ACALPHA) significance.
                             Note that too small an ACALPHA will result in failure to detect autocorrelation
                             in a noisy flux signal. (Default: same as ALPHA.)'''
        )
        cgroup.add_argument(
            '--nsets',
            type=int,
            help=
            '''Use NSETS samples for bootstrapping (default: chosen based on ALPHA)'''
        )

        cogroup = parser.add_argument_group('calculation options')
        cogroup.add_argument(
            '-e',
            '--evolution-mode',
            choices=['cumulative', 'blocked', 'none'],
            default='none',
            help='''How to calculate time evolution of rate estimates.
                             ``cumulative`` evaluates rates over windows starting with --start-iter and getting progressively
                             wider to --stop-iter by steps of --step-iter.
                             ``blocked`` evaluates rates over windows of width --step-iter, the first of which begins at
                             --start-iter.
                             ``none`` (the default) disables calculation of the time evolution of rate estimates.'''
        )

    def open_files(self):
        self.output_file = h5io.WESTPAH5File(self.output_filename,
                                             'w',
                                             creating_program=True)
        h5io.stamp_creator_data(self.output_file)
        self.assignments_file = h5io.WESTPAH5File(
            self.assignments_filename,
            'r')  #, driver='core', backing_store=False)
        if not self.iter_range.check_data_iter_range_least(
                self.assignments_file):
            raise ValueError(
                'assignments data do not span the requested iterations')

    def process_args(self, args):
        self.progress.process_args(args)
        self.data_reader.process_args(args)
        with self.data_reader:
            self.iter_range.process_args(args, default_iter_step=None)
        if self.iter_range.iter_step is None:
            #use about 10 blocks by default
            self.iter_range.iter_step = max(
                1,
                (self.iter_range.iter_stop - self.iter_range.iter_start) // 10)

        self.output_filename = args.output
        self.assignments_filename = args.assignments

        self.mcbs_alpha = args.alpha
        self.mcbs_acalpha = args.acalpha if args.acalpha else self.mcbs_alpha
        self.mcbs_nsets = args.nsets if args.nsets else mclib.get_bssize(
            self.mcbs_alpha)

        self.evolution_mode = args.evolution_mode

    def calc_state_pops(self):
        start_iter, stop_iter = self.iter_range.iter_start, self.iter_range.iter_stop
        nstates = self.nstates
        state_map = self.state_map
        iter_count = stop_iter - start_iter

        pi = self.progress.indicator
        pi.new_operation('Calculating state populations')
        pops = h5io.IterBlockedDataset(
            self.assignments_file['labeled_populations'])

        iter_state_pops = numpy.empty((nstates + 1, ), weight_dtype)
        all_state_pops = numpy.empty((iter_count, nstates + 1), weight_dtype)
        avg_state_pops = numpy.zeros((nstates + 1, ), weight_dtype)
        pops.cache_data(max_size='available')
        try:
            for iiter, n_iter in enumerate(xrange(start_iter, stop_iter)):
                iter_state_pops.fill(0)
                labeled_pops = pops.iter_entry(n_iter)
                accumulate_state_populations_from_labeled(
                    labeled_pops,
                    state_map,
                    iter_state_pops,
                    check_state_map=False)
                all_state_pops[iiter] = iter_state_pops
                avg_state_pops += iter_state_pops
                del labeled_pops
                pi.progress += 1
        finally:
            pops.drop_cache()
        self.output_file.create_dataset('state_pops',
                                        data=all_state_pops,
                                        compression=9,
                                        shuffle=True)
        h5io.stamp_iter_range(self.output_file['state_pops'], start_iter,
                              stop_iter)

        self.all_state_pops = all_state_pops
        avg_state_pops = numpy.zeros((nstates + 1, ), ci_dtype)
        pi.new_operation('Calculating overall average populations and CIs',
                         nstates)

        #        futures = []
        #         for istate in xrange(nstates):
        #             futures.append(self.work_manager.submit(_eval_block,kwargs=dict(iblock=None,istate=istate,
        #                                                                             start=start_iter,stop=stop_iter,
        #                                                                             state_pops=all_state_pops[:,istate],
        #                                                                             mcbs_alpha=self.mcbs_alpha, mcbs_nsets=self.mcbs_nsets,
        #                                                                             mcbs_acalpha = self.mcbs_acalpha)))
        #         for future in self.work_manager.as_completed(futures):
        def taskgen():
            for istate in xrange(nstates):
                yield (_eval_block, (),
                       dict(iblock=None,
                            istate=istate,
                            start=start_iter,
                            stop=stop_iter,
                            state_pops=all_state_pops[:, istate],
                            mcbs_alpha=self.mcbs_alpha,
                            mcbs_nsets=self.mcbs_nsets,
                            mcbs_acalpha=self.mcbs_acalpha))

        for future in self.work_manager.submit_as_completed(
                taskgen(), self.max_queue_len):
            (_iblock, istate, ci_res) = future.get_result(discard=True)
            avg_state_pops[istate] = ci_res
            pi.progress += 1
        self.output_file['avg_state_pops'] = avg_state_pops
        self.stamp_mcbs_info(self.output_file['avg_state_pops'])
        pi.clear()

        maxlabellen = max(map(len, self.state_labels))
        print('average state populations:')
        for istate in xrange(nstates):
            print(
                '{:{maxlabellen}s}: mean={:21.15e} CI=({:21.15e}, {:21.15e})'.
                format(self.state_labels[istate],
                       avg_state_pops['expected'][istate],
                       avg_state_pops['ci_lbound'][istate],
                       avg_state_pops['ci_ubound'][istate],
                       maxlabellen=maxlabellen))

    def calc_evolution(self):
        nstates = self.nstates
        start_iter, stop_iter, step_iter = self.iter_range.iter_start, self.iter_range.iter_stop, self.iter_range.iter_step
        start_pts = range(start_iter, stop_iter, step_iter)

        pop_evol = numpy.zeros((len(start_pts), nstates), dtype=ci_dtype)

        pi = self.progress.indicator
        pi.new_operation('Calculating population evolution',
                         len(start_pts) * nstates)

        #         futures = []
        #         for iblock, start in enumerate(start_pts):
        #             if self.evolution_mode == 'cumulative':
        #                 block_start = start_iter
        #             else: # self.evolution_mode == 'blocked'
        #                 block_start = start
        #             stop = min(start+step_iter, stop_iter)
        #
        #             for istate in xrange(nstates):
        #                 future = self.work_manager.submit(_eval_block,kwargs=dict(iblock=iblock,istate=istate,
        #                                                                           start=block_start,stop=stop,
        #                                                                           state_pops=self.all_state_pops[block_start-start_iter:stop-start_iter,istate],
        #                                                                           mcbs_alpha=self.mcbs_alpha, mcbs_nsets=self.mcbs_nsets,
        #                                                                           mcbs_acalpha = self.mcbs_acalpha))
        #                 futures.append(future)
        def taskgen():
            for iblock, start in enumerate(start_pts):
                if self.evolution_mode == 'cumulative':
                    block_start = start_iter
                else:  # self.evolution_mode == 'blocked'
                    block_start = start
                stop = min(start + step_iter, stop_iter)

                for istate in xrange(nstates):
                    yield (_eval_block, (),
                           dict(
                               iblock=iblock,
                               istate=istate,
                               start=block_start,
                               stop=stop,
                               state_pops=self.all_state_pops[block_start -
                                                              start_iter:stop -
                                                              start_iter,
                                                              istate],
                               mcbs_alpha=self.mcbs_alpha,
                               mcbs_nsets=self.mcbs_nsets,
                               mcbs_acalpha=self.mcbs_acalpha))

        #for future in self.work_manager.as_completed(futures):
        for future in self.work_manager.submit_as_completed(
                taskgen(), self.max_queue_len):
            (iblock, istate, ci_res) = future.get_result(discard=True)
            pop_evol[iblock, istate] = ci_res
            pi.progress += 1

        self.output_file.create_dataset('state_pop_evolution',
                                        data=pop_evol,
                                        shuffle=True,
                                        compression=9)
        pi.clear()

    def go(self):
        pi = self.progress.indicator
        with pi:
            pi.new_operation('Initializing')
            self.open_files()
            nstates = self.nstates = self.assignments_file.attrs['nstates']

            state_labels = self.state_labels = self.assignments_file[
                'state_labels'][...]
            state_map = self.state_map = self.assignments_file['state_map'][
                ...]
            if (state_map > nstates).any():
                raise ValueError('invalid state mapping')

            # copy metadata to output
            self.output_file.attrs['nstates'] = nstates
            self.output_file['state_labels'] = state_labels

            # calculate overall averages
            self.calc_state_pops()

            # calculate evolution, if requested
            if self.evolution_mode != 'none' and self.iter_range.iter_step:
                self.calc_evolution()
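
A short sketch of reading the structured ``avg_state_pops`` dataset written above, using the record fields listed in the output-format section (the file name is the default ``stateprobs.h5``):

import h5py

with h5py.File('stateprobs.h5', 'r') as f:
    labels = f['state_labels'][...]
    avg = f['avg_state_pops'][...]
    alpha = f['avg_state_pops'].attrs['mcbs_alpha']

for label, rec in zip(labels, avg):
    # expected value and (1 - mcbs_alpha) confidence bounds on the population
    print(label, rec['expected'], rec['ci_lbound'], rec['ci_ubound'])
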
Example #36
class KinAvgSubcommands(WESTSubcommand):
    '''Common argument processing for w_kinavg subcommands'''
    
    def __init__(self, parent):
        super(KinAvgSubcommands,self).__init__(parent)
        
        self.data_reader = WESTDataReader()
        self.iter_range = IterRangeSelection()
        self.progress = ProgressIndicatorComponent()
        
        self.output_filename = None
        self.kinetics_filename = None
        self.assignment_filename = None
        
        self.output_file = None
        self.assignments_file = None
        self.kinetics_file = None
        
        self.evolution_mode = None
        
        self.mcbs_alpha = None
        self.mcbs_acalpha = None
        self.mcbs_nsets = None
        
    def stamp_mcbs_info(self, dataset):
        dataset.attrs['mcbs_alpha'] = self.mcbs_alpha
        dataset.attrs['mcbs_acalpha'] = self.mcbs_acalpha
        dataset.attrs['mcbs_nsets'] = self.mcbs_nsets
        
            
    def add_args(self, parser):
        self.progress.add_args(parser)
        self.data_reader.add_args(parser)
        self.iter_range.include_args['iter_step'] = True
        self.iter_range.add_args(parser)

        iogroup = parser.add_argument_group('input/output options')
        iogroup.add_argument('-a', '--assignments', default='assign.h5',
                            help='''Bin assignments and macrostate definitions are in ASSIGNMENTS
                            (default: %(default)s).''')
        
        # self.default_kinetics_file will be picked up as a class attribute from the appropriate subclass        
        iogroup.add_argument('-k', '--kinetics', default=self.default_kinetics_file,
                            help='''Populations and transition rates are stored in KINETICS
                            (default: %(default)s).''')
        iogroup.add_argument('-o', '--output', dest='output', default='kinavg.h5',
                            help='''Store results in OUTPUT (default: %(default)s).''')

        
        cgroup = parser.add_argument_group('confidence interval calculation options')
        cgroup.add_argument('--alpha', type=float, default=0.05, 
                             help='''Calculate a (1-ALPHA) confidence interval
                             (default: %(default)s)''')
        cgroup.add_argument('--autocorrel-alpha', type=float, dest='acalpha', metavar='ACALPHA',
                             help='''Evaluate autocorrelation to (1-ACALPHA) significance.
                             Note that too small an ACALPHA will result in failure to detect autocorrelation
                             in a noisy flux signal. (Default: same as ALPHA.)''')
        cgroup.add_argument('--nsets', type=int,
                             help='''Use NSETS samples for bootstrapping (default: chosen based on ALPHA)''')
        
        cogroup = parser.add_argument_group('calculation options')
        cogroup.add_argument('-e', '--evolution-mode', choices=['cumulative', 'blocked', 'none'], default='none',
                             help='''How to calculate time evolution of rate estimates.
                             ``cumulative`` evaluates rates over windows starting with --start-iter and getting progressively
                             wider to --stop-iter by steps of --step-iter.
                             ``blocked`` evaluates rates over windows of width --step-iter, the first of which begins at
                             --start-iter.
                             ``none`` (the default) disables calculation of the time evolution of rate estimates.''')
        cogroup.add_argument('--window-frac', type=float, default=1.0,
                             help='''Fraction of iterations to use in each window when running in ``cumulative`` mode.
                             The (1 - frac) fraction of iterations will be discarded from the start of each window.''')
        
    def open_files(self):
        self.output_file = h5io.WESTPAH5File(self.output_filename, 'w', creating_program=True)
        h5io.stamp_creator_data(self.output_file)
        self.assignments_file = h5io.WESTPAH5File(self.assignments_filename, 'r')#, driver='core', backing_store=False)
        self.kinetics_file = h5io.WESTPAH5File(self.kinetics_filename, 'r')#, driver='core', backing_store=False)
        if not self.iter_range.check_data_iter_range_least(self.assignments_file):
            raise ValueError('assignments data do not span the requested iterations')

        if not self.iter_range.check_data_iter_range_least(self.kinetics_file):
            raise ValueError('kinetics data do not span the requested iterations')

    
    def process_args(self, args):
        self.progress.process_args(args)
        self.data_reader.process_args(args)
        with self.data_reader:
            self.iter_range.process_args(args, default_iter_step=None)
        if self.iter_range.iter_step is None:
            #use about 10 blocks by default
            self.iter_range.iter_step = max(1, (self.iter_range.iter_stop - self.iter_range.iter_start) // 10)
        
        self.output_filename = args.output
        self.assignments_filename = args.assignments
        self.kinetics_filename = args.kinetics
                
        self.mcbs_alpha = args.alpha
        self.mcbs_acalpha = args.acalpha if args.acalpha else self.mcbs_alpha
        self.mcbs_nsets = args.nsets if args.nsets else mclib.get_bssize(self.mcbs_alpha)
        
        self.evolution_mode = args.evolution_mode
        self.evol_window_frac = args.window_frac
        if self.evol_window_frac <= 0 or self.evol_window_frac > 1:
            raise ValueError('Parameter error -- fractional window defined by --window-frac must be in (0,1]')
Example #37
class WCrawl(WESTParallelTool):
    prog='w_crawl'
    description = '''\
Crawl a weighted ensemble dataset, executing a function for each iteration.
This can be used for postprocessing of trajectories, cleanup of datasets,
or anything else that can be expressed as "do X for iteration N, then do
something with the result". Tasks are parallelized by iteration, and 
no guarantees are made about evaluation order.


-----------------------------------------------------------------------------
Command-line options
-----------------------------------------------------------------------------
    
'''

    def __init__(self):
        super(WCrawl,self).__init__()

        # These are used throughout
        self.progress = ProgressIndicatorComponent()
        self.data_reader = WESTDataReader()
        self.iter_range = IterRangeSelection(self.data_reader)

        self.crawler = None
        self.task_callable = None

    def add_args(self, parser):
        self.data_reader.add_args(parser)
        self.iter_range.add_args(parser)

        tgroup = parser.add_argument_group('task options')
        tgroup.add_argument('-c', '--crawler-instance',
                            help='''Use CRAWLER_INSTANCE (specified as module.instance) as an instance of
                            WESTPACrawler to coordinate the calculation. Required only if initialization,
                            finalization, or task result processing is required.''')
        tgroup.add_argument('task_callable',
                            help='''Run TASK_CALLABLE (specified as module.function) on each iteration.
                            Required.''')
        self.progress.add_args(parser)

    def process_args(self, args):
        self.progress.process_args(args)
        self.data_reader.process_args(args)
        with self.data_reader:
            self.iter_range.process_args(args)

        self.task_callable = get_object(args.task_callable, path=['.'])
        if args.crawler_instance is not None:
            self.crawler = get_object(args.crawler_instance, path=['.'])
        else:
            self.crawler = WESTPACrawler()

    def go(self):
        iter_start = self.iter_range.iter_start
        iter_stop = self.iter_range.iter_stop
        iter_count = iter_stop - iter_start
        self.data_reader.open('r')
        pi = self.progress.indicator
        with pi:
            pi.operation = 'Initializing'
            self.crawler.initialize(iter_start, iter_stop)

            try:
                pi.new_operation('Dispatching tasks & processing results', iter_count)
                task_gen = ((_remote_task, (n_iter, self.task_callable), {}) for n_iter in range(iter_start,iter_stop))
                for future in self.work_manager.submit_as_completed(task_gen, self.max_queue_len):
                    n_iter, result = future.get_result(discard=True)
                    if self.crawler is not None:
                        self.crawler.process_iter_result(n_iter,result)
                    pi.progress += 1
            finally:
                pi.new_operation('Finalizing')
                self.crawler.finalize()
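Below is a hedged sketch of the two objects w_crawl is pointed at: a task callable and an optional WESTPACrawler instance, both referenced as module.attribute. The module name, function names, import path, and the assumption that the callable receives the iteration number plus the open iteration group are illustrative; only initialize(), process_iter_result(), and finalize() are taken from the go() method above.

# Hypothetical module "my_crawl.py" -- names, the import path, and the task
# callable's (n_iter, iter_group) signature are assumptions, not taken from
# the listing above.
import numpy
from westpa.cli.tools.w_crawl import WESTPACrawler  # adjust to your WESTPA version

def calc_total_weight(n_iter, iter_group):
    '''Task executed for each iteration; its return value is handed to
    process_iter_result() on the master side.'''
    return float(numpy.sum(iter_group['seg_index']['weight']))

class TotalWeightCrawler(WESTPACrawler):
    def initialize(self, iter_start, iter_stop):
        self.totals = {}

    def process_iter_result(self, n_iter, result):
        self.totals[n_iter] = result

    def finalize(self):
        for n_iter in sorted(self.totals):
            print(n_iter, self.totals[n_iter])

crawler = TotalWeightCrawler()

# Example invocation (the module must be importable from the current directory):
#   w_crawl my_crawl.calc_total_weight -c my_crawl.crawler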
Example #38
class WNetworker(WESTTool):
    prog = "w_networker"
    description = """\
Makes a network file, which can be visualized by most graph programs, from
a transition matrix.

-----------------------------------------------------------------------------
Output format
-----------------------------------------------------------------------------

The output file (-o/--output, by default "network.gml") contains the network
described by the transition matrix found in the transition matrix file
(-tm/--transition-matrix).

-----------------------------------------------------------------------------
Command-line arguments
-----------------------------------------------------------------------------
"""

    def __init__(self):
        super(WNetworker, self).__init__()

        self.data_reader = WESTDataReader()
        self.iter_range = IterRangeSelection()
        self.progress = ProgressIndicatorComponent()
        self.output_filename = None
        self.tm_filename = None
        self.postprocess_function = None

    def add_args(self, parser):
        self.data_reader.add_args(parser)
        self.iter_range.add_args(parser)

        igroup = parser.add_argument_group("input options")
        # TODO: Get WESTPA h5 file and add some stuff from it into nodes
        igroup.add_argument(
            "-tm",
            "--transition-matrix",
            default="tm.h5",
            help="""Use transition matrix from the"""
            """resulting h5 file of w_reweigh (default: %(default)s).""",
        )

        ogroup = parser.add_argument_group("output options")
        ogroup.add_argument(
            "-o",
            "--output",
            default="network.gml",
            help="""Write output to OUTPUT (default: %(default)s).""",
        )

        ppgroup = parser.add_argument_group("postprocess options")
        ppgroup.add_argument(
            "--postprocess-function",
            help=
            """Names a function (as in module.function) that will be called just prior
                                  to saving the graph. The function will be called as ``postprocess(G, tm, prob)``
                                  where ``G`` is the fully built networkx graph, ``tm`` is the transition matrix
                                  used to build the graph and ``prob`` is the probability distribution used""",
        )
        self.progress.add_args(parser)

    def process_args(self, args):
        self.progress.process_args(args)
        self.data_reader.process_args(args)
        with self.data_reader:
            self.iter_range.process_args(args)
        # Set the attributes according to arguments
        self.output_filename = args.output
        self.tm_filename = args.transition_matrix
        if args.postprocess_function:
            self.postprocess_function = get_object(args.postprocess_function,
                                                   path=["."])

    def _load_from_h5(self, fname, istart, istop):
        tmh5 = h5py.File(fname, "r")
        # We will need the number of rows and columns to convert from
        # sparse matrix format
        nrows = tmh5.attrs["nrows"]
        ncols = tmh5.attrs["ncols"]
        # gotta average over iterations
        tm = None
        for it in range(istart, istop):
            it_str = "iter_{:08}".format(it)
            col = tmh5["iterations"][it_str]["cols"]
            row = tmh5["iterations"][it_str]["rows"]
            flux = tmh5["iterations"][it_str]["flux"]
            ctm = coo_matrix((flux, (row, col)),
                             shape=(nrows, ncols)).toarray()
            if tm is None:
                tm = ctm
            else:
                tm += ctm
        # We need to convert the "non-markovian" matrix to
        # a markovian matrix here

        # TODO: support more than 2 states
        # Not as straightforward as it seems, since there is the
        # "unknown" state to deal with and it requires a funky
        # fix to go from the non-markovian to the markovian matrix
        nstates = 2
        mnrows = int(nrows / nstates)
        mncols = int(ncols / nstates)
        mtm = numpy.zeros((mnrows, mncols), dtype=flux.dtype)
        for i in range(mnrows):
            for j in range(mncols):
                mtm[i, j] = tm[i * 2:(i + 1) * 2, j * 2:(j + 1) * 2].sum()
        mtm = mtm / len(tmh5["iterations"])
        # Let's also get probabilities
        bin_probs = tmh5["bin_populations"]
        avg_bin_probs = numpy.average(bin_probs[istart:istop],
                                      axis=0) / nstates
        prob = avg_bin_probs.reshape(mnrows, nstates).sum(axis=1)
        return mtm, prob

    def read_tmfile(self, fname, istart, istop):
        if fname.endswith(".h5"):
            tm, prob = self._load_from_h5(fname, istart, istop)
        else:
            # unsupported format: fail here instead of returning undefined values
            raise ValueError("unsupported transition matrix file format: {}".format(fname))
        return tm, prob

    def save_graph(self, outname, graph):
        # determine the save function from the output file extension
        if outname.endswith(".gml"):
            func = nx.write_gml
        else:
            # unsupported format: fail instead of calling an undefined function
            raise ValueError("unsupported graph output format: {}".format(outname))
        func(graph, outname)

    def go(self):
        self.data_reader.open("r")
        # Get the iterations we want to average the tm if needed
        iter_start, iter_stop = self.iter_range.iter_start, self.iter_range.iter_stop
        # Read transition matrix and probabilities
        tm, prob = self.read_tmfile(self.tm_filename, iter_start, iter_stop)
        # Start the progress indicator and work on the graph
        pi = self.progress.indicator
        with pi:
            node_sizes = prob
            edge_sizes = tm

            pi.new_operation("Building graph, adding nodes",
                             extent=len(node_sizes))
            G = nx.DiGraph()
            for i in range(tm.shape[0]):
                if node_sizes[i] > 0:
                    G.add_node(i, weight=float(node_sizes[i]))
                pi.progress += 1

            pi.new_operation("Adding edges", extent=len(edge_sizes.flatten()))
            for i in range(tm.shape[0]):
                for j in range(tm.shape[1]):
                    if edge_sizes[i][j] > 0:
                        G.add_edge(i, j, weight=float(edge_sizes[i][j]))
                    pi.progress += 1

            if self.postprocess_function:
                self.postprocess_function(G, tm, prob)
            self.save_graph(self.output_filename, G)
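A hedged sketch of a --postprocess-function callback for w_networker: the postprocess(G, tm, prob) call signature comes from the help text above, while the module name, attribute names, and the self-loop cleanup are illustrative choices.

# Hypothetical module "my_postprocess.py"; attribute names and the self-loop
# cleanup are assumptions -- only the (G, tm, prob) signature comes from the
# --postprocess-function help above.
import networkx as nx

def annotate(G, tm, prob):
    '''Attach plain-float node probabilities and edge fluxes so they survive
    GML serialization, then drop self-loops for external layout tools.'''
    for i in G.nodes:
        G.nodes[i]['probability'] = float(prob[i])
    for i, j in G.edges:
        G.edges[i, j]['flux'] = float(tm[i][j])
    G.remove_edges_from(list(nx.selfloop_edges(G)))

# Example invocation:
#   w_networker -tm tm.h5 -o network.gml --postprocess-function my_postprocess.annotate
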
class WPostAnalysisReweightTool(WESTTool):
    prog ='w_postanalysis_reweight'
    description = '''\
Calculate average rates from weighted ensemble data using the postanalysis
reweighting scheme. Bin assignments (usually "assignments.h5") and pre-calculated 
iteration flux matrices (usually "flux_matrices.h5") data files must have been 
previously generated using w_postanalysis_matrix.py (see "w_assign --help" and 
"w_kinetics --help" for information on generating these files).


-----------------------------------------------------------------------------
Output format
-----------------------------------------------------------------------------

The output file (-o/--output, usually "kinrw.h5") contains the following
datasets:

  /state_prob_evolution [window,state]
    The reweighted state populations based on windows

  /color_prob_evolution [window,state]
    The reweighted populations last assigned to each state based on windows

  /bin_prob_evolution [window, bin]
    The reweighted populations of each bin based on windows. Bins contain
    one color each, so to recover the original un-colored spatial bins,
    one must sum over all states.

  /conditional_flux_evolution [window,state,state]
    (Structured -- see below). State-to-state fluxes based on windows of
    varying width
    
The structure of the final dataset is as follows:

  iter_start
    (Integer) Iteration at which the averaging window begins (inclusive).
    
  iter_stop
    (Integer) Iteration at which the averaging window ends (exclusive).
    
  expected
    (Floating-point) Expected (mean) value of the rate as evaluated within
    this window, in units of inverse tau.


-----------------------------------------------------------------------------
Command-line options
-----------------------------------------------------------------------------
'''

    def __init__(self):
        super(WPostAnalysisReweightTool, self).__init__()
        
        self.data_reader = WESTDataReader()
        self.iter_range = IterRangeSelection()
        self.progress = ProgressIndicatorComponent()
        
        self.output_filename = None
        self.kinetics_filename = None
        self.assignment_filename = None
        
        self.output_file = None
        self.assignments_file = None
        self.kinetics_file = None
        
        self.evolution_mode = None
        
    def add_args(self, parser):
        self.progress.add_args(parser)
        self.data_reader.add_args(parser)
        self.iter_range.include_args['iter_step'] = True
        self.iter_range.add_args(parser)

        iogroup = parser.add_argument_group('input/output options')
        iogroup.add_argument('-a', '--assignments', default='assign.h5',
                            help='''Bin assignments and macrostate definitions are in ASSIGNMENTS
                            (default: %(default)s).''')

        iogroup.add_argument('-k', '--kinetics', default='flux_matrices.h5',
                            help='''Per-iteration flux matrices calculated by w_postanalysis_matrix 
                            (default: %(default)s).''')
        iogroup.add_argument('-o', '--output', dest='output', default='kinrw.h5',
                            help='''Store results in OUTPUT (default: %(default)s).''')

        cogroup = parser.add_argument_group('calculation options')
        cogroup.add_argument('-e', '--evolution-mode', choices=['cumulative', 'blocked'], default='cumulative',
                             help='''How to calculate time evolution of rate estimates.
                             ``cumulative`` evaluates rates over windows starting with --start-iter and getting progressively
                             wider to --stop-iter by steps of --step-iter.
                             ``blocked`` evaluates rates over windows of width --step-iter, the first of which begins at
                             --start-iter.''')
        cogroup.add_argument('--window-frac', type=float, default=1.0,
                             help='''Fraction of iterations to use in each window when running in ``cumulative`` mode.
                             The (1 - frac) fraction of iterations will be discarded from the start of each window.''')

        cogroup.add_argument('--obs-threshold', type=int, default=1,
                             help='''The minimum number of observed transitions between two states i and j necessary to include
                             fluxes in the reweighting estimate''')
        
    def open_files(self):
        self.output_file = h5io.WESTPAH5File(self.output_filename, 'w', creating_program=True)
        h5io.stamp_creator_data(self.output_file)
        self.assignments_file = h5io.WESTPAH5File(self.assignments_filename, 'r')#, driver='core', backing_store=False)
        self.kinetics_file = h5io.WESTPAH5File(self.kinetics_filename, 'r')#, driver='core', backing_store=False)
        if not self.iter_range.check_data_iter_range_least(self.assignments_file):
            raise ValueError('assignments data do not span the requested iterations')

        if not self.iter_range.check_data_iter_range_least(self.kinetics_file):
            raise ValueError('kinetics data do not span the requested iterations')

    def process_args(self, args):
        self.progress.process_args(args)
        self.data_reader.process_args(args)
        with self.data_reader:
            self.iter_range.process_args(args, default_iter_step=None)
        if self.iter_range.iter_step is None:
            #use about 10 blocks by default
            self.iter_range.iter_step = max(1, (self.iter_range.iter_stop - self.iter_range.iter_start) // 10)
        
        self.output_filename = args.output
        self.assignments_filename = args.assignments
        self.kinetics_filename = args.kinetics
                
        self.evolution_mode = args.evolution_mode
        self.evol_window_frac = args.window_frac
        if self.evol_window_frac <= 0 or self.evol_window_frac > 1:
            raise ValueError('Parameter error -- fractional window defined by --window-frac must be in (0,1]')
        self.obs_threshold = args.obs_threshold



    def go(self):
        pi = self.progress.indicator
        with pi:
            pi.new_operation('Initializing')
            self.open_files()
            nstates = self.assignments_file.attrs['nstates']
            nbins = self.assignments_file.attrs['nbins']
            state_labels = self.assignments_file['state_labels'][...]
            state_map = self.assignments_file['state_map'][...]
            nfbins = self.kinetics_file.attrs['nrows']
            npts = self.kinetics_file.attrs['npts']

            assert nstates == len(state_labels)
            assert nfbins == nbins * nstates

            start_iter, stop_iter, step_iter = self.iter_range.iter_start, self.iter_range.iter_stop, self.iter_range.iter_step

            start_pts = range(start_iter, stop_iter, step_iter)
            flux_evol = np.zeros((len(start_pts), nstates, nstates), dtype=ci_dtype)
            color_prob_evol = np.zeros((len(start_pts), nstates))
            state_prob_evol = np.zeros((len(start_pts), nstates))
            bin_prob_evol = np.zeros((len(start_pts), nfbins))
            pi.new_operation('Calculating flux evolution', len(start_pts))

            if self.evolution_mode == 'cumulative' and self.evol_window_frac == 1.0:
                print('Using fast streaming accumulation')

                total_fluxes = np.zeros((nfbins, nfbins), weight_dtype)
                total_obs = np.zeros((nfbins, nfbins), np.int64)

                for iblock, start in enumerate(start_pts):
                    pi.progress += 1
                    stop = min(start + step_iter, stop_iter)

                    params = dict(start=start, stop=stop, nstates=nstates, nbins=nbins,
                                  state_labels=state_labels, state_map=state_map, nfbins=nfbins,
                                  total_fluxes=total_fluxes, total_obs=total_obs,
                                  h5file=self.kinetics_file, obs_threshold=self.obs_threshold)

                    rw_state_flux, rw_color_probs, rw_state_probs, rw_bin_probs, rw_bin_flux = reweight(**params)
                    for k in range(nstates):
                        for j in range(nstates):
                            # Normalize such that we report the flux per tau (tau being the weighted ensemble iteration)
                            # npts always includes a 0th time point
                            flux_evol[iblock]['expected'][k,j] = rw_state_flux[k,j] * (npts - 1)
                            flux_evol[iblock]['iter_start'][k,j] = start
                            flux_evol[iblock]['iter_stop'][k,j] = stop

                    color_prob_evol[iblock] = rw_color_probs
                    state_prob_evol[iblock] = rw_state_probs[:-1]
                    bin_prob_evol[iblock] = rw_bin_probs


            else:
                for iblock, start in enumerate(start_pts):
                    pi.progress += 1
                    
                    stop = min(start + step_iter, stop_iter)
                    if self.evolution_mode == 'cumulative':
                        windowsize = max(1, int(self.evol_window_frac * (stop - start_iter)))
                        block_start = max(start_iter, stop - windowsize)
                    else:   # self.evolution_mode == 'blocked'
                        block_start = start

                    params = dict(start=block_start, stop=stop, nstates=nstates, nbins=nbins,
                                  state_labels=state_labels, state_map=state_map, nfbins=nfbins,
                                  total_fluxes=None, total_obs=None,
                                  h5file=self.kinetics_file, obs_threshold=self.obs_threshold)

                    rw_state_flux, rw_color_probs, rw_state_probs, rw_bin_probs, rw_bin_flux = reweight(**params)
                    for k in range(nstates):
                        for j in range(nstates):
                            # Normalize such that we report the flux per tau (tau being the weighted ensemble iteration)
                            # npts always includes a 0th time point
                            flux_evol[iblock]['expected'][k,j] = rw_state_flux[k,j] * (npts - 1)
                            flux_evol[iblock]['iter_start'][k,j] = start
                            flux_evol[iblock]['iter_stop'][k,j] = stop

                    color_prob_evol[iblock] = rw_color_probs
                    state_prob_evol[iblock] = rw_state_probs[:-1]
                    bin_prob_evol[iblock] = rw_bin_probs


            ds_flux_evol = self.output_file.create_dataset('conditional_flux_evolution', data=flux_evol, shuffle=True, compression=9)
            ds_state_prob_evol = self.output_file.create_dataset('state_prob_evolution', data=state_prob_evol, compression=9)
            ds_color_prob_evol = self.output_file.create_dataset('color_prob_evolution', data=color_prob_evol, compression=9)
            ds_bin_prob_evol = self.output_file.create_dataset('bin_prob_evolution', data=bin_prob_evol, compression=9)
            ds_state_labels = self.output_file.create_dataset('state_labels', data=state_labels)