def go(self):
    """Assign every segment of the simulation to bins and macrostates.

    Opens ``self.output_filename`` for writing and fills it with:

    * the attributes ``nbins``, ``nstates`` and ``subsampled``;
    * ``bin_labels`` and, when ``self.states`` is non-empty, ``state_map``
      and ``state_labels``;
    * ``nsegs`` and ``npts`` (per-iteration segment and point counts);
    * ``assignments`` and ``labeled_populations`` for every iteration, plus
      ``trajlabels`` and ``statelabels`` when ``self.states`` is non-empty.

    Iterations 1 up to (but not including)
    ``self.data_reader.current_iteration`` are processed.
    """
    # None of the dataset specifications may already hold an open HDF5 file.
    assert self.data_reader.parent_id_dsspec._h5file is None
    assert self.data_reader.weight_dsspec._h5file is None
    if hasattr(self.dssynth.dsspec, '_h5file'):
        assert self.dssynth.dsspec._h5file is None

    pi = self.progress.indicator
    pi.operation = 'Initializing'
    with pi, self.data_reader, WESTPAH5File(
        self.output_filename, 'w', creating_program=True
    ) as self.output_file:
        assign = self.binning.mapper.assign

        # We always assign the entire simulation, so that no trajectory appears to start
        # in a transition region that doesn't get initialized in one.
        iter_start = 1
        iter_stop = self.data_reader.current_iteration
        iter_count = iter_stop - iter_start

        h5io.stamp_iter_range(self.output_file, iter_start, iter_stop)

        nbins = self.binning.mapper.nbins
        self.output_file.attrs['nbins'] = nbins

        # One slot per bin plus one extra slot.
        # NOTE(review): the extra slot presumably covers the ``nbins`` fill
        # value used for unassigned points -- confirm.
        state_map = np.empty((nbins + 1,), index_dtype)
        state_map[:] = 0  # state_id == nstates => unknown state (nstates is 0 here)

        # Recursive mappers produce a generator rather than a list of labels
        # so consume the entire generator into a list.
        # ``np.bytes_`` is the same type as the ``np.string_`` alias, which
        # NumPy 2.0 removed.
        labels = [np.bytes_(label) for label in self.binning.mapper.labels]
        self.output_file.create_dataset('bin_labels', data=labels, compression=9)

        if self.states:
            nstates = len(self.states)
            state_map[:] = nstates  # state_id == nstates => unknown state
            state_labels = [np.bytes_(state['label']) for state in self.states]

            for istate, sdict in enumerate(self.states):
                assert state_labels[istate] == np.bytes_(sdict['label'])  # sanity check
                # Every bin hit by this state's coordinates belongs to the state.
                for assignment in assign(sdict['coords']):
                    state_map[assignment] = istate

            self.output_file.create_dataset(
                'state_map', data=state_map, compression=9, shuffle=True
            )
            self.output_file['state_labels'] = state_labels  # + ['(unknown)']
        else:
            nstates = 0
        self.output_file.attrs['nstates'] = nstates

        # Stamp if this has been subsampled.
        self.output_file.attrs['subsampled'] = self.subsample

        nsegs = np.empty((iter_count,), seg_id_dtype)
        npts = np.empty((iter_count,), seg_id_dtype)

        # scan for largest number of segments and largest number of points
        pi.new_operation('Scanning for segment and point counts', iter_count)
        for iiter, n_iter in enumerate(range(iter_start, iter_stop)):
            iter_group = self.data_reader.get_iter_group(n_iter)
            nsegs[iiter], npts[iiter] = iter_group['pcoord'].shape[0:2]
            pi.progress += 1
            del iter_group

        pi.new_operation('Preparing output')

        # create datasets
        self.output_file.create_dataset('nsegs', data=nsegs, shuffle=True, compression=9)
        self.output_file.create_dataset('npts', data=npts, shuffle=True, compression=9)

        max_nsegs = nsegs.max()
        max_npts = npts.max()

        # Datasets are sized for the largest iteration; smaller iterations
        # leave the remainder at the fill value.
        assignments_shape = (iter_count, max_nsegs, max_npts)
        assignments_dtype = np.min_scalar_type(nbins)
        assignments_ds = self.output_file.create_dataset(
            'assignments',
            dtype=assignments_dtype,
            shape=assignments_shape,
            compression=4,
            shuffle=True,
            chunks=h5io.calc_chunksize(assignments_shape, assignments_dtype),
            fillvalue=nbins,
        )
        if self.states:
            trajlabel_dtype = np.min_scalar_type(nstates)
            trajlabels_ds = self.output_file.create_dataset(
                'trajlabels',
                dtype=trajlabel_dtype,
                shape=assignments_shape,
                compression=4,
                shuffle=True,
                chunks=h5io.calc_chunksize(assignments_shape, trajlabel_dtype),
                fillvalue=nstates,
            )
            statelabels_ds = self.output_file.create_dataset(
                'statelabels',
                dtype=trajlabel_dtype,
                shape=assignments_shape,
                compression=4,
                shuffle=True,
                chunks=h5io.calc_chunksize(assignments_shape, trajlabel_dtype),
                fillvalue=nstates,
            )

        pops_shape = (iter_count, nstates + 1, nbins + 1)
        pops_ds = self.output_file.create_dataset(
            'labeled_populations',
            dtype=weight_dtype,
            shape=pops_shape,
            compression=4,
            shuffle=True,
            chunks=h5io.calc_chunksize(pops_shape, weight_dtype),
        )
        h5io.label_axes(pops_ds, [np.bytes_(i) for i in ['iteration', 'state', 'bin']])

        pi.new_operation('Assigning to bins', iter_count)
        last_labels = None  # mapping of seg_id to last macrostate inhabited
        for iiter, n_iter in enumerate(range(iter_start, iter_stop)):
            if iiter == 0:
                # Nothing is known about the first iteration's history.
                last_labels = np.empty((nsegs[iiter],), index_dtype)
                last_labels[:] = nstates  # unknown state

            # Slices this iteration into n_workers groups of segments, submits them to wm,
            # splices results back together
            assignments, trajlabels, pops, statelabels = self.assign_iteration(
                n_iter, nstates, nbins, state_map, last_labels
            )

            # The label at each segment's final point seeds the next iteration.
            last_labels = trajlabels[:, -1].copy()

            assignments_ds[iiter, 0:nsegs[iiter], 0:npts[iiter]] = assignments
            pops_ds[iiter] = pops
            if self.states:
                trajlabels_ds[iiter, 0:nsegs[iiter], 0:npts[iiter]] = trajlabels
                statelabels_ds[iiter, 0:nsegs[iiter], 0:npts[iiter]] = statelabels

            pi.progress += 1
            del assignments, trajlabels, pops, statelabels

        stamped = ['assignments', 'npts', 'nsegs', 'labeled_populations']
        if self.states:
            # 'statelabels' is only created when macrostates are defined;
            # looking it up unconditionally would raise KeyError.
            stamped.append('statelabels')
        for dsname in stamped:
            h5io.stamp_iter_range(self.output_file[dsname], iter_start, iter_stop)
def w_postanalysis_matrix(self):
    """Build per-iteration flux matrices and bin populations.

    For every iteration in ``self.iter_range`` the bin and ``trajlabels``
    assignments are read from ``self.assignments_file``, combined into a
    single index ``nstates * bin + label``, and passed to ``calc_stats``
    together with the segment weights. Results are written to
    ``self.output_file``:

    * ``bin_populations`` -- one row per iteration;
    * ``iterations/iter_XXXXXXXX/{flux,obs,rows,cols}`` -- the flux and
      observed-transition matrices in COO sparse form;
    * attributes ``nrows``, ``ncols`` and ``npts``.
    """
    pi = self.progress.indicator
    pi.new_operation('Initializing')

    self.data_reader.open('r')
    nbins = self.assignments_file.attrs['nbins']

    state_labels = self.assignments_file['state_labels'][...]
    nstates = len(state_labels)

    start_iter, stop_iter = self.iter_range.iter_start, self.iter_range.iter_stop
    iter_count = stop_iter - start_iter

    # One row/column per (bin, macrostate label) combination.
    nfbins = nbins * nstates

    flux_shape = (iter_count, nfbins, nfbins)
    pop_shape = (iter_count, nfbins)

    h5io.stamp_iter_range(self.output_file, start_iter, stop_iter)

    bin_populations_ds = self.output_file.create_dataset(
        'bin_populations', shape=pop_shape, dtype=weight_dtype
    )
    h5io.stamp_iter_range(bin_populations_ds, start_iter, stop_iter)
    h5io.label_axes(bin_populations_ds, ['iteration', 'bin'])

    flux_grp = self.output_file.create_group('iterations')
    self.output_file.attrs['nrows'] = nfbins
    self.output_file.attrs['ncols'] = nfbins

    # Scratch buffers reused for every iteration.
    # NOTE(review): allocated uninitialised; calc_stats is presumably
    # responsible for resetting them on each call -- confirm.
    fluxes = np.empty(flux_shape[1:], weight_dtype)
    populations = np.empty(pop_shape[1:], weight_dtype)
    trans = np.empty(flux_shape[1:], np.int64)

    # Check to make sure this isn't a data set with target states
    # tstates = self.data_reader.data_manager.get_target_states(0)
    # if len(tstates) > 0:
    #     raise ValueError('Postanalysis reweighting analysis does not support WE simulation run under recycling conditions')

    pi.new_operation('Calculating flux matrices', iter_count)
    # Calculate instantaneous statistics
    for iiter, n_iter in enumerate(range(start_iter, stop_iter)):
        # Get data from the main HDF5 file
        iter_group = self.data_reader.get_iter_group(n_iter)
        seg_index = iter_group['seg_index']
        nsegs, npts = iter_group['pcoord'].shape[0:2]
        weights = seg_index['weight']

        # Get bin and traj. ensemble assignments from the previously-generated
        # assignments file. Both datasets are addressed with the same
        # iteration entry, so look it up once.
        entry = h5io.get_iteration_entry(self.assignments_file, n_iter)
        window = entry + np.s_[:nsegs, :npts]
        bin_assignments = np.require(
            self.assignments_file['assignments'][window], dtype=index_dtype
        )
        macrostate_assignments = np.require(
            self.assignments_file['trajlabels'][window], dtype=index_dtype
        )

        # Flag every point whose macrostate label is the "unknown" id (nstates).
        mask_unknown = np.zeros_like(bin_assignments, dtype=np.uint16)
        mask_unknown[macrostate_assignments == nstates] = 1

        # Transform bin_assignments to take macrostate membership into account
        bin_assignments = nstates * bin_assignments + macrostate_assignments

        # Calculate bin-to-bin fluxes, bin populations and number of obs transitions
        calc_stats(
            bin_assignments,
            weights,
            fluxes,
            populations,
            trans,
            mask_unknown,
            self.sampling_frequency,
        )

        # Store bin-based kinetics data
        bin_populations_ds[iiter] = populations

        # Setup sparse data structures for flux and obs
        fluxes_sp = sp.coo_matrix(fluxes)
        trans_sp = sp.coo_matrix(trans)

        # 'rows'/'cols' are taken from the flux matrix only, so both matrices
        # must have the same number of stored entries.
        assert fluxes_sp.nnz == trans_sp.nnz

        flux_iter_grp = flux_grp.create_group('iter_{:08d}'.format(n_iter))
        flux_iter_grp.create_dataset('flux', data=fluxes_sp.data, dtype=weight_dtype)
        flux_iter_grp.create_dataset('obs', data=trans_sp.data, dtype=np.int32)
        flux_iter_grp.create_dataset('rows', data=fluxes_sp.row, dtype=np.int32)
        flux_iter_grp.create_dataset('cols', data=fluxes_sp.col, dtype=np.int32)
        flux_iter_grp.attrs['nrows'] = nfbins
        flux_iter_grp.attrs['ncols'] = nfbins

        # Do a little manual clean-up to prevent memory explosion
        del iter_group, weights, bin_assignments
        del macrostate_assignments

        pi.progress += 1

        # Check and save the number of intermediate time points; this will be used to
        # normalize the flux and kinetics to tau in w_postanalysis_reweight.
        # The attribute is rewritten on each pass, so the value from the last
        # processed iteration is the one that remains.
        if self.assignments_file.attrs['subsampled'] or self.sampling_frequency == 'iteration':
            self.output_file.attrs['npts'] = 2
        else:
            self.output_file.attrs['npts'] = npts
def w_kinetics(self):
    """Trace trajectories and store per-iteration macrostate kinetics.

    For every iteration in ``self.iter_range`` the ``trajlabels`` and
    ``statelabels`` assignments are read from ``self.assignments_file`` and
    handed, together with segment weights and parent IDs, to
    ``find_macrostate_transitions``. The arrays it fills are written to
    ``self.output_file`` as ``conditional_fluxes``, ``total_fluxes``,
    ``conditional_arrivals``, ``arrivals``, ``durations`` and
    ``duration_count``. ``state_labels`` is copied over from the
    assignments file.
    """
    pi = self.progress.indicator
    pi.new_operation('Initializing')

    self.data_reader.open('r')
    self.open_files()
    nstates = self.assignments_file.attrs['nstates']
    start_iter, stop_iter = self.iter_range.iter_start, self.iter_range.iter_stop
    iter_count = stop_iter - start_iter

    compress = self.do_compression
    # Filter settings shared by every dataset that honours do_compression.
    filter_kwargs = dict(shuffle=compress, compression=9 if compress else None)

    def chunks_for(shape, dtype):
        # Explicit chunking is only requested when compression is enabled.
        return h5io.calc_chunksize(shape, dtype) if compress else None

    pair_shape = (iter_count, nstates, nstates)
    state_shape = (iter_count, nstates)

    # Starts with zero columns and is widened on demand in the loop below.
    durations_ds = self.output_file.replace_dataset(
        'durations',
        shape=(iter_count, 0),
        maxshape=(iter_count, None),
        dtype=ed_list_dtype,
        chunks=(1, 15360) if compress else None,
        **filter_kwargs
    )
    durations_count_ds = self.output_file.replace_dataset(
        'duration_count', shape=(iter_count,), dtype=np.int_, shuffle=True, compression=9
    )
    cond_fluxes_ds = self.output_file.replace_dataset(
        'conditional_fluxes',
        shape=pair_shape,
        dtype=weight_dtype,
        chunks=chunks_for(pair_shape, weight_dtype),
        **filter_kwargs
    )
    total_fluxes_ds = self.output_file.replace_dataset(
        'total_fluxes',
        shape=state_shape,
        dtype=weight_dtype,
        chunks=chunks_for(state_shape, weight_dtype),
        **filter_kwargs
    )
    cond_arrival_counts_ds = self.output_file.replace_dataset(
        'conditional_arrivals',
        shape=pair_shape,
        dtype=np.uint,
        chunks=chunks_for(pair_shape, np.uint),
        **filter_kwargs
    )
    arrival_counts_ds = self.output_file.replace_dataset(
        'arrivals',
        shape=state_shape,
        dtype=np.uint,
        chunks=chunks_for(state_shape, np.uint),
        **filter_kwargs
    )

    # copy state labels for convenience
    self.output_file.replace_dataset(
        'state_labels', data=self.assignments_file['state_labels'][...]
    )

    # Put nice labels on things
    for ds in (self.output_file, durations_count_ds, cond_fluxes_ds, total_fluxes_ds):
        h5io.stamp_iter_range(ds, start_iter, stop_iter)

    # Calculate instantaneous rate matrices and trace trajectories
    last_state = None
    pi.new_operation('Tracing trajectories', iter_count)
    for iiter, n_iter in enumerate(range(start_iter, stop_iter)):
        # Get data from the main HDF5 file
        iter_group = self.data_reader.get_iter_group(n_iter)
        seg_index = iter_group['seg_index']
        nsegs, npts = iter_group['pcoord'].shape[0:2]
        weights = seg_index['weight']
        parent_ids = self.data_reader.parent_id_dsspec.get_iter_data(n_iter)

        # Get traj. ensemble and state assignments from the previously-generated
        # assignments file. The raw bin assignments are not needed here, so
        # they are not read.
        entry = h5io.get_iteration_entry(self.assignments_file, n_iter)
        window = entry + np.s_[:nsegs, :npts]
        label_assignments = np.require(
            self.assignments_file['trajlabels'][window], dtype=index_dtype
        )
        state_assignments = np.require(
            self.assignments_file['statelabels'][window], dtype=index_dtype
        )

        # Prepare to run analysis: fresh accumulators for this iteration.
        cond_fluxes = np.zeros((nstates, nstates), weight_dtype)
        total_fluxes = np.zeros((nstates,), weight_dtype)
        cond_counts = np.zeros((nstates, nstates), np.uint)
        total_counts = np.zeros((nstates,), np.uint)
        durations = []

        # Estimate macrostate fluxes and calculate event durations using trajectory tracing
        # state is opaque to the find_macrostate_transitions function
        # npts points span npts - 1 intervals within the iteration.
        dt = 1.0 if npts == 1 else 1.0 / (npts - 1)
        state = _fast_transition_state_copy(iiter, nstates, parent_ids, last_state)
        find_macrostate_transitions(
            nstates,
            weights,
            label_assignments,
            state_assignments,
            dt,
            state,
            cond_fluxes,
            cond_counts,
            total_fluxes,
            total_counts,
            durations,
        )
        # Carry the tracing state forward to the next iteration.
        last_state = state

        # Store trace-based kinetics data
        cond_fluxes_ds[iiter] = cond_fluxes
        total_fluxes_ds[iiter] = total_fluxes
        arrival_counts_ds[iiter] = total_counts
        cond_arrival_counts_ds[iiter] = cond_counts

        durations_count_ds[iiter] = len(durations)
        if durations:
            # Widen the dataset only when this iteration needs more columns
            # than any earlier one.
            durations_ds.resize((iter_count, max(len(durations), durations_ds.shape[1])))
            durations_ds[iiter, :len(durations)] = durations

        # Do a little manual clean-up to prevent memory explosion
        del iter_group, weights, parent_ids, label_assignments, state, cond_fluxes, total_fluxes
        pi.progress += 1