def go(self):
    self.data_reader.open('r')
    assignments_file = h5py.File(self.assignments_filename, mode='r')
    output_file = h5io.WESTPAH5File(self.output_filename, mode='w')
    pi = self.progress.indicator
    count = self.count
    timepoint = self.timepoint

    nbins = assignments_file.attrs['nbins'] + 1
    assignments_ds = assignments_file['assignments']

    iter_start, iter_stop = self.iter_range.iter_start, self.iter_range.iter_stop
    iter_count = iter_stop - iter_start
    h5io.check_iter_range_least(assignments_ds, iter_start, iter_stop)
    nsegs = assignments_file['nsegs'][h5io.get_iteration_slice(assignments_file['nsegs'], iter_start, iter_stop)]

    output_file.create_dataset('n_iter', dtype=n_iter_dtype, data=list(range(iter_start, iter_stop)))

    seg_count_ds = output_file.create_dataset('nsegs', dtype=np.uint, shape=(iter_count, nbins))
    matching_segs_ds = output_file.create_dataset(
        'seg_ids',
        shape=(iter_count, nbins, count),
        dtype=seg_id_dtype,
        chunks=h5io.calc_chunksize((iter_count, nbins, count), seg_id_dtype),
        shuffle=True,
        compression=9,
    )
    weights_ds = output_file.create_dataset(
        'weights',
        shape=(iter_count, nbins, count),
        dtype=weight_dtype,
        chunks=h5io.calc_chunksize((iter_count, nbins, count), weight_dtype),
        shuffle=True,
        compression=9,
    )
    what = self.what

    with pi:
        pi.new_operation('Finding matching segments', extent=iter_count)
        for iiter, n_iter in enumerate(range(iter_start, iter_stop)):
            assignments = np.require(
                assignments_ds[h5io.get_iteration_entry(assignments_ds, n_iter) + np.index_exp[:, timepoint]],
                dtype=westpa.binning.index_dtype,
            )
            all_weights = self.data_reader.get_iter_group(n_iter)['seg_index']['weight']

            # the following Cython function just executes this loop:
            # for iseg in range(nsegs[iiter]):
            #     segs_by_bin[iseg, assignments[iseg]] = True
            segs_by_bin = assignments_list_to_table(nsegs[iiter], nbins, assignments)

            for ibin in range(nbins):
                segs = np.nonzero(segs_by_bin[:, ibin])[0]
                seg_count_ds[iiter, ibin] = min(len(segs), count)

                if len(segs):
                    weights = all_weights.take(segs)

                    if what == 'lowweight':
                        indices = np.argsort(weights)[:count]
                    elif what == 'highweight':
                        indices = np.argsort(weights)[::-1][:count]
                    else:
                        assert what == 'random'
                        # keep at most `count` randomly chosen segments so the
                        # selection fits the (iter, bin, count) output datasets
                        indices = np.random.permutation(len(weights))[:count]

                    matching_segs_ds[iiter, ibin, : len(segs)] = segs.take(indices)
                    weights_ds[iiter, ibin, : len(segs)] = weights.take(indices)
                    del segs, weights

            del assignments, segs_by_bin, all_weights
            pi.progress += 1
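
# A minimal, hedged sketch of reading the selection file written by go() above:
# the 'n_iter', 'nsegs', 'seg_ids', and 'weights' datasets it creates. The file
# name 'select.h5' and the helper name print_selected_segments are illustrative
# assumptions, not part of the tool itself.
def print_selected_segments(output_filename='select.h5'):
    import h5py

    with h5py.File(output_filename, mode='r') as output_file:
        n_iters = output_file['n_iter'][...]    # WE iteration numbers covered
        seg_counts = output_file['nsegs'][...]  # (iter, bin): number of stored segments
        seg_ids = output_file['seg_ids'][...]   # (iter, bin, count): selected segment IDs
        weights = output_file['weights'][...]   # (iter, bin, count): corresponding weights

    for iiter, n_iter in enumerate(n_iters):
        for ibin in range(seg_counts.shape[1]):
            nstored = seg_counts[iiter, ibin]
            if nstored == 0:
                continue
            print('iter {:d} bin {:d}: seg_ids {} weights {}'.format(
                int(n_iter), ibin, seg_ids[iiter, ibin, :nstored], weights[iiter, ibin, :nstored]))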
def w_postanalysis_matrix(self):
    pi = self.progress.indicator
    pi.new_operation('Initializing')
    self.data_reader.open('r')

    nbins = self.assignments_file.attrs['nbins']

    state_labels = self.assignments_file['state_labels'][...]
    state_map = self.assignments_file['state_map'][...]
    nstates = len(state_labels)

    start_iter, stop_iter = self.iter_range.iter_start, self.iter_range.iter_stop  # h5io.get_iter_range(self.assignments_file)
    iter_count = stop_iter - start_iter

    nfbins = nbins * nstates

    flux_shape = (iter_count, nfbins, nfbins)
    pop_shape = (iter_count, nfbins)

    h5io.stamp_iter_range(self.output_file, start_iter, stop_iter)

    bin_populations_ds = self.output_file.create_dataset('bin_populations', shape=pop_shape, dtype=weight_dtype)
    h5io.stamp_iter_range(bin_populations_ds, start_iter, stop_iter)
    h5io.label_axes(bin_populations_ds, ['iteration', 'bin'])

    flux_grp = self.output_file.create_group('iterations')
    self.output_file.attrs['nrows'] = nfbins
    self.output_file.attrs['ncols'] = nfbins

    fluxes = np.empty(flux_shape[1:], weight_dtype)
    populations = np.empty(pop_shape[1:], weight_dtype)
    trans = np.empty(flux_shape[1:], np.int64)

    # Check to make sure this isn't a data set with target states
    # tstates = self.data_reader.data_manager.get_target_states(0)
    # if len(tstates) > 0:
    #     raise ValueError('Postanalysis reweighting analysis does not support WE simulation run under recycling conditions')

    pi.new_operation('Calculating flux matrices', iter_count)
    # Calculate instantaneous statistics
    for iiter, n_iter in enumerate(range(start_iter, stop_iter)):
        # Get data from the main HDF5 file
        iter_group = self.data_reader.get_iter_group(n_iter)
        seg_index = iter_group['seg_index']
        nsegs, npts = iter_group['pcoord'].shape[0:2]
        weights = seg_index['weight']

        # Get bin and traj. ensemble assignments from the previously-generated assignments file
        assignment_iiter = h5io.get_iteration_entry(self.assignments_file, n_iter)
        bin_assignments = np.require(
            self.assignments_file['assignments'][assignment_iiter + np.s_[:nsegs, :npts]], dtype=index_dtype
        )

        mask_unknown = np.zeros_like(bin_assignments, dtype=np.uint16)

        macrostate_iiter = h5io.get_iteration_entry(self.assignments_file, n_iter)
        macrostate_assignments = np.require(
            self.assignments_file['trajlabels'][macrostate_iiter + np.s_[:nsegs, :npts]], dtype=index_dtype
        )

        # Transform bin_assignments to take macrostate membership into account
        bin_assignments = nstates * bin_assignments + macrostate_assignments

        mask_indx = np.where(macrostate_assignments == nstates)
        mask_unknown[mask_indx] = 1

        # Calculate bin-to-bin fluxes, bin populations and number of obs transitions
        calc_stats(bin_assignments, weights, fluxes, populations, trans, mask_unknown, self.sampling_frequency)

        # Store bin-based kinetics data
        bin_populations_ds[iiter] = populations

        # Setup sparse data structures for flux and obs
        fluxes_sp = sp.coo_matrix(fluxes)
        trans_sp = sp.coo_matrix(trans)

        assert fluxes_sp.nnz == trans_sp.nnz

        flux_iter_grp = flux_grp.create_group('iter_{:08d}'.format(n_iter))
        flux_iter_grp.create_dataset('flux', data=fluxes_sp.data, dtype=weight_dtype)
        flux_iter_grp.create_dataset('obs', data=trans_sp.data, dtype=np.int32)
        flux_iter_grp.create_dataset('rows', data=fluxes_sp.row, dtype=np.int32)
        flux_iter_grp.create_dataset('cols', data=fluxes_sp.col, dtype=np.int32)
        flux_iter_grp.attrs['nrows'] = nfbins
        flux_iter_grp.attrs['ncols'] = nfbins

        # Do a little manual clean-up to prevent memory explosion
        del iter_group, weights, bin_assignments
        del macrostate_assignments

        pi.progress += 1

    # Check and save the number of intermediate time points; this will be used to normalize the
    # flux and kinetics to tau in w_postanalysis_reweight.
    if self.assignments_file.attrs['subsampled'] == True or self.sampling_frequency == 'iteration':
        self.output_file.attrs['npts'] = 2
    else:
        # self.output_file.attrs['npts'] = npts if self.sampling_frequency == 'timepoint' else 2
        self.output_file.attrs['npts'] = npts
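
# A minimal, hedged sketch of rebuilding one iteration's dense flux matrix from
# the sparse COO triplets stored by w_postanalysis_matrix() above ('flux',
# 'rows', 'cols', plus the 'nrows'/'ncols' attributes). The file name
# 'flux_matrices.h5' and the helper name load_flux_matrix are assumptions made
# for illustration only.
def load_flux_matrix(n_iter, output_filename='flux_matrices.h5'):
    import h5py
    import scipy.sparse as sp

    with h5py.File(output_filename, mode='r') as output_file:
        iter_grp = output_file['iterations/iter_{:08d}'.format(n_iter)]
        nrows = int(iter_grp.attrs['nrows'])
        ncols = int(iter_grp.attrs['ncols'])
        fluxes_sp = sp.coo_matrix(
            (iter_grp['flux'][...], (iter_grp['rows'][...], iter_grp['cols'][...])),
            shape=(nrows, ncols),
        )
    # Dense (nfbins, nfbins) bin-to-bin flux matrix for this iteration
    return fluxes_sp.toarray()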
def w_kinetics(self):
    pi = self.progress.indicator
    pi.new_operation('Initializing')
    self.data_reader.open('r')
    self.open_files()
    nstates = self.assignments_file.attrs['nstates']
    start_iter, stop_iter = self.iter_range.iter_start, self.iter_range.iter_stop  # h5io.get_iter_range(self.assignments_file)
    iter_count = stop_iter - start_iter

    durations_ds = self.output_file.replace_dataset(
        'durations',
        shape=(iter_count, 0),
        maxshape=(iter_count, None),
        dtype=ed_list_dtype,
        chunks=(1, 15360) if self.do_compression else None,
        shuffle=self.do_compression,
        compression=9 if self.do_compression else None,
    )
    durations_count_ds = self.output_file.replace_dataset(
        'duration_count', shape=(iter_count,), dtype=np.int_, shuffle=True, compression=9
    )
    cond_fluxes_ds = self.output_file.replace_dataset(
        'conditional_fluxes',
        shape=(iter_count, nstates, nstates),
        dtype=weight_dtype,
        chunks=(h5io.calc_chunksize((iter_count, nstates, nstates), weight_dtype) if self.do_compression else None),
        shuffle=self.do_compression,
        compression=9 if self.do_compression else None,
    )
    total_fluxes_ds = self.output_file.replace_dataset(
        'total_fluxes',
        shape=(iter_count, nstates),
        dtype=weight_dtype,
        chunks=(h5io.calc_chunksize((iter_count, nstates), weight_dtype) if self.do_compression else None),
        shuffle=self.do_compression,
        compression=9 if self.do_compression else None,
    )
    cond_arrival_counts_ds = self.output_file.replace_dataset(
        'conditional_arrivals',
        shape=(iter_count, nstates, nstates),
        dtype=np.uint,
        chunks=(h5io.calc_chunksize((iter_count, nstates, nstates), np.uint) if self.do_compression else None),
        shuffle=self.do_compression,
        compression=9 if self.do_compression else None,
    )
    arrival_counts_ds = self.output_file.replace_dataset(
        'arrivals',
        shape=(iter_count, nstates),
        dtype=np.uint,
        chunks=(h5io.calc_chunksize((iter_count, nstates), np.uint) if self.do_compression else None),
        shuffle=self.do_compression,
        compression=9 if self.do_compression else None,
    )

    # copy state labels for convenience
    self.output_file.replace_dataset('state_labels', data=self.assignments_file['state_labels'][...])

    # Put nice labels on things
    for ds in (self.output_file, durations_count_ds, cond_fluxes_ds, total_fluxes_ds):
        h5io.stamp_iter_range(ds, start_iter, stop_iter)

    # Calculate instantaneous rate matrices and trace trajectories
    last_state = None
    pi.new_operation('Tracing trajectories', iter_count)
    for iiter, n_iter in enumerate(range(start_iter, stop_iter)):
        # Get data from the main HDF5 file
        iter_group = self.data_reader.get_iter_group(n_iter)
        seg_index = iter_group['seg_index']
        nsegs, npts = iter_group['pcoord'].shape[0:2]
        weights = seg_index['weight']
        # parent_ids = seg_index['parent_id']
        parent_ids = self.data_reader.parent_id_dsspec.get_iter_data(n_iter)

        # Get bin and traj. ensemble assignments from the previously-generated assignments file
        assignment_iiter = h5io.get_iteration_entry(self.assignments_file, n_iter)
        bin_assignments = np.require(
            self.assignments_file['assignments'][assignment_iiter + np.s_[:nsegs, :npts]], dtype=index_dtype
        )
        label_assignments = np.require(
            self.assignments_file['trajlabels'][assignment_iiter + np.s_[:nsegs, :npts]], dtype=index_dtype
        )
        state_assignments = np.require(
            self.assignments_file['statelabels'][assignment_iiter + np.s_[:nsegs, :npts]], dtype=index_dtype
        )

        # Prepare to run analysis
        cond_fluxes = np.zeros((nstates, nstates), weight_dtype)
        total_fluxes = np.zeros((nstates,), weight_dtype)
        cond_counts = np.zeros((nstates, nstates), np.uint)
        total_counts = np.zeros((nstates,), np.uint)
        durations = []

        # Estimate macrostate fluxes and calculate event durations using trajectory tracing
        # state is opaque to the find_macrostate_transitions function
        dt = 1.0 if npts == 1 else 1.0 / (npts - 1)
        state = _fast_transition_state_copy(iiter, nstates, parent_ids, last_state)
        find_macrostate_transitions(
            nstates,
            weights,
            label_assignments,
            state_assignments,
            dt,
            state,
            cond_fluxes,
            cond_counts,
            total_fluxes,
            total_counts,
            durations,
        )
        last_state = state

        # Store trace-based kinetics data
        cond_fluxes_ds[iiter] = cond_fluxes
        total_fluxes_ds[iiter] = total_fluxes
        arrival_counts_ds[iiter] = total_counts
        cond_arrival_counts_ds[iiter] = cond_counts

        durations_count_ds[iiter] = len(durations)
        if len(durations) > 0:
            durations_ds.resize((iter_count, max(len(durations), durations_ds.shape[1])))
            durations_ds[iiter, : len(durations)] = durations

        # Do a little manual clean-up to prevent memory explosion
        del iter_group, weights, parent_ids, bin_assignments, label_assignments, state, cond_fluxes, total_fluxes
        pi.progress += 1
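
# A minimal, hedged sketch of summarizing the trace-based kinetics written by
# w_kinetics() above: averaging the per-iteration 'conditional_fluxes' over all
# stored iterations for each ordered pair of macrostates. The file name
# 'kintrace.h5' and the helper name mean_conditional_fluxes are illustrative
# assumptions.
def mean_conditional_fluxes(output_filename='kintrace.h5'):
    import h5py

    with h5py.File(output_filename, mode='r') as output_file:
        cond_fluxes = output_file['conditional_fluxes'][...]  # (iter, nstates, nstates)
        raw_labels = output_file['state_labels'][...]

    state_labels = [label.decode() if isinstance(label, bytes) else str(label) for label in raw_labels]
    mean_fluxes = cond_fluxes.mean(axis=0)
    for istate, ilabel in enumerate(state_labels):
        for jstate, jlabel in enumerate(state_labels):
            if istate != jstate:
                print('{} -> {}: mean conditional flux {:.6g}'.format(ilabel, jlabel, mean_fluxes[istate, jstate]))
    return mean_fluxes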