def go(self):
    pi = self.progress.indicator
    pi.new_operation('Initializing')
    with pi:
        self.data_reader.open('r')
        nstates = self.assignments_file.attrs['nstates']
        start_iter, stop_iter = self.iter_range.iter_start, self.iter_range.iter_stop  # h5io.get_iter_range(self.assignments_file)
        iter_count = stop_iter - start_iter

        durations_ds = self.output_file.create_dataset(
            'durations',
            shape=(iter_count, 0), maxshape=(iter_count, None), dtype=ed_list_dtype,
            chunks=(1, 15360) if self.do_compression else None,
            shuffle=self.do_compression,
            compression=9 if self.do_compression else None)
        durations_count_ds = self.output_file.create_dataset(
            'duration_count', shape=(iter_count,), dtype=numpy.int_, shuffle=True, compression=9)
        cond_fluxes_ds = self.output_file.create_dataset(
            'conditional_fluxes',
            shape=(iter_count, nstates, nstates), dtype=weight_dtype,
            chunks=(h5io.calc_chunksize((iter_count, nstates, nstates), weight_dtype)
                    if self.do_compression else None),
            shuffle=self.do_compression,
            compression=9 if self.do_compression else None)
        total_fluxes_ds = self.output_file.create_dataset(
            'total_fluxes',
            shape=(iter_count, nstates), dtype=weight_dtype,
            chunks=(h5io.calc_chunksize((iter_count, nstates), weight_dtype)
                    if self.do_compression else None),
            shuffle=self.do_compression,
            compression=9 if self.do_compression else None)
        cond_arrival_counts_ds = self.output_file.create_dataset(
            'conditional_arrivals',
            shape=(iter_count, nstates, nstates), dtype=numpy.uint,
            chunks=(h5io.calc_chunksize((iter_count, nstates, nstates), numpy.uint)
                    if self.do_compression else None),
            shuffle=self.do_compression,
            compression=9 if self.do_compression else None)
        arrival_counts_ds = self.output_file.create_dataset(
            'arrivals',
            shape=(iter_count, nstates), dtype=numpy.uint,
            chunks=(h5io.calc_chunksize((iter_count, nstates), numpy.uint)
                    if self.do_compression else None),
            shuffle=self.do_compression,
            compression=9 if self.do_compression else None)

        # copy state labels for convenience
        self.output_file['state_labels'] = self.assignments_file['state_labels'][...]

        # Put nice labels on things
        for ds in (self.output_file, durations_count_ds, cond_fluxes_ds, total_fluxes_ds):
            h5io.stamp_iter_range(ds, start_iter, stop_iter)

        # Calculate instantaneous rate matrices and trace trajectories
        last_state = None
        pi.new_operation('Tracing trajectories', iter_count)
        for iiter, n_iter in enumerate(range(start_iter, stop_iter)):
            # Get data from the main HDF5 file
            iter_group = self.data_reader.get_iter_group(n_iter)
            seg_index = iter_group['seg_index']
            nsegs, npts = iter_group['pcoord'].shape[0:2]
            weights = seg_index['weight']
            # parent_ids = seg_index['parent_id']
            parent_ids = self.data_reader.parent_id_dsspec.get_iter_data(n_iter)

            # Get bin and trajectory ensemble assignments from the previously-generated assignments file
            assignment_iiter = h5io.get_iteration_entry(self.assignments_file, n_iter)
            bin_assignments = numpy.require(
                self.assignments_file['assignments'][assignment_iiter + numpy.s_[:nsegs, :npts]],
                dtype=index_dtype)
            label_assignments = numpy.require(
                self.assignments_file['trajlabels'][assignment_iiter + numpy.s_[:nsegs, :npts]],
                dtype=index_dtype)

            # Prepare to run analysis
            cond_fluxes = numpy.zeros((nstates, nstates), weight_dtype)
            total_fluxes = numpy.zeros((nstates,), weight_dtype)
            cond_counts = numpy.zeros((nstates, nstates), numpy.uint)
            total_counts = numpy.zeros((nstates,), numpy.uint)
            durations = []

            # Estimate macrostate fluxes and calculate event durations using trajectory tracing;
            # state is opaque to the find_macrostate_transitions function
            state = _fast_transition_state_copy(iiter, nstates, parent_ids, last_state)
            find_macrostate_transitions(nstates, weights, label_assignments, 1.0 / (npts - 1), state,
                                        cond_fluxes, cond_counts, total_fluxes, total_counts, durations)
            last_state = state

            # Store trace-based kinetics data
            cond_fluxes_ds[iiter] = cond_fluxes
            total_fluxes_ds[iiter] = total_fluxes
            arrival_counts_ds[iiter] = total_counts
            cond_arrival_counts_ds[iiter] = cond_counts

            durations_count_ds[iiter] = len(durations)
            if len(durations) > 0:
                durations_ds.resize((iter_count, max(len(durations), durations_ds.shape[1])))
                durations_ds[iiter, :len(durations)] = durations

            # Do a little manual clean-up to prevent memory explosion
            del iter_group, weights, parent_ids, bin_assignments, label_assignments, state, cond_fluxes, total_fluxes
            pi.progress += 1
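
# Illustrative sketch (not part of the tool above): the 'durations' dataset uses h5py's
# resizable-dataset pattern -- create with a zero-length second axis and
# maxshape=(n_rows, None), then resize() only when a row needs more columns. The file
# name and record dtype below are hypothetical stand-ins for ed_list_dtype.
import h5py
import numpy

def _example_growable_rows(filename='example_durations.h5', n_rows=10):
    rec_dtype = numpy.dtype([('duration', numpy.float64), ('weight', numpy.float64)])
    with h5py.File(filename, 'w') as f:
        ds = f.create_dataset('durations', shape=(n_rows, 0), maxshape=(n_rows, None),
                              dtype=rec_dtype, chunks=(1, 64))
        for irow in range(n_rows):
            events = numpy.zeros((irow % 4,), dtype=rec_dtype)  # pretend rows yield 0-3 events
            if len(events):
                if len(events) > ds.shape[1]:
                    # grow the second axis on demand; shorter rows stay zero-padded
                    ds.resize((n_rows, len(events)))
                ds[irow, :len(events)] = events
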
def go(self):
    self.data_reader.open('r')
    output_file = h5io.WESTPAH5File(self.output_filename, mode='w')
    pi = self.progress.indicator

    iter_start, iter_stop = self.iter_range.iter_start, self.iter_range.iter_stop
    iter_count = iter_stop - iter_start

    output_file.create_dataset('n_iter', dtype=n_iter_dtype, data=list(range(iter_start, iter_stop)))
    current_seg_count = 0
    seg_count_ds = output_file.create_dataset('n_segs', dtype=numpy.uint, shape=(iter_count,))
    matching_segs_ds = output_file.create_dataset(
        'seg_ids',
        shape=(iter_count, 0), maxshape=(iter_count, None), dtype=seg_id_dtype,
        chunks=h5io.calc_chunksize((iter_count, 1000000), seg_id_dtype),
        shuffle=True, compression=9)
    weights_ds = output_file.create_dataset(
        'weights',
        shape=(iter_count, 0), maxshape=(iter_count, None), dtype=weight_dtype,
        chunks=h5io.calc_chunksize((iter_count, 1000000), weight_dtype),
        shuffle=True, compression=9)

    with pi:
        pi.new_operation('Finding matching segments', extent=iter_count)
        # futures = set()
        # for n_iter in xrange(iter_start,iter_stop):
        #     futures.add(self.work_manager.submit(_find_matching_segments,
        #                                          args=(self.data_reader.we_h5filename,n_iter,self.predicate,self.invert)))
        # for future in self.work_manager.as_completed(futures):
        for future in self.work_manager.submit_as_completed(
                ((_find_matching_segments,
                  (self.data_reader.we_h5filename, n_iter, self.predicate, self.invert),
                  {}) for n_iter in range(iter_start, iter_stop)),
                self.max_queue_len):
            n_iter, matching_ids = future.get_result()
            n_matches = len(matching_ids)

            if n_matches:
                if n_matches > current_seg_count:
                    current_seg_count = len(matching_ids)
                    matching_segs_ds.resize((iter_count, n_matches))
                    weights_ds.resize((iter_count, n_matches))
                    current_seg_count = n_matches

                seg_count_ds[n_iter - iter_start] = n_matches
                matching_segs_ds[n_iter - iter_start, :n_matches] = matching_ids
                weights_ds[n_iter - iter_start, :n_matches] = \
                    self.data_reader.get_iter_group(n_iter)['seg_index']['weight'][sorted(matching_ids)]
            del matching_ids
            pi.progress += 1

        if self.include_ancestors:
            pi.new_operation('Tracing ancestors of matching segments', extent=iter_count)
            from_previous = set()
            current_seg_count = matching_segs_ds.shape[1]
            for n_iter in range(iter_stop - 1, iter_start - 1, -1):
                iiter = n_iter - iter_start
                n_matches = seg_count_ds[iiter]
                matching_ids = set(from_previous)
                if n_matches:
                    matching_ids.update(matching_segs_ds[iiter, :seg_count_ds[iiter]])
                from_previous.clear()

                n_matches = len(matching_ids)
                if n_matches > current_seg_count:
                    matching_segs_ds.resize((iter_count, n_matches))
                    weights_ds.resize((iter_count, n_matches))
                    current_seg_count = n_matches

                if n_matches > 0:
                    seg_count_ds[iiter] = n_matches
                    matching_ids = sorted(matching_ids)
                    matching_segs_ds[iiter, :n_matches] = matching_ids
                    weights_ds[iiter, :n_matches] = \
                        self.data_reader.get_iter_group(n_iter)['seg_index']['weight'][sorted(matching_ids)]
                    parent_ids = self.data_reader.get_iter_group(n_iter)['seg_index']['parent_id'][sorted(matching_ids)]
                    from_previous.update(parent_id for parent_id in parent_ids if parent_id >= 0)  # filter initial states
                    del parent_ids
                del matching_ids
                pi.progress += 1
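
# Illustrative sketch (separate from the tool above): the include_ancestors branch walks
# iterations backwards, carrying a set of parent IDs from each iteration into the previous
# one. The toy data layout below (dicts of per-iteration parent_id arrays and matched
# seg_id lists) is hypothetical; negative parent_id values mark initial states.
def _example_trace_ancestors(parent_ids_by_iter, matches_by_iter, iter_start, iter_stop):
    selected = {}
    from_previous = set()
    for n_iter in range(iter_stop - 1, iter_start - 1, -1):
        matching_ids = set(from_previous)
        matching_ids.update(matches_by_iter.get(n_iter, ()))
        from_previous.clear()
        if matching_ids:
            matching_ids = sorted(matching_ids)
            selected[n_iter] = matching_ids
            parents = [parent_ids_by_iter[n_iter][seg_id] for seg_id in matching_ids]
            # negative parent ids refer to initial states and are not traced further
            from_previous.update(p for p in parents if p >= 0)
    return selected
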
def go(self):
    self.data_reader.open('r')
    assignments_file = h5py.File(self.assignments_filename, mode='r')
    output_file = h5io.WESTPAH5File(self.output_filename, mode='w')
    pi = self.progress.indicator
    count = self.count
    timepoint = self.timepoint

    nbins = assignments_file.attrs['nbins'] + 1
    assignments_ds = assignments_file['assignments']

    iter_start, iter_stop = self.iter_range.iter_start, self.iter_range.iter_stop
    iter_count = iter_stop - iter_start
    h5io.check_iter_range_least(assignments_ds, iter_start, iter_stop)
    nsegs = assignments_file['nsegs'][h5io.get_iteration_slice(assignments_file['nsegs'], iter_start, iter_stop)]

    output_file.create_dataset('n_iter', dtype=n_iter_dtype, data=list(range(iter_start, iter_stop)))

    seg_count_ds = output_file.create_dataset('nsegs', dtype=numpy.uint, shape=(iter_count, nbins))
    matching_segs_ds = output_file.create_dataset(
        'seg_ids',
        shape=(iter_count, nbins, count), dtype=seg_id_dtype,
        chunks=h5io.calc_chunksize((iter_count, nbins, count), seg_id_dtype),
        shuffle=True, compression=9)
    weights_ds = output_file.create_dataset(
        'weights',
        shape=(iter_count, nbins, count), dtype=weight_dtype,
        chunks=h5io.calc_chunksize((iter_count, nbins, count), weight_dtype),
        shuffle=True, compression=9)
    what = self.what

    with pi:
        pi.new_operation('Finding matching segments', extent=iter_count)
        for iiter, n_iter in enumerate(range(iter_start, iter_stop)):
            assignments = numpy.require(
                assignments_ds[h5io.get_iteration_entry(assignments_ds, n_iter) + numpy.index_exp[:, timepoint]],
                dtype=westpa.binning.index_dtype)
            all_weights = self.data_reader.get_iter_group(n_iter)['seg_index']['weight']

            # the following Cython function just executes this loop:
            # for iseg in xrange(nsegs[iiter]):
            #     segs_by_bin[iseg,assignments[iseg]] = True
            segs_by_bin = assignments_list_to_table(nsegs[iiter], nbins, assignments)
            for ibin in range(nbins):
                segs = numpy.nonzero(segs_by_bin[:, ibin])[0]
                seg_count_ds[iiter, ibin] = min(len(segs), count)
                if len(segs):
                    weights = all_weights.take(segs)

                    if what == 'lowweight':
                        indices = numpy.argsort(weights)[:count]
                    elif what == 'highweight':
                        indices = numpy.argsort(weights)[::-1][:count]
                    else:
                        assert what == 'random'
                        # keep at most `count` segments so the writes below match the
                        # (iter_count, nbins, count) output datasets
                        indices = numpy.random.permutation(len(weights))[:count]

                    matching_segs_ds[iiter, ibin, :len(indices)] = segs.take(indices)
                    weights_ds[iiter, ibin, :len(indices)] = weights.take(indices)
                    del segs, weights

            del assignments, segs_by_bin, all_weights
            pi.progress += 1
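
# Illustrative sketch (independent of the tool above): the per-bin selection reduces to an
# argsort over segment weights. This hypothetical helper shows the three selection modes
# on plain numpy arrays.
import numpy

def _example_select_segments(weights, count, what='lowweight'):
    weights = numpy.asarray(weights)
    if what == 'lowweight':
        indices = numpy.argsort(weights)[:count]        # smallest weights first
    elif what == 'highweight':
        indices = numpy.argsort(weights)[::-1][:count]  # largest weights first
    else:
        assert what == 'random'
        indices = numpy.random.permutation(len(weights))[:count]
    return indices, weights.take(indices)
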
def go(self):
    assert self.data_reader.parent_id_dsspec._h5file is None
    assert self.data_reader.weight_dsspec._h5file is None
    if hasattr(self.dssynth.dsspec, '_h5file'):
        assert self.dssynth.dsspec._h5file is None

    pi = self.progress.indicator
    pi.operation = 'Initializing'
    with pi, self.data_reader, WESTPAH5File(self.output_filename, 'w', creating_program=True) as self.output_file:
        assign = self.binning.mapper.assign

        # We always assign the entire simulation, so that no trajectory appears to start
        # in a transition region that doesn't get initialized in one.
        iter_start = 1
        iter_stop = self.data_reader.current_iteration

        h5io.stamp_iter_range(self.output_file, iter_start, iter_stop)

        nbins = self.binning.mapper.nbins
        self.output_file.attrs['nbins'] = nbins

        state_map = numpy.empty((self.binning.mapper.nbins + 1,), index_dtype)
        state_map[:] = 0  # state_id == nstates => unknown state

        # Recursive mappers produce a generator rather than a list of labels
        # so consume the entire generator into a list
        labels = [numpy.string_(label) for label in self.binning.mapper.labels]

        self.output_file.create_dataset('bin_labels', data=labels, compression=9)

        if self.states:
            nstates = len(self.states)
            state_map[:] = nstates  # state_id == nstates => unknown state
            state_labels = [numpy.string_(state['label']) for state in self.states]

            for istate, sdict in enumerate(self.states):
                assert state_labels[istate] == numpy.string_(sdict['label'])  # sanity check
                state_assignments = assign(sdict['coords'])
                for assignment in state_assignments:
                    state_map[assignment] = istate

            self.output_file.create_dataset('state_map', data=state_map, compression=9, shuffle=True)
            self.output_file['state_labels'] = state_labels  # + ['(unknown)']
        else:
            nstates = 0
        self.output_file.attrs['nstates'] = nstates
        # Stamp if this has been subsampled.
        self.output_file.attrs['subsampled'] = self.subsample

        iter_count = iter_stop - iter_start
        nsegs = numpy.empty((iter_count,), seg_id_dtype)
        npts = numpy.empty((iter_count,), seg_id_dtype)

        # scan for largest number of segments and largest number of points
        pi.new_operation('Scanning for segment and point counts', iter_stop - iter_start)
        for iiter, n_iter in enumerate(range(iter_start, iter_stop)):
            iter_group = self.data_reader.get_iter_group(n_iter)
            nsegs[iiter], npts[iiter] = iter_group['pcoord'].shape[0:2]
            pi.progress += 1
            del iter_group

        pi.new_operation('Preparing output')

        # create datasets
        self.output_file.create_dataset('nsegs', data=nsegs, shuffle=True, compression=9)
        self.output_file.create_dataset('npts', data=npts, shuffle=True, compression=9)

        max_nsegs = nsegs.max()
        max_npts = npts.max()

        assignments_shape = (iter_count, max_nsegs, max_npts)
        assignments_dtype = numpy.min_scalar_type(nbins)
        assignments_ds = self.output_file.create_dataset(
            'assignments',
            dtype=assignments_dtype, shape=assignments_shape,
            compression=4, shuffle=True,
            chunks=h5io.calc_chunksize(assignments_shape, assignments_dtype),
            fillvalue=nbins)
        if self.states:
            trajlabel_dtype = numpy.min_scalar_type(nstates)
            trajlabels_ds = self.output_file.create_dataset(
                'trajlabels',
                dtype=trajlabel_dtype, shape=assignments_shape,
                compression=4, shuffle=True,
                chunks=h5io.calc_chunksize(assignments_shape, trajlabel_dtype),
                fillvalue=nstates)
            statelabels_ds = self.output_file.create_dataset(
                'statelabels',
                dtype=trajlabel_dtype, shape=assignments_shape,
                compression=4, shuffle=True,
                chunks=h5io.calc_chunksize(assignments_shape, trajlabel_dtype),
                fillvalue=nstates)

        pops_shape = (iter_count, nstates + 1, nbins + 1)
        pops_ds = self.output_file.create_dataset(
            'labeled_populations',
            dtype=weight_dtype, shape=pops_shape,
            compression=4, shuffle=True,
            chunks=h5io.calc_chunksize(pops_shape, weight_dtype))
        h5io.label_axes(pops_ds, [numpy.string_(i) for i in ['iteration', 'state', 'bin']])

        pi.new_operation('Assigning to bins', iter_stop - iter_start)
        last_labels = None  # mapping of seg_id to last macrostate inhabited
        for iiter, n_iter in enumerate(range(iter_start, iter_stop)):  # get iteration info in this block
            if iiter == 0:
                last_labels = numpy.empty((nsegs[iiter],), index_dtype)
                last_labels[:] = nstates  # unknown state

            # Slices this iteration into n_workers groups of segments, submits them to wm,
            # splices results back together
            assignments, trajlabels, pops, statelabels = self.assign_iteration(n_iter, nstates, nbins, state_map, last_labels)

            # Do stuff with this iteration's results
            last_labels = trajlabels[:, -1].copy()
            assignments_ds[iiter, 0:nsegs[iiter], 0:npts[iiter]] = assignments
            pops_ds[iiter] = pops
            if self.states:
                trajlabels_ds[iiter, 0:nsegs[iiter], 0:npts[iiter]] = trajlabels
                statelabels_ds[iiter, 0:nsegs[iiter], 0:npts[iiter]] = statelabels

            pi.progress += 1
            del assignments, trajlabels, pops, statelabels

        for dsname in 'assignments', 'npts', 'nsegs', 'labeled_populations', 'statelabels':
            h5io.stamp_iter_range(self.output_file[dsname], iter_start, iter_stop)
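
# Illustrative sketch (not WESTPA API): the state_map built above is just an integer lookup
# table of length nbins + 1, filled with nstates ("unknown") and overwritten at the bins
# covered by each state definition; per-frame bin indices then map to macrostate labels in
# one fancy-indexing step. The 1-D digitize "mapper", boundaries, and state coordinates
# below are made-up stand-ins for a real bin mapper.
import numpy

def _example_state_map():
    boundaries = numpy.array([0.0, 1.0, 2.0, 3.0, 4.0])  # toy 1-D bin boundaries

    def assign(coords):
        return numpy.digitize(coords, boundaries)  # bin index per coordinate

    nbins = len(boundaries) + 1
    states = [{'label': 'bound', 'coords': [0.5]}, {'label': 'unbound', 'coords': [3.5]}]
    nstates = len(states)

    state_map = numpy.empty((nbins + 1,), numpy.uint16)
    state_map[:] = nstates  # state_id == nstates => unknown state
    for istate, sdict in enumerate(states):
        for assignment in assign(numpy.asarray(sdict['coords'], dtype=float)):
            state_map[assignment] = istate

    bin_assignments = assign(numpy.array([0.2, 1.7, 3.6]))  # per-frame bin indices
    return state_map[bin_assignments]                       # per-frame macrostate labels
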