Example #1
    def go(self):
        self.data_reader.open('r')
        assignments_file = h5py.File(self.assignments_filename, mode='r')
        output_file = h5io.WESTPAH5File(self.output_filename, mode='w')
        pi = self.progress.indicator
        count = self.count
        timepoint = self.timepoint

        nbins = assignments_file.attrs['nbins'] + 1
        assignments_ds = assignments_file['assignments']

        iter_start, iter_stop = self.iter_range.iter_start, self.iter_range.iter_stop
        iter_count = iter_stop - iter_start
        h5io.check_iter_range_least(assignments_ds, iter_start, iter_stop)
        nsegs = assignments_file['nsegs'][h5io.get_iteration_slice(
            assignments_file['nsegs'], iter_start, iter_stop)]

        output_file.create_dataset('n_iter',
                                   dtype=n_iter_dtype,
                                   data=list(range(iter_start, iter_stop)))

        seg_count_ds = output_file.create_dataset('nsegs',
                                                  dtype=np.uint,
                                                  shape=(iter_count, nbins))
        matching_segs_ds = output_file.create_dataset(
            'seg_ids',
            shape=(iter_count, nbins, count),
            dtype=seg_id_dtype,
            chunks=h5io.calc_chunksize((iter_count, nbins, count),
                                       seg_id_dtype),
            shuffle=True,
            compression=9)
        weights_ds = output_file.create_dataset('weights',
                                                shape=(iter_count, nbins,
                                                       count),
                                                dtype=weight_dtype,
                                                chunks=h5io.calc_chunksize(
                                                    (iter_count, nbins, count),
                                                    weight_dtype),
                                                shuffle=True,
                                                compression=9)
        what = self.what

        with pi:
            pi.new_operation('Finding matching segments', extent=iter_count)
            for iiter, n_iter in enumerate(range(iter_start, iter_stop)):
                assignments = np.require(assignments_ds[
                    h5io.get_iteration_entry(assignments_ds, n_iter) +
                    np.index_exp[:, timepoint]],
                                         dtype=westpa.binning.index_dtype)
                all_weights = self.data_reader.get_iter_group(
                    n_iter)['seg_index']['weight']

                # the following Cython function just executes this loop:
                # for iseg in range(nsegs[iiter]):
                #     segs_by_bin[iseg, assignments[iseg]] = True
                segs_by_bin = assignments_list_to_table(
                    nsegs[iiter], nbins, assignments)
                for ibin in range(nbins):
                    segs = np.nonzero(segs_by_bin[:, ibin])[0]

                    seg_count_ds[iiter, ibin] = min(len(segs), count)

                    if len(segs):
                        weights = all_weights.take(segs)

                        if what == 'lowweight':
                            indices = np.argsort(weights)[:count]
                        elif what == 'highweight':
                            indices = np.argsort(weights)[::-1][:count]
                        else:
                            assert what == 'random'
                            # limit the random selection to `count` entries as well,
                            # since the output datasets are only `count` wide
                            indices = np.random.permutation(len(weights))[:count]

                        matching_segs_ds[iiter, ibin, :len(indices)] = segs.take(indices)
                        weights_ds[iiter, ibin, :len(indices)] = weights.take(indices)
                        del segs, weights

                del assignments, segs_by_bin, all_weights
                pi.progress += 1
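
For context, a minimal sketch of how the output written by this example could be read back with plain h5py. The dataset names follow the create_dataset calls above; the filename 'ntop.h5' is only an assumption for illustration.

import h5py

# read back the per-bin segment selection written by go() above
with h5py.File('ntop.h5', 'r') as f:          # filename is an assumption
    n_iters = f['n_iter'][...]                # iteration numbers covered
    seg_counts = f['nsegs'][...]              # (iteration, bin) -> number of stored segments
    seg_ids = f['seg_ids'][...]               # (iteration, bin, count) selected segment IDs
    weights = f['weights'][...]               # corresponding segment weights
    print(n_iters[0], int(seg_counts[0].sum()))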
Example #2
    def w_kinetics(self):
        pi = self.progress.indicator
        pi.new_operation('Initializing')

        self.data_reader.open('r')
        self.open_files()
        nstates = self.assignments_file.attrs['nstates']
        start_iter, stop_iter = self.iter_range.iter_start, self.iter_range.iter_stop  # h5io.get_iter_range(self.assignments_file)
        iter_count = stop_iter - start_iter
        durations_ds = self.output_file.replace_dataset(
            'durations',
            shape=(iter_count, 0),
            maxshape=(iter_count, None),
            dtype=ed_list_dtype,
            chunks=(1, 15360) if self.do_compression else None,
            shuffle=self.do_compression,
            compression=9 if self.do_compression else None,
        )
        durations_count_ds = self.output_file.replace_dataset(
            'duration_count',
            shape=(iter_count, ),
            dtype=np.int_,
            shuffle=True,
            compression=9)
        cond_fluxes_ds = self.output_file.replace_dataset(
            'conditional_fluxes',
            shape=(iter_count, nstates, nstates),
            dtype=weight_dtype,
            chunks=(h5io.calc_chunksize(
                (iter_count, nstates,
                 nstates), weight_dtype) if self.do_compression else None),
            shuffle=self.do_compression,
            compression=9 if self.do_compression else None,
        )
        total_fluxes_ds = self.output_file.replace_dataset(
            'total_fluxes',
            shape=(iter_count, nstates),
            dtype=weight_dtype,
            chunks=(h5io.calc_chunksize(
                (iter_count,
                 nstates), weight_dtype) if self.do_compression else None),
            shuffle=self.do_compression,
            compression=9 if self.do_compression else None,
        )

        cond_arrival_counts_ds = self.output_file.replace_dataset(
            'conditional_arrivals',
            shape=(iter_count, nstates, nstates),
            dtype=np.uint,
            chunks=(h5io.calc_chunksize(
                (iter_count, nstates,
                 nstates), np.uint) if self.do_compression else None),
            shuffle=self.do_compression,
            compression=9 if self.do_compression else None,
        )
        arrival_counts_ds = self.output_file.replace_dataset(
            'arrivals',
            shape=(iter_count, nstates),
            dtype=np.uint,
            chunks=(h5io.calc_chunksize(
                (iter_count,
                 nstates), np.uint) if self.do_compression else None),
            shuffle=self.do_compression,
            compression=9 if self.do_compression else None,
        )

        # copy state labels for convenience
        self.output_file.replace_dataset(
            'state_labels', data=self.assignments_file['state_labels'][...])

        # Put nice labels on things
        for ds in (self.output_file, durations_count_ds, cond_fluxes_ds,
                   total_fluxes_ds):
            h5io.stamp_iter_range(ds, start_iter, stop_iter)

        # Calculate instantaneous rate matrices and trace trajectories
        last_state = None
        pi.new_operation('Tracing trajectories', iter_count)
        for iiter, n_iter in enumerate(range(start_iter, stop_iter)):
            # Get data from the main HDF5 file
            iter_group = self.data_reader.get_iter_group(n_iter)
            seg_index = iter_group['seg_index']
            nsegs, npts = iter_group['pcoord'].shape[0:2]
            weights = seg_index['weight']
            # parent_ids = seg_index['parent_id']
            parent_ids = self.data_reader.parent_id_dsspec.get_iter_data(
                n_iter)

            # Get bin and traj. ensemble assignments from the previously-generated assignments file
            assignment_iiter = h5io.get_iteration_entry(
                self.assignments_file, n_iter)
            bin_assignments = np.require(
                self.assignments_file['assignments'][assignment_iiter +
                                                     np.s_[:nsegs, :npts]],
                dtype=index_dtype)
            label_assignments = np.require(
                self.assignments_file['trajlabels'][assignment_iiter +
                                                    np.s_[:nsegs, :npts]],
                dtype=index_dtype)
            state_assignments = np.require(
                self.assignments_file['statelabels'][assignment_iiter +
                                                     np.s_[:nsegs, :npts]],
                dtype=index_dtype)

            # Prepare to run analysis
            cond_fluxes = np.zeros((nstates, nstates), weight_dtype)
            total_fluxes = np.zeros((nstates, ), weight_dtype)
            cond_counts = np.zeros((nstates, nstates), np.uint)
            total_counts = np.zeros((nstates, ), np.uint)
            durations = []

            # Estimate macrostate fluxes and calculate event durations using trajectory tracing
            # state is opaque to the find_macrostate_transitions function
            dt = 1.0 if npts == 1 else 1.0 / (npts - 1)
            state = _fast_transition_state_copy(iiter, nstates, parent_ids,
                                                last_state)
            find_macrostate_transitions(
                nstates,
                weights,
                label_assignments,
                state_assignments,
                dt,
                state,
                cond_fluxes,
                cond_counts,
                total_fluxes,
                total_counts,
                durations,
            )
            last_state = state

            # Store trace-based kinetics data
            cond_fluxes_ds[iiter] = cond_fluxes
            total_fluxes_ds[iiter] = total_fluxes
            arrival_counts_ds[iiter] = total_counts
            cond_arrival_counts_ds[iiter] = cond_counts

            durations_count_ds[iiter] = len(durations)
            if len(durations) > 0:
                durations_ds.resize(
                    (iter_count, max(len(durations), durations_ds.shape[1])))
                durations_ds[iiter, :len(durations)] = durations

            # Do a little manual clean-up to prevent memory explosion
            del iter_group, weights, parent_ids, bin_assignments, label_assignments, state, cond_fluxes, total_fluxes
            pi.progress += 1
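
As above, a small hedged sketch of how the trace-based kinetics output could be post-processed with h5py and NumPy; the filename 'kintrace.h5' is assumed, while the dataset names ('conditional_fluxes', 'state_labels') come from the replace_dataset calls in w_kinetics().

import h5py
import numpy as np

# average the per-iteration conditional flux matrices into one nstates x nstates matrix
with h5py.File('kintrace.h5', 'r') as f:      # filename is an assumption
    cond_fluxes = f['conditional_fluxes'][...]    # (iteration, nstates, nstates)
    state_labels = [s.decode() for s in f['state_labels'][...]]

mean_flux = cond_fluxes.mean(axis=0)
for i, src in enumerate(state_labels):
    for j, dst in enumerate(state_labels):
        if i != j:
            print('{} -> {}: {:.3e}'.format(src, dst, mean_flux[i, j]))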
Example #3
    def go(self):
        assert self.data_reader.parent_id_dsspec._h5file is None
        assert self.data_reader.weight_dsspec._h5file is None
        if hasattr(self.dssynth.dsspec, '_h5file'):
            assert self.dssynth.dsspec._h5file is None
        pi = self.progress.indicator
        pi.operation = 'Initializing'
        with pi, self.data_reader, WESTPAH5File(
                self.output_filename, 'w',
                creating_program=True) as self.output_file:
            assign = self.binning.mapper.assign

            # We always assign the entire simulation, so that no trajectory appears to start
            # in a transition region that doesn't get initialized in one.
            iter_start = 1
            iter_stop = self.data_reader.current_iteration

            h5io.stamp_iter_range(self.output_file, iter_start, iter_stop)

            nbins = self.binning.mapper.nbins
            self.output_file.attrs['nbins'] = nbins

            state_map = np.empty((self.binning.mapper.nbins + 1, ),
                                 index_dtype)
            state_map[:] = 0  # default mapping; overwritten below (state_id == nstates => unknown state) when states are defined

            # Recursive mappers produce a generator rather than a list of labels
            # so consume the entire generator into a list
            labels = [
                np.string_(label) for label in self.binning.mapper.labels
            ]

            self.output_file.create_dataset('bin_labels',
                                            data=labels,
                                            compression=9)

            if self.states:
                nstates = len(self.states)
                state_map[:] = nstates  # state_id == nstates => unknown state
                state_labels = [
                    np.string_(state['label']) for state in self.states
                ]

                for istate, sdict in enumerate(self.states):
                    assert state_labels[istate] == np.string_(
                        sdict['label'])  # sanity check
                    state_assignments = assign(sdict['coords'])
                    for assignment in state_assignments:
                        state_map[assignment] = istate
                self.output_file.create_dataset('state_map',
                                                data=state_map,
                                                compression=9,
                                                shuffle=True)
                self.output_file[
                    'state_labels'] = state_labels  # + ['(unknown)']
            else:
                nstates = 0
            self.output_file.attrs['nstates'] = nstates
            # Stamp if this has been subsampled.
            self.output_file.attrs['subsampled'] = self.subsample

            iter_count = iter_stop - iter_start
            nsegs = np.empty((iter_count, ), seg_id_dtype)
            npts = np.empty((iter_count, ), seg_id_dtype)

            # scan for largest number of segments and largest number of points
            pi.new_operation('Scanning for segment and point counts',
                             iter_stop - iter_start)
            for iiter, n_iter in enumerate(range(iter_start, iter_stop)):
                iter_group = self.data_reader.get_iter_group(n_iter)
                nsegs[iiter], npts[iiter] = iter_group['pcoord'].shape[0:2]
                pi.progress += 1
                del iter_group

            pi.new_operation('Preparing output')

            # create datasets
            self.output_file.create_dataset('nsegs',
                                            data=nsegs,
                                            shuffle=True,
                                            compression=9)
            self.output_file.create_dataset('npts',
                                            data=npts,
                                            shuffle=True,
                                            compression=9)

            max_nsegs = nsegs.max()
            max_npts = npts.max()

            assignments_shape = (iter_count, max_nsegs, max_npts)
            assignments_dtype = np.min_scalar_type(nbins)
            assignments_ds = self.output_file.create_dataset(
                'assignments',
                dtype=assignments_dtype,
                shape=assignments_shape,
                compression=4,
                shuffle=True,
                chunks=h5io.calc_chunksize(assignments_shape,
                                           assignments_dtype),
                fillvalue=nbins,
            )
            if self.states:
                trajlabel_dtype = np.min_scalar_type(nstates)
                trajlabels_ds = self.output_file.create_dataset(
                    'trajlabels',
                    dtype=trajlabel_dtype,
                    shape=assignments_shape,
                    compression=4,
                    shuffle=True,
                    chunks=h5io.calc_chunksize(assignments_shape,
                                               trajlabel_dtype),
                    fillvalue=nstates,
                )
                statelabels_ds = self.output_file.create_dataset(
                    'statelabels',
                    dtype=trajlabel_dtype,
                    shape=assignments_shape,
                    compression=4,
                    shuffle=True,
                    chunks=h5io.calc_chunksize(assignments_shape,
                                               trajlabel_dtype),
                    fillvalue=nstates,
                )

            pops_shape = (iter_count, nstates + 1, nbins + 1)
            pops_ds = self.output_file.create_dataset(
                'labeled_populations',
                dtype=weight_dtype,
                shape=pops_shape,
                compression=4,
                shuffle=True,
                chunks=h5io.calc_chunksize(pops_shape, weight_dtype),
            )
            h5io.label_axes(
                pops_ds,
                [np.string_(i) for i in ['iteration', 'state', 'bin']])

            pi.new_operation('Assigning to bins', iter_stop - iter_start)
            last_labels = None  # mapping of seg_id to last macrostate inhabited
            for iiter, n_iter in enumerate(range(iter_start, iter_stop)):
                # get iteration info in this block

                if iiter == 0:
                    last_labels = np.empty((nsegs[iiter], ), index_dtype)
                    last_labels[:] = nstates  # unknown state

                # Slices this iteration into n_workers groups of segments, submits them to wm, splices results back together
                assignments, trajlabels, pops, statelabels = self.assign_iteration(
                    n_iter, nstates, nbins, state_map, last_labels)

                # Do stuff with this iteration's results

                last_labels = trajlabels[:, -1].copy()
                assignments_ds[iiter, 0:nsegs[iiter],
                               0:npts[iiter]] = assignments
                pops_ds[iiter] = pops
                if self.states:
                    trajlabels_ds[iiter, 0:nsegs[iiter],
                                  0:npts[iiter]] = trajlabels
                    statelabels_ds[iiter, 0:nsegs[iiter],
                                   0:npts[iiter]] = statelabels

                pi.progress += 1
                del assignments, trajlabels, pops, statelabels

            # 'statelabels' only exists when states were defined above
            stamped_dsnames = ['assignments', 'npts', 'nsegs', 'labeled_populations']
            if self.states:
                stamped_dsnames.append('statelabels')
            for dsname in stamped_dsnames:
                h5io.stamp_iter_range(self.output_file[dsname], iter_start,
                                      iter_stop)
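
A brief sketch, under the assumption that the assignment file written above is named 'assign.h5', of how the 'labeled_populations' dataset could be collapsed into per-state populations; only standard h5py/NumPy calls are used.

import h5py

# collapse labeled populations over bins to get per-state populations per iteration
with h5py.File('assign.h5', 'r') as f:        # filename is an assumption
    pops = f['labeled_populations'][...]      # (iteration, nstates + 1, nbins + 1)
    nstates = int(f.attrs['nstates'])

state_pops = pops.sum(axis=2)                 # sum over bins -> (iteration, nstates + 1)
print(state_pops[:, :nstates])                # drop the trailing 'unknown' pseudo-state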
Example #4
    def go(self):
        self.data_reader.open('r')
        output_file = h5io.WESTPAH5File(self.output_filename, mode='w')
        pi = self.progress.indicator

        iter_start, iter_stop = self.iter_range.iter_start, self.iter_range.iter_stop
        iter_count = iter_stop - iter_start

        output_file.create_dataset('n_iter',
                                   dtype=n_iter_dtype,
                                   data=list(range(iter_start, iter_stop)))
        current_seg_count = 0
        seg_count_ds = output_file.create_dataset('n_segs',
                                                  dtype=np.uint,
                                                  shape=(iter_count, ))
        matching_segs_ds = output_file.create_dataset(
            'seg_ids',
            shape=(iter_count, 0),
            maxshape=(iter_count, None),
            dtype=seg_id_dtype,
            chunks=h5io.calc_chunksize((iter_count, 1000000), seg_id_dtype),
            shuffle=True,
            compression=9,
        )
        weights_ds = output_file.create_dataset(
            'weights',
            shape=(iter_count, 0),
            maxshape=(iter_count, None),
            dtype=weight_dtype,
            chunks=h5io.calc_chunksize((iter_count, 1000000), weight_dtype),
            shuffle=True,
            compression=9,
        )

        with pi:
            pi.new_operation('Finding matching segments', extent=iter_count)
            #             futures = set()
            #             for n_iter in xrange(iter_start,iter_stop):
            #                 futures.add(self.work_manager.submit(_find_matching_segments,
            #                                                      args=(self.data_reader.we_h5filename,n_iter,self.predicate,self.invert)))

            #             for future in self.work_manager.as_completed(futures):
            for future in self.work_manager.submit_as_completed(
                ((_find_matching_segments,
                  (self.data_reader.we_h5filename, n_iter, self.predicate,
                   self.invert), {})
                 for n_iter in range(iter_start, iter_stop)),
                    self.max_queue_len,
            ):
                n_iter, matching_ids = future.get_result()
                n_matches = len(matching_ids)

                if n_matches:
                    if n_matches > current_seg_count:
                        matching_segs_ds.resize((iter_count, n_matches))
                        weights_ds.resize((iter_count, n_matches))
                        current_seg_count = n_matches

                    # Sort once so that the stored seg_ids line up with the
                    # weights read back below (h5py fancy indexing requires
                    # increasing indices anyway).
                    matching_ids = sorted(matching_ids)
                    seg_count_ds[n_iter - iter_start] = n_matches
                    matching_segs_ds[n_iter - iter_start, :n_matches] = matching_ids
                    weights_ds[n_iter - iter_start, :n_matches] = (
                        self.data_reader.get_iter_group(n_iter)
                        ['seg_index']['weight'][matching_ids])
                del matching_ids
                pi.progress += 1

            if self.include_ancestors:
                pi.new_operation('Tracing ancestors of matching segments',
                                 extent=iter_count)
                from_previous = set()
                current_seg_count = matching_segs_ds.shape[1]
                for n_iter in range(iter_stop - 1, iter_start - 1, -1):
                    iiter = n_iter - iter_start
                    n_matches = seg_count_ds[iiter]
                    matching_ids = set(from_previous)
                    if n_matches:
                        matching_ids.update(
                            matching_segs_ds[iiter, :seg_count_ds[iiter]])
                    from_previous.clear()

                    n_matches = len(matching_ids)
                    if n_matches > current_seg_count:
                        matching_segs_ds.resize((iter_count, n_matches))
                        weights_ds.resize((iter_count, n_matches))
                        current_seg_count = n_matches

                    if n_matches > 0:
                        seg_count_ds[iiter] = n_matches
                        matching_ids = sorted(matching_ids)
                        matching_segs_ds[iiter, :n_matches] = matching_ids
                        weights_ds[iiter, :n_matches] = (
                            self.data_reader.get_iter_group(n_iter)
                            ['seg_index']['weight'][matching_ids])
                        parent_ids = self.data_reader.get_iter_group(n_iter)[
                            'seg_index']['parent_id'][matching_ids]
                        from_previous.update(
                            parent_id for parent_id in parent_ids
                            if parent_id >= 0)  # filter initial states
                        del parent_ids
                    del matching_ids
                    pi.progress += 1
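
Finally, a minimal sketch of reading back the selection output produced by this example; the filename 'select.h5' is hypothetical, while 'n_iter', 'n_segs' and 'weights' are the datasets created above.

import h5py

# report the number of matching segments and their total weight per iteration
with h5py.File('select.h5', 'r') as f:        # filename is an assumption
    n_iters = f['n_iter'][...]
    n_segs = f['n_segs'][...]
    weights_ds = f['weights']
    for idx, n_iter in enumerate(n_iters):
        nm = int(n_segs[idx])
        print(int(n_iter), nm, float(weights_ds[idx, :nm].sum()))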