Example #1
class DataTransformation(object):
    """
    Args:
        source ([dict]): list of records
        config (Configuration): configuration to apply to each record
    """

    def __init__(self, source, config, single_process=False, processes=cpu_count()):
        if single_process:
            self.results = imap(transform, izip(source, repeat(config)))
        else:
            self.pool = Pool(processes)
            self.results = self.pool.imap(transform, izip(source, repeat(config)))

    def __iter__(self):
        return self

    def next(self):
        return self.results.next()

    def __del__(self):
        try:
            self.pool.close()
        except:  # no pool exists in single-process mode
            pass
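A minimal usage sketch (not part of the original project): it assumes `transform` and `Configuration` are importable from the same module and that `Configuration` takes no constructor arguments. Because the class implements `next()` rather than `__next__()`, it iterates only under Python 2.

records = [{'id': 1}, {'id': 2}]                     # toy input records
config = Configuration()                             # assumed no-arg constructor
# single_process=True keeps everything in-process for small inputs
transformed = list(DataTransformation(records, config, single_process=True))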
Example #2
 def run_mcmc(self):
     complete = self.chains.check_completness()
     if not complete:
         for i in range(self.nchains):
             self.chains.chains.append(
                 Chain(self.chains.chains_filename, self.covfile, nchain=i, nsteps=self.nsteps))
         pool = Pool(processes=self.nchains)
         try:
             # Without the .get(999999) timeout, you can't interrupt this with Ctrl+C.
             pool.map_async(self.mcmc, self.chains.chains).get(999999)
             pool.close()
             pool.join()
             # to skip lines after the progress bars
             print '\n' * self.nchains
         except KeyboardInterrupt:
             pool.terminate()
     self.likelihood = self.chains.chains_to_likelihood()
     self.likelihood.stats(self.covfile)
     # [self.results[i].append(self.likelihood.pdfs[i].mean) for i in range(self.chains.dim)]
     # self.p = [self.likelihood.pdfs[i].mean for i in range(self.chains.dim)]
     self.p = self.chains.best_row_params
     self.simulation(self.spectrum.lambdas, *self.p)
     # [self.results_err[i].append([self.likelihood.pdfs[i].error_high,self.likelihood.pdfs[i].error_low]) for i in range(self.chains.dim)]
     # if(self.plot):
     self.likelihood.triangle_plots()
     self.plot_fit()
     # if convergence_test :
     self.chains.convergence_tests()
     return self.likelihood
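The `.get(999999)` idiom above is a general workaround: on Python 2, a plain `Pool.map()` blocks in a way that swallows `KeyboardInterrupt`, whereas `map_async(...).get(<large timeout>)` stays interruptible. A standalone sketch of the pattern with a plain `multiprocessing.Pool`:

from multiprocessing import Pool

def work(x):
    return x * x

if __name__ == '__main__':
    pool = Pool(4)
    try:
        # the large timeout keeps the main process responsive to Ctrl+C
        results = pool.map_async(work, range(100)).get(999999)
        pool.close()
        pool.join()
    except KeyboardInterrupt:
        pool.terminate()
        pool.join()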
Example #3
File: project.py  Project: xdic/pyem
def main(args):
    dens = EMData(args.map)
    star = parse_star(args.input, keep_index=False)
    star[["ImageNumber", "ImageName"]] = star['rlnImageName'].str.split("@", expand=True)
    grouped = star.groupby("ImageName")
    pool = None
    if args.nproc > 1:
        pool = Pool(processes=args.nproc)
        results = pool.imap(lambda x: project_stack(x, dens, args.dest), (group for name, group in grouped))
    else:
        results = (project_stack(group, dens, args.dest) for name, group in grouped)
    i = 0
    t = 0
    for r in results:
        i += 1
        t += r
        sys.stdout.write("\rProjected %d particles in %d stacks" % (t, i))
        sys.stdout.flush()

    if pool is not None:
        pool.close()
        pool.join()

    sys.stdout.write('\n')
    sys.stdout.flush()

    return 0
Example #4
 def multiproc_map(self, func):
     from pathos.multiprocessing import Pool
     pool = Pool()
     result = List(pool.map(func, self))
     pool.close()
     pool.join()
     return result
Example #5
def apply_by_multiprocessing(df, func, **kwargs):
    workers = kwargs.pop('workers')
    pool = Pool(processes=workers)


    result = pool.map(_apply_df, [(d, func, kwargs)
                                  for d in np.array_split(df, workers)])
    pool.close()
    return pd.concat(list(result))
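A usage sketch for `apply_by_multiprocessing`, assuming the companion `_apply_df` helper forwards each chunk to `DataFrame.apply(func, **kwargs)`, as in the widely shared pandas split/apply recipe this snippet follows:

import numpy as np
import pandas as pd

df = pd.DataFrame({'x': np.arange(8), 'y': np.arange(8) * 10})
# row-wise sums computed on 4 chunks in parallel; the extra keyword
# argument (axis=1) is forwarded to the underlying apply call
row_sums = apply_by_multiprocessing(df, np.sum, workers=4, axis=1)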
Example #6
    def vw_train_and_test(self, options_list, data_file_paths):
        def init_worker():
            signal.signal(signal.SIGINT, signal.SIG_IGN)

        def run_learner(options):
            train_options = {
                'final_regressor': os.path.join(self.work_dir, id_generator()),
                'data': data_file_paths['train'],
                'cache_file': data_file_paths['train'] + '.cache'
            }
            train_options.update(options)
            test_options = {
                'data': data_file_paths['test'],
                'predictions': os.path.join(self.work_dir, id_generator()),
                'cache_file': data_file_paths['test'] + '.cache'
            }
            test_options.update(options)

            # TODO: remove the if-block below.
            if 'kill_cache' in options:
                del train_options['kill_cache']
                del test_options['kill_cache']

            vw_wrapper = VW_Wrapper(verbose=False)
            vw_wrapper.train(train_options)
            predictions = vw_wrapper.test(test_options)
            os.remove(train_options['final_regressor'])
            os.remove(test_options['predictions'])
            return options, predictions

        if len(options_list) > 1:
            try:
                if not os.path.isfile(data_file_paths['test'] +
                                      '.cache') or not os.path.isfile(
                                          data_file_paths['train'] + '.cache'):
                    run_learner(options_list[0])
                pool = Pool(len(options_list), init_worker)
                result_list = pool.map_async(run_learner,
                                             options_list).get(99999999)
                pool.close()
                pool.join()
                return result_list
            except KeyboardInterrupt:
                print '  Keyboard Interrupt, exiting...'
                pool.terminate()
                pool.join()
                sys.exit(0)

        elif len(options_list) == 1:
            return [run_learner(options_list[0])]
        else:
            return []
Example #7
    def create_histograms(self):
        logger.debug('--->Systematics::create_histograms:')
        # collect ROOT objects
        self._root_objects_holder = RootObjects(self._output_file)

        if self._num_threads == 1:
            for systematic in self._systematics:
                logger.debug("---->Create ROOT objects for systematic %s.",
                             systematic.name)
                if logger.getEffectiveLevel() == 10:  # 10 == logging.DEBUG
                    print '---->Systematics::create_histograms: systematic', systematic.process, systematic._process.estimation_method._friend_directories
                systematic.create_root_objects()
        else:
            logger.debug("Create ROOT objects for all systematics.")

            from pathos.multiprocessing import Pool
            pool = Pool(processes=self._num_threads)

            systematics_new = pool.map(systematic_create_root_objects,
                                       [s for s in self._systematics])
            pool.close()
            pool.join()
            del pool

            # Because the new objects have different addresses in memory,
            # the result objects have to be copied.
            for i_sys in range(len(systematics_new)):
                self._systematics[i_sys] = systematics_new[i_sys]
        logger.debug('-->Create root holders')
        for systematic in self._systematics:
            if self._find_unique_objects:
                self._root_objects_holder.add_unique(systematic.root_objects)
            else:
                self._root_objects_holder.add(systematic.root_objects)
        # self._root_objects_holder.check_duplicates() # TODO: Implement this if needed

        # produce ROOT objects (in parallel)
        logger.debug("Produce ROOT objects using the %s backend.",
                     self._backend)
        logger.debug('-->Produce root with ' + self._backend + ' backend')
        if self._backend == "classic":
            self._root_objects_holder.produce_classic(self._num_threads)
        elif self._backend == "tdf":
            self._root_objects_holder.produce_tdf(self._num_threads)
        else:
            logger.fatal("Backend %s is not implemented.", self._backend)
            raise Exception

        # set duplicates to the produced ROOT objects
        logger.debug('--># set duplicates to the produced ROOT objects')
        if self._find_unique_objects:
            self._root_objects_holder.set_duplicates()
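The copy-back loop above exists because worker processes operate on pickled copies: anything mutated inside `pool.map` is lost unless the returned objects are assigned back. A minimal sketch of that pattern, assuming only `pathos` is installed:

from pathos.multiprocessing import Pool

class Item(object):
    def __init__(self, value):
        self.value = value

def enrich(item):
    item.value += 1        # mutates the worker's copy only
    return item            # so the modified copy must be returned

items = [Item(i) for i in range(4)]
pool = Pool(2)
new_items = pool.map(enrich, items)
pool.close()
pool.join()
items[:] = new_items       # overwrite the originals with the returned copies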
Example #8
File: struc2vec.py  Project: Yelrose/PGL
    def calc_distances_between_nodes(self):
        """
        Use the dtw algorithm to calculate the distance between nodes. 
        """
        from fastdtw import fastdtw
        from pathos.multiprocessing import Pool
        # decide which distance algorithm to use
        if self.opt1:
            self.distance_calc_func = self.distance_opt1_func
        else:
            self.distance_calc_func = self.distance_func

        dtws = []
        if self.opt2:
            depth = 0
            for node in self.nodes:
                if node in self.degree_list:
                    if depth in self.degree_list[node]:
                        degree = self.degree_list[node][depth]
                        if args.opt1:
                            degree = degree[0][0]
                        else:
                            degree = degree[0]
                    if degree not in self.degree2nodes:
                        self.degree2nodes[degree] = []
                    if node not in self.node2degree:
                        self.node2degree[node] = degree
                    self.degree2nodes[degree].append(node)
            # select the log(n) node to select data 
            degree_keys = self.degree2nodes.keys()
            degree_keys = np.array(list(degree_keys), dtype='int')
            self.degrees_sorted = list(np.sort(degree_keys))
            selected_nbh_nums = 2 * math.log(self.graph.num_nodes - 1, 2)
            self.selected_nbh_nums = selected_nbh_nums

            pool = Pool(10)
            dtws = pool.map(self.calc_node_with_neighbor_dtw_opt2, self.nodes)
            pool.close()
            pool.join()
        else:
            src_indices = range(0, self.graph.num_nodes - 2)

            pool = Pool(10)
            dtws = pool.map(self.calc_node_with_neighbor_dtw, src_indices)
            pool.close()
            pool.join()
        print('calc the dtw done.')
        for dtw in dtws:
            self.distance.update(dtw)
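For reference, `fastdtw` itself takes two sequences plus an optional point-wise distance and returns the DTW cost together with the warping path; the call below mirrors the package's documented example (treat the exact arguments as an assumption here, since the snippet above wraps it in project-specific helpers):

import numpy as np
from scipy.spatial.distance import euclidean
from fastdtw import fastdtw

x = np.array([[1, 1], [2, 2], [3, 3], [4, 4], [5, 5]])
y = np.array([[2, 2], [3, 3], [4, 4]])
distance, path = fastdtw(x, y, dist=euclidean)   # DTW cost and index alignment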
Example #9
def solve_parallel(nSolns, T, Nc, v, nJobs=None, **kwargs):
    """
    Run solve() in parallel
    """
    if nJobs is None:
        nJobs = cpu_count()
    assert nJobs > 0

    def wrapped_solve(i):
        rng = np.random.RandomState()
        return solve(T, Nc, v, rng=rng, **kwargs)

    p = Pool(nJobs)
    Solutions = p.map(wrapped_solve, list(range(nSolns)))
    p.close()
    return Solutions
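The fresh `np.random.RandomState()` created inside `wrapped_solve` is deliberate: forked workers otherwise inherit the parent's global NumPy RNG state and can produce identical "random" solutions. A small sketch of the per-call seeding idea (names here are illustrative, not from the source):

import numpy as np
from pathos.multiprocessing import Pool

def draw_sample(_):
    rng = np.random.RandomState()      # freshly seeded from OS entropy in each call
    return rng.randint(0, 1000)

pool = Pool(4)
samples = pool.map(draw_sample, range(8))
pool.close()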
Example #10
File: preprocess.py  Project: yakzan/ktext
def apply_parallel(data: List[Any], func: Callable) -> List[Any]:
    """
    Apply function to list of elements.

    Automatically determines the chunk size.
    """
    cpu_cores = cpu_count()

    pool = Pool(cpu_cores)
    try:
        chunk_size = ceil(len(data) / cpu_cores)
        transformed_data = pool.map(func,
                                    chunked(data, chunk_size),
                                    chunksize=1)
    finally:
        pool.close()
        pool.join()
    return transformed_data
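A usage sketch for `apply_parallel`, assuming `chunked` comes from `more_itertools` (or an equivalent helper) and that `func` therefore receives a whole chunk, i.e. a list of elements, per call:

def count_tokens(chunk):
    # chunk is a list of documents; return one value per document
    return [len(doc.split()) for doc in chunk]

docs = ['a b c', 'd e', 'f g h i'] * 100
chunk_results = apply_parallel(docs, count_tokens)   # one result list per chunk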
Example #11
def process_batch(df, directory, engine, num_cores=12):

    p = Pool(num_cores)

    try:
        success = pd.Series(p.map(lambda i: write_indexes_to_table(df.loc[i, 'file_name'],
                                                                   df.loc[i, 'document'],
                                                                   df.loc[i, 'form_type'],
                                                                   directory, engine),
                                  range(df.shape[0])))
        p.close()

        num_success = success.sum()
        return num_success

    except Exception as e:

        print(e)
        p.close()
        return None
Example #12
class DataFusion(object):
    """
    Args:
        source ([tuple]): list of tuples, each containing the records to merge
        config (Configuration): configuration to apply to each tuple
    """

    def __init__(self, source, config, processes=cpu_count()):
        self._pool = Pool(processes)
        self._results = self._pool.imap(_merge_records, izip(source, repeat(config)))

    def __iter__(self):
        return self

    def next(self):
        return self._results.next()

    def __del__(self):
        self._pool.close()
Example #13
File: struc2vec.py  Project: Yelrose/PGL
    def random_walk_structual_sim(self):
        """
        Walk paths according to the structural distance.
        """
        from pathos.multiprocessing import Pool
        print('start process struc2vec random walk.')
        walks_process_ids = [i for i in range(0, self.num_walks)]
        pool = Pool(10)
        walks = pool.map(self.executor_random_walk, walks_process_ids)
        pool.close()
        pool.join()

        #save the final walk result 
        file_result = open(args.tag + "_walk_path", "w")
        for walk in walks:
            for walk_node in walk:
                walk_node_str = " ".join([str(node) for node in walk_node])
                file_result.write(walk_node_str + "\n")
        file_result.close()
        print('process struc2vec random walk done.')
Example #14
    def calculate_centroids(self, p=None):
        """
        Perform integration to find centroid at all turns up to N. Multiprocessing pool used to calculate independent
        turn values.
        Will automatically use `integrate_first_order` or `integrate_second_order` if appropriate.
        Args:
            p: Specify number of processes for pool. If not given then `cpu_count` is used.

        Returns:
            array of floats
        """
        if p:
            pool_size = p
        else:
            pool_size = cpu_count()

        pool = Pool(pool_size)

        #  attempt to speed things up by spreading out difficult integration values at the end of range
        #  appeared to not work
        #     x = []
        #     for i in range(cpu_count()):
        #         x += range(N)[i::4]

        if len(self.mu) == 1:
            integration_function = self.integrate_first_order
        elif len(self.mu) == 2:
            integration_function = self.integrate_second_order
        else:
            integration_function = self.integrate_any_order

        x = range(self.N)
        results = pool.map(integration_function, x)
        pool.close()

        return results
Example #16
    def learnMulti(self, gn, top_n=5):
        """
		Learn using multiprocessing

		gn: initial grammar
		top_n: save the best top_n grammars
		"""

        # settings
        max_process = cpu_count()  # number of worker processes to run in parallel

        pool = Pool(max_process)
        P_LOCK = Manager().Lock()  # mutex lock for printing
        gidCounter = 0
        results = []
        gList = [gn]
        gnBest = [deepcopy(gn)]
        bestmdl_last = gn.mdl  # best MDL on the upper level nodes

        # uncomment this for debugging
        #history_pri_lik= set() # history of prior & likelihood values

        while len(gList) > 0:
            gn = gList.pop(0)  # grammar node to be expanded
            self.printMsg(
                1, '>> gList size: %d, bestMDL: %.3f (#%d)' %
                (len(gList), gnBest[0].mdl, gnBest[0].gid))

            # substitute if possible
            ntlist = self.getFirstDLThack(gn.dlt)
            if len(ntlist) > 0:
                self.printMsg(2,
                              '>> Possible substitutions on #%d:' % (gn.gid),
                              ntlist)
                while len(ntlist) > 0:
                    gidCounter += 1
                    argList = [gn, ntlist.pop(), gidCounter, P_LOCK]
                    results.append(
                        pool.apply_async(self.substituteMulti, [argList]))
            else:
                self.printMsg(
                    2, '>> No more SUBSTITUTE possible on #%d\n' % gn.gid)

            # merge if possible
            ntlist = self.mergeSet(gn.g)
            if len(ntlist) > 0:
                self.printMsg(2, '>> Possible merges on #%d:' % (gn.gid),
                              ntlist)
                while len(ntlist) > 0:
                    gidCounter += 1
                    argList = [gn, ntlist.pop(), gidCounter, P_LOCK]
                    results.append(pool.apply_async(self.mergeMulti,
                                                    [argList]))
            else:
                self.printMsg(2,
                              '>> No more MERGE possible on #%d.\n' % gn.gid)

            del gn
            delList = []
            bestmdl = bestmdl_last

            # search next level in the search tree
            while len(results) >= 1:
                time.sleep(0.001)  # avoid wasting resource
                for r in range(len(results)):
                    if results[r].ready():
                        delList.append(r)
                        gn_new = results[r].get()

                        # uncomment for debugging
                        #if gn_new.lik < self.max_mdl:
                        #	history_pri_lik.add((gn_new.pri,gn_new.lik))

                        # save the best-N grammars
                        if len(gnBest) < top_n:
                            gnBest.append(gn_new)
                            gnBest = sorted(gnBest, key=lambda gn: gn.mdl)
                        elif gn_new.mdl < gnBest[-1].mdl:
                            del gnBest[-1]
                            gnBest.append(deepcopy(gn_new))
                            gnBest = sorted(gnBest, key=lambda gn: gn.mdl)

                        # save this level's best mdl
                        if gn_new.mdl < bestmdl:
                            bestmdl = gn_new.mdl

                        # beam search: compare with the best mdl on the upper level
                        if gn_new.mdl >= bestmdl_last:
                            gn_new.worse += 1
                        if gn_new.worse < BEAMSIZE:
                            gList.append(gn_new)
                        else:
                            del gn_new

                delList.sort()
                for d in range(len(delList)):
                    del results[delList.pop()]
            bestmdl_last = bestmdl

        pool.close()
        pool.join()

        return gnBest
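Stripped of the grammar-specific bookkeeping, the submit-and-poll machinery above boils down to `apply_async` plus `AsyncResult.ready()`/`get()`. A sketch of just that pattern (not the author's code):

import time
from pathos.multiprocessing import Pool

def slow_square(x):
    time.sleep(0.1)
    return x * x

pool = Pool(4)
pending = [pool.apply_async(slow_square, (i,)) for i in range(8)]
finished = []
while pending:
    time.sleep(0.001)                      # avoid a busy wait
    still_pending = []
    for task in pending:
        if task.ready():
            finished.append(task.get())    # collect the completed result
        else:
            still_pending.append(task)
    pending = still_pending
pool.close()
pool.join()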
Example #17
def main(options):
    """
    Projection subtraction program entry point.
    :param options: Command-line arguments parsed by ArgumentParser.parse_args()
    :return: Exit status
    """
    rchop = lambda x, y: x if not x.endswith(y) or len(y) == 0 else x[:-len(y)]
    options.output = rchop(options.output, ".star")
    options.suffix = rchop(options.suffix, ".mrc")
    options.suffix = rchop(options.suffix, ".mrcs")

    star = StarFile(options.input)
    npart = len(star['rlnImageName'])

    sub_dens = EMData(options.submap)

    if options.wholemap is not None:
        dens = EMData(options.wholemap)
    else:
        print "Reference map is required."
        return 1

    # Write star header for output.star.
    top_header = "\ndata_\n\nloop_\n"
    headings = star.keys()
    output_star = open("{0}.star".format(options.output), 'w')

    output_star.write(top_header)
    for i, heading in enumerate(headings):
        output_star.write("_{0} #{1}\n".format(heading, i + 1))

    if options.recenter:  # Compute difference vector between new and old mass centers.
        if options.wholemap is None:
            print "Reference map required for recentering."
            return 1

        new_dens = dens - sub_dens
        # Note the sign of the shift in coordinate frame is opposite the shift in the CoM.
        recenter = Vec3f(*dens.phase_cog()[:3]) - Vec3f(
            *new_dens.phase_cog()[:3])
    else:
        recenter = None

    pool = None
    if options.nproc > 1:  # Compute subtraction in parallel.
        pool = Pool(processes=options.nproc)
        results = pool.imap(
            lambda x: subtract(x,
                               dens,
                               sub_dens,
                               recenter=recenter,
                               no_frc=options.no_frc,
                               low_cutoff=options.low_cutoff,
                               high_cutoff=options.high_cutoff),
            particles(star),
            chunksize=min(npart / options.nproc, options.maxchunk))
    else:  # Use serial generator.
        results = (subtract(x,
                            dens,
                            sub_dens,
                            recenter=recenter,
                            no_frc=options.no_frc,
                            low_cutoff=options.low_cutoff,
                            high_cutoff=options.high_cutoff)
                   for x in particles(star))

    # Write subtraction results to .mrcs and .star files.
    i = 0
    nfile = 1
    starpath = None
    mrcs = None
    mrcs_orig = None
    for r in results:
        if i % options.maxpart == 0:
            mrcsuffix = options.suffix + "_%d" % nfile
            nfile += 1
            starpath = "{0}.mrcs".format(
                os.path.sep.join(
                    os.path.relpath(mrcsuffix,
                                    options.output).split(os.path.sep)[1:]))
            mrcs = "{0}.mrcs".format(mrcsuffix)
            mrcs_orig = "{0}_original.mrcs".format(mrcsuffix)
            if os.path.exists(mrcs):
                os.remove(mrcs)
            if os.path.exists(mrcs_orig):
                os.remove(mrcs_orig)

        r.ptcl_norm_sub.append_image(mrcs)

        if options.original:
            r.ptcl.append_image(mrcs_orig)

        if logger.getEffectiveLevel() == logging.DEBUG:  # Write additional debug output.
            ptcl_sub_img = r.ptcl.process("math.sub.optimal", {
                "ref": r.ctfproj,
                "actual": r.ctfproj_sub,
                "return_subim": True
            })
            ptcl_lowpass = r.ptcl.process("filter.lowpass.gauss", {
                "apix": 1.22,
                "cutoff_freq": 0.05
            })
            ptcl_sub_lowpass = r.ptcl_norm_sub.process("filter.lowpass.gauss",
                                                       {
                                                           "apix": 1.22,
                                                           "cutoff_freq": 0.05
                                                       })
            ptcl_sub_img.write_image("poreclass_subimg.mrcs", -1)
            ptcl_lowpass.write_image("poreclass_lowpass.mrcs", -1)
            ptcl_sub_lowpass.write_image("poreclass_sublowpass.mrcs", -1)
            r.ctfproj.write_image("poreclass_ctfproj.mrcs", -1)
            r.ctfproj_sub.write_image("poreclass_ctfprojsub.mrcs", -1)

        assert r.meta.i == i  # Assert particle order is preserved.
        star['rlnImageName'][i] = "{0:06d}@{1}".format(
            i % options.maxpart + 1, starpath)  # Set new image name.
        r.meta.update(star)  # Update StarFile with altered fields.
        line = '  '.join(str(star[key][i]) for key in headings)
        output_star.write("{0}\n".format(line))
        i += 1

    output_star.close()

    if pool is not None:
        pool.close()
        pool.join()

    return 0
Example #18
def pool_processor(func, iterable):
    pool = Pool(cpu_count())
    pool.map(func, iterable)   # map() blocks until every task has finished
    pool.close()
    pool.join()
Example #19
    def partial_dependence(self, feature_ids, modelinstance, filter_classes=None, grid=None,
                           grid_resolution=30, n_jobs=-1, grid_range=None, sample=True,
                           sampling_strategy='random-choice', n_samples=1000,
                           bin_count=50, samples_per_bin=10, return_metadata=False):

        """
        Approximates the partial dependence of the predict_fn with respect to the
        variables passed.

        Parameters:
        -----------
        feature_ids: list
            the names/ids of the features for which partial dependence is to be computed.
            Note that the algorithm's complexity scales exponentially with additional
            features, so generally one should only look at one or two features at a
            time. These feature ids must be available in the class's associated DataSet.
            As of now, we only support looking at 1 or 2 features at a time.
        modelinstance: skater.model.model.Model subtype
            an estimator function of a fitted model used to derive predictions. Supports
            classification (binary, multi-class) and regression.
            predictions = predict_fn(data)

            Can either be a skater.model.remote.DeployedModel or a
            skater.model.local.InMemoryModel
        filter_classes: array type
            The classes to run partial dependence on. Default None invokes all classes.
            Only used in classification models.
        grid: numpy.ndarray
            2 dimensional array on which we fix values of features. Note this is
            determined automatically if not given based on the percentiles of the
            dataset.
        grid_resolution: int
            how many unique values to include in the grid. If the percentile range
            is 5% to 95%, then that range will be cut into <grid_resolution>
            equally sized bins. Defaults to 30.
        n_jobs: int
            The number of CPUs to use to compute the PDs. -1 means 'all CPUs'.
            Defaults to using all cores(-1).
        grid_range: tuple
            the percentile extrema to consider. 2 element tuple, increasing, bounded
            between 0 and 1.
        sample: boolean
            Whether to sample from the original dataset.
        sampling_strategy: string
            If sampling, which approach to take. See DataSet.generate_sample for
            details.
        n_samples: int
            The number of samples to use from the original dataset. Note this is
            only active if sample = True and sampling strategy = 'uniform'. If
            using 'uniform-over-similarity-ranks', use samples per bin
        bin_count: int
            The number of bins to use when using the similarity based sampler. Note
            this is only active if sample = True and
            sampling_strategy = 'uniform-over-similarity-ranks'.
            total samples = bin_count * samples per bin.
        samples_per_bin: int
            The number of samples to collect for each bin within the sampler. Note
            this is only active if sample = True and
            sampling_strategy = 'uniform-over-similarity-ranks'. If using
            sampling_strategy = 'uniform', use n_samples.
            total samples = bin_count * samples per bin.
        return_metadata: boolean

        :Example:
        >>> from skater.model import InMemoryModel
        >>> from skater.core.explanations import Interpretation
        >>> from sklearn.ensemble import RandomForestClassifier
        >>> from sklearn.datasets import load_boston
        >>> boston = load_boston()
        >>> X = boston.data
        >>> y = boston.target
        >>> features = boston.feature_names

        >>> rf = RandomForestClassifier()
        >>> rf.fit(X,y)


        >>> model = InMemoryModel(rf, examples = X)
        >>> interpreter = Interpretation()
        >>> interpreter.load_data(X)
        >>> feature_ids = ['ZN','CRIM']
        >>> interpreter.partial_dependence.partial_dependence(features,model)
        """

        if self.data_set is None:
            load_data_not_called_err_msg = "self.interpreter.data_set not found. " \
                                           "Please call Interpretation.load_data " \
                                           "before running this method."
            raise(exceptions.DataSetNotLoadedError(load_data_not_called_err_msg))

        feature_ids = self._check_features(feature_ids)

        if filter_classes:
            err_msg = "members of filter classes must be" \
                      "members of modelinstance.classes." \
                      "Expected members of: " \
                      "{0}\n" \
                      "got: " \
                      "{1}".format(modelinstance.target_names,
                                   filter_classes)
            assert all([i in modelinstance.target_names for i in filter_classes]), err_msg

        # TODO: There might be a better place to do this check
        if not isinstance(modelinstance, ModelType):
            raise(exceptions.ModelError("Incorrect estimator function used for computing partial dependence, try "
                                        "creating one with skater.model.local.InMemoryModel or "
                                        "skater.model.remote.DeployedModel"))

        if modelinstance.model_type == 'classifier' and modelinstance.probability is False:

            if modelinstance.unique_values is None:
                raise(exceptions.ModelError('If using classifier without probability scores, unique_values cannot '
                                            'be None'))
            self.interpreter.logger.warn("Classifiers with probability scores can be explained "
                                         "more granularly than those without scores. If a prediction method with "
                                         "scores is available, use that instead.")

        # TODO: This we can change easily to functional style
        missing_feature_ids = []
        for feature_id in feature_ids:
            if feature_id not in self.data_set.feature_ids:
                missing_feature_ids.append(feature_id)

        if missing_feature_ids:
            missing_feature_id_err_msg = "Features {0} not found in " \
                                         "Interpretation.data_set.feature_ids" \
                                         "{1}".format(missing_feature_ids, self.data_set.feature_ids)
            raise(KeyError(missing_feature_id_err_msg))

        if grid_range is None:
            grid_range = (.05, 0.95)
        else:
            if not hasattr(grid_range, "__iter__"):
                err_msg = "Grid range {} needs to be an iterable".format(grid_range)
                raise(exceptions.MalformedGridRangeError(err_msg))

        self._check_grid_range(grid_range)

        if not modelinstance.has_metadata:
            examples = self.data_set.generate_sample(strategy='random-choice',
                                                     sample=True,
                                                     n_samples_from_dataset=10)
            examples = DataManager(examples, feature_names=self.data_set.feature_ids)
            modelinstance._build_model_metadata(examples)

        # if you don't pass a grid, build one.
        grid = np.array(grid)
        if not grid.any():
            # Currently, if a given feature has fewer unique values than the value
            # of grid resolution, then the grid will be set to those unique values.
            # Otherwise it will take the percentile
            # range according with grid_resolution bins.
            grid = self.data_set.generate_grid(feature_ids,
                                               grid_resolution=grid_resolution,
                                               grid_range=grid_range)
        else:
            # want to ensure all grids have 2 axes
            if len(grid.shape) == 1 and \
                    (StaticTypes.data_types.is_string(grid[0]) or StaticTypes.data_types.is_numeric(grid[0])):
                grid = grid[:, np.newaxis].T
                grid_resolution = grid.shape[1]

        self.interpreter.logger.debug("Grid shape used for pdp: {}".format(grid.shape))
        self.interpreter.logger.debug("Grid resolution for pdp: {}".format(grid_resolution))

        # make sure data_set module is giving us correct data structure
        self._check_grid(grid, feature_ids)

        # generate data
        data_sample = self.data_set.generate_sample(strategy=sampling_strategy,
                                                    sample=sample,
                                                    n_samples_from_dataset=n_samples,
                                                    samples_per_bin=samples_per_bin,
                                                    bin_count=bin_count)

        _pdp_metadata = self._build_metadata_dict(modelinstance, feature_ids, self.data_set.feature_ids, filter_classes)

        self.interpreter.logger.debug("Shape of sampled data: {}".format(data_sample.shape))
        self.interpreter.logger.debug("Feature Ids: {}".format(feature_ids))
        self.interpreter.logger.debug("PD metadata: {}".format(_pdp_metadata))

        # cartesian product of grid
        grid_expanded = pd.DataFrame(list(product(*grid))).values

        if grid_expanded.shape[0] <= 0:
            empty_grid_expanded_err_msg = "Must have at least 1 pdp value" \
                                          "grid shape: {}".format(grid_expanded.shape)
            raise(exceptions.MalformedGridError(empty_grid_expanded_err_msg))

        predict_fn = modelinstance._get_static_predictor()

        n_jobs = None if n_jobs < 0 else n_jobs
        pd_func = functools.partial(_compute_pd,
                                    estimator_fn=predict_fn,
                                    grid_expanded=grid_expanded,
                                    pd_metadata=_pdp_metadata,
                                    input_data=data_sample,
                                    filter_classes=filter_classes)
        arg_list = [i for i in range(grid_expanded.shape[0])]
        executor_instance = Pool(n_jobs)
        try:
            pd_list = executor_instance.map(pd_func, arg_list)
        except Exception:
            self.interpreter.logger.debug("Multiprocessing failed, going single process")
            pd_list = map(pd_func, arg_list)
        finally:
            executor_instance.close()
            executor_instance.join()
            executor_instance.terminate()
        if return_metadata:
            return pd.DataFrame(list(pd_list)), _pdp_metadata
        else:
            return pd.DataFrame(list(pd_list))
Example #20
def estimate_param_scan(estimator,
                        X,
                        param_sets,
                        evaluate=None,
                        evaluate_args=None,
                        failfast=True,
                        return_estimators=False,
                        n_jobs=1,
                        progress_reporter=None,
                        show_progress=True,
                        return_exceptions=False):
    """ Runs multiple estimations using a list of parameter settings

    Parameters
    ----------
    estimator : Estimator object or class
        An estimator object that provides an estimate(X, **params) function.
        If only a class is provided here, the Estimator objects will be
        constructed with default parameter settings, and the parameter settings
        from param_sets for each estimation. If you want to specify other
        parameter settings for those parameters not specified in param_sets,
        construct an Estimator before and pass the object.

    param_sets : iterable over dictionaries
        An iterable that provides parameter settings. Each element defines a
        parameter set, for which an estimation will be run using these
        parameters in estimate(X, **params). All other parameter settings will
        be taken from the default settings in the estimator object.

    evaluate : str or list of str, optional
        The given methods or properties will be called on the estimated
        models, and their results will be returned instead of the full models.
        This may be useful for reducing memory overhead.

    evaluate_args: iterable of iterable, optional
        Arguments to be passed to evaluated methods. Note that its size has to match the size of evaluate.

    failfast : bool
        If True, will raise an exception when estimation failed with an exception
        or when trying to call a method that doesn't exist. If False, will simply
        return None in these cases.

    return_estimators: bool
        If True, return a list of estimators in addition to the models.

    show_progress: bool
        if the given estimator supports the show_progress interface, we set the flag
        prior to doing estimations.

    return_exceptions: bool, default=False
        if failfast is False while this setting is True, returns the exception thrown at the actual grid element,
        instead of None.

    Returns
    -------
    models : list of model objects or evaluated function values
        A list of estimated models in the same order as param_sets. If evaluate
        is given, each element will contain the results from these method
        evaluations.

    estimators (optional) : list of estimator objects. These are returned only
        if return_estimators=True

    Examples
    --------

    Estimate a maximum likelihood Markov model at lag times 1, 2, 3.

    >>> from pyemma.msm.estimators import MaximumLikelihoodMSM, BayesianMSM
    >>>
    >>> dtraj = [0,0,1,2,1,0,1,0,1,2,2,0,0,0,1,1,2,1,0,0,1,2,1,0,0,0,1,1,0,1,2]  # mini-trajectory
    >>> param_sets=param_grid({'lag': [1,2,3]})
    >>>
    >>> estimate_param_scan(MaximumLikelihoodMSM, dtraj, param_sets, evaluate='timescales')
    [array([ 1.24113168,  0.77454377]), array([ 2.65266698,  1.42909842]), array([ 5.34810405,  1.14784446])]

    Now we also try to get samples of the timescales.
    >>> estimate_param_scan(MaximumLikelihoodMSM, dtraj, param_sets, failfast=False,
    ...     evaluate=['timescales', 'timescales_samples']) # doctest: +SKIP
    [[array([ 1.24113168,  0.77454377]), None], [array([ 2.48226337,  1.54908754]), None], [array([ 3.72339505,  2.32363131]), None]]

    We get Nones because the MaximumLikelihoodMSM estimator doesn't provide timescales_samples. Use for example
    a Bayesian estimator for that.

    Now we also want to get samples of the timescales using the BayesianMSM.
    >>> estimate_param_scan(BayesianMSM, dtraj, param_sets, show_progress=False,
    ...     evaluate=['timescales', 'sample_f'], evaluate_args=((), ('timescales', ))) # doctest: +SKIP
    [[array([ 1.24357685,  0.77609028]), [array([ 1.5963252 ,  0.73877883]), array([ 1.29915847,  0.49004912]), array([ 0.90058583,  0.73841786]), ... ]]

    """
    # make sure we have an estimator object
    estimator = get_estimator(estimator)
    if hasattr(estimator, 'show_progress'):
        estimator.show_progress = show_progress

    # if we want to return estimators, make clones. Otherwise just copy references.
    # For parallel processing we always need clones.
    # Also if the Estimator is its own Model, we have to clone.
    from pyemma._base.model import Model
    if (return_estimators or n_jobs > 1 or n_jobs is None
            or isinstance(estimator, Model)):
        estimators = [clone_estimator(estimator) for _ in param_sets]
    else:
        estimators = [estimator for _ in param_sets]

    # if we evaluate, make sure we have a list of functions to evaluate
    if _types.is_string(evaluate):
        evaluate = [evaluate]
    if _types.is_string(evaluate_args):
        evaluate_args = [evaluate_args]

    if evaluate is not None and evaluate_args is not None and len(
            evaluate) != len(evaluate_args):
        raise ValueError(
            "length mismatch: evaluate ({}) and evaluate_args ({})".format(
                len(evaluate), len(evaluate_args)))

    show_progress = progress_reporter is not None and show_progress
    if show_progress:
        progress_reporter._progress_register(len(estimators),
                                             stage=0,
                                             description="estimating %s" %
                                             str(estimator.__class__.__name__))

    if n_jobs > 1 and os.name == 'posix':
        if hasattr(estimators[0], 'logger'):
            estimators[0].logger.debug('estimating %s with n_jobs=%s',
                                       estimator, n_jobs)
        # iterate over parameter settings
        task_iter = ((estimator, param_set, X, evaluate, evaluate_args,
                      failfast, return_exceptions)
                     for estimator, param_set in zip(estimators, param_sets))

        from pathos.multiprocessing import Pool as Parallel
        pool = Parallel(processes=n_jobs)
        args = list(task_iter)
        if show_progress:
            from pyemma._base.model import SampledModel
            for a in args:
                if isinstance(a[0], SampledModel):
                    a[0].show_progress = False

            def callback(_):
                progress_reporter._progress_update(1, stage=0)
        else:
            callback = None

        import six
        if six.PY3:

            def error_callback(*args, **kw):
                if failfast:
                    raise Exception('something failed')

            with pool:
                res_async = [
                    pool.apply_async(_estimate_param_scan_worker,
                                     a,
                                     callback=callback,
                                     error_callback=error_callback)
                    for a in args
                ]
                res = [x.get() for x in res_async]
        else:
            try:
                res_async = [
                    pool.apply_async(_estimate_param_scan_worker,
                                     a,
                                     callback=callback) for a in args
                ]
                res = [x.get() for x in res_async]
            finally:
                pool.close()

    # if n_jobs=1 don't invoke the pool, but directly dispatch the iterator
    else:
        if hasattr(estimators[0], 'logger'):
            estimators[0].logger.debug(
                'estimating %s with n_jobs=1 because of the setting or '
                'you do not have a POSIX system', estimator)
        res = []
        if show_progress:
            from pyemma._base.model import SampledModel
            if isinstance(estimator, SampledModel):
                for e in estimators:
                    e.show_progress = False

        for estimator, param_set in zip(estimators, param_sets):
            res.append(
                _estimate_param_scan_worker(estimator, param_set, X, evaluate,
                                            evaluate_args, failfast,
                                            return_exceptions))
            if show_progress:
                progress_reporter._progress_update(1, stage=0)

    if show_progress:
        progress_reporter._progress_force_finish(0)

    # done
    if return_estimators:
        return res, estimators
    else:
        return res
Example #21
net = ScaleSpecificNetwork(fname,
                           'air',
                           date(1950, 1, 1),
                           date(2016, 1, 1),
                           None,
                           None,
                           None,
                           'monthly',
                           anom=False)
pool = Pool(20)
net.wavelet(1, 'y', pool=pool, cut=1)
net.get_continuous_phase(pool=pool)
print "wavelet done"
net.get_phase_fluctuations(rewrite=True, pool=pool)
print "fluctuations done"
pool.close()
pool.join()
net.phase_fluctuations -= np.nanmean(net.phase_fluctuations, axis=0)
net.get_adjacency_matrix(net.phase_fluctuations,
                         method="L2",
                         pool=None,
                         use_queue=True,
                         num_workers=20)
net.save_net('networks/NCEP-SATannual-phase-fluctuations-adjmatL2.bin',
             only_matrix=True)
print "L2 done"

## PHASE FLUCTUATIONS NETWORK correlation
print "Computing MI knn..."
net = ScaleSpecificNetwork(fname,
                           'air',
Example #22
        
    return i, j, phase_hilb_rc


pool = Pool(NUM_WORKERS)
net.wavelet(1, 'y', pool = pool, cut = 1)

# Hilbert on RC SSA fluctuations
# args = [ (i, j, net.data[:, i, j]) for i in range(net.lats.shape[0]) for j in range(net.lons.shape[0]) ]
# results = pool.map(_hilbert_ssa, args)
# for i, j, res in results:
#     net.phase[:, i, j] = res

net.get_continuous_phase(pool = pool)
net.get_phase_fluctuations(rewrite = True, pool = pool)
pool.close()
pool.join()

# index_correlations = {}
# index_datas = {}

# # SURROGATES
# for index, ndx_type, start_date, end_year in zip(INDICES, DATE_TYPE, START_DATES, END_YEARS):
    # load index
    # print index

    # if index != 'NINO3.4':
index_data = DataField()
raw = np.loadtxt("%sNAO.station.monthly.1865-2016.txt" % (path_to_data))
raw = raw[:, 1:]
index_data.data = raw.reshape(-1)
Example #23
def calcsapvael(vertices, faces, wantsapv=True, wantael=True):
    '''
    def calcsapvael(vertices, faces, wantsapv=True, wantael=True):

    calculate surface area per vertex <sapv> and average edge length <ael> for all
    input vertices

    Input:
        <vertices>: vertices array (nVerts, 3)
        <faces>: faces array (nFaces, 3)
        <wantsapv>: bool, want to calculate sapv, default:True
        <wantael>: bool, want to calculate ael, default:True
    Output:
        <sapv>: 1d (nVerts,) array
        <ael>: 1d (nVerts,) array

    at least <wantsapv> or <wantael> should be true.

    if both <wantsapv> and <wantael> are true, we return (sapv, ael). Otherwise only
    return <sapv> or <ael>

    # can test this speed
    from time import time
    t = time()
    sapv = rz.mri.calcsapvael(verts,faces,wantael=False)
    print(time()-t)

    History:
        20180717 RZ tested the speed; for a surface with 160000 vertices, it takes
            ~250 s (so slow...). RZ also confirmed that this function outputs correct values
            by comparing results to vertex_area.m and vertex_neighbour_distance.m
        20180714 RZ created it

    To do:
        1. maybe test the speed??
        2. use parallel computing??
    '''
    from pathos.multiprocessing import Pool
    from numpy import where, any, sqrt, abs, array, unique
    from time import time

    # check input
    assert wantsapv is True or wantael is True, 'at least you want to compute sapv or ael'

    def sapvael(i_vert):
        '''
        <i_vert> is the vertex index, start from 0
        '''
        thisvert = vertices[i_vert, :]
        # find the faces that contain this vertices

        faces_surnd_idx = where(any(faces == i_vert,
                                    axis=1))[0]  # here can use reduce??
        faces_surnd = faces[faces_surnd_idx, :]
        vertices_surnd_idx = unique(
            faces_surnd.flatten()[(faces_surnd != i_vert).flatten()])
        vertices_surnd = vertices[vertices_surnd_idx, :]

        # calculate the ael
        if wantael:
            ael_thisvert = (sqrt(\
                (vertices_surnd[:,0] - thisvert[0]) ** 2 + \
                (vertices_surnd[:,1] - thisvert[1]) ** 2 + \
                (vertices_surnd[:,2] - thisvert[2]) ** 2)).mean()

        # calculate the area of every face around this vertex with Heron's
        # formula (s is the semi-perimeter), then average them to get sapv
        if wantsapv:
            # compute three edges
            el1 = sqrt(((vertices[faces_surnd[:, 0], :] -
                         vertices[faces_surnd[:, 1], :])**2).sum(axis=1))
            el2 = sqrt(((vertices[faces_surnd[:, 1], :] -
                         vertices[faces_surnd[:, 2], :])**2).sum(axis=1))
            el3 = sqrt(((vertices[faces_surnd[:, 2], :] -
                         vertices[faces_surnd[:, 0], :])**2).sum(axis=1))
            s = (el1 + el2 + el3) / 2
            sapv_thisvert = sqrt(s * (s - el1) * (s - el2) * (s - el3)).mean()

        if wantsapv and wantael:
            return sapv_thisvert, ael_thisvert
        elif not wantsapv and wantael:
            return ael_thisvert
        elif wantsapv and not wantael:
            return sapv_thisvert

    # will switch to parallel computer if needed
    # currently we use map
    print('Compute sapv and/or ael...might take a while....\n')
    p = Pool()
    #b = p.map(sapvael, range(200), chunksize=100) # debug, testing purpose
    b = p.map(sapvael,
              range(vertices.shape[0]),
              chunksize=max(1, vertices.shape[0] // 10))  # coarse chunks cut down IPC overhead
    p.close()
    if wantsapv and wantael:
        # this solution seems slower...
        #for i,el in enumerator(b):
        #    sapv[i]=el[0]
        #    ael[i]=el[1]

        # this is memory intensive, but faster
        sapv = array([i[0] for i in b])
        ael = array([i[1] for i in b])
        return sapv, ael
    else:
        return array(b)
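A self-contained usage sketch on a toy tetrahedron mesh (4 vertices, 4 triangular faces); the `max(1, ...)` chunksize guard above is what lets such a tiny mesh through the pool:

import numpy as np

vertices = np.array([[0., 0., 0.],
                     [1., 0., 0.],
                     [0., 1., 0.],
                     [0., 0., 1.]])
faces = np.array([[0, 1, 2],
                  [0, 1, 3],
                  [0, 2, 3],
                  [1, 2, 3]])

sapv, ael = calcsapvael(vertices, faces)                 # both shaped (4,)
ael_only = calcsapvael(vertices, faces, wantsapv=False)  # just average edge lengths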