Example #1
def multiprocess(func, parameters, num_workers=None, context=None):
    """
    Run the function with the parameters in parallel using multiprocessing.

    ``context`` is one of ``{"fork", "spawn", "forkserver"}``.  For
    dask<2.16.0 the default context is "fork"; for dask>=2.16.0 the default
    is "spawn".
    """
    bag = dask.bag.from_sequence(parameters)

    config = {'scheduler': 'processes'}
    if context is not None:
        config['multiprocessing.context'] = context
    elif hasattr(parameters[0], 'multiprocessing_context'):
        config['multiprocessing.context'] = \
            parameters[0].multiprocessing_context

    with dask.config.set(config):
        if num_workers:
            results = bag.map(func).compute(num_workers=num_workers)
        elif hasattr(parameters[0], 'num_workers'):
            results = bag.map(func).compute(
                num_workers=parameters[0].num_workers)
        else:
            # num_workers defaults to the number of logical processors
            results = bag.map(func).compute()

        return results
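A minimal usage sketch for the multiprocess() helper above; square() and the input list are illustrative additions, and the snippet assumes a dask version that understands the multiprocessing.context setting (>= 2.16, per the docstring):

import dask
import dask.bag


def square(x):
    return x * x


if __name__ == "__main__":
    # The guard matters: with the "spawn" context the worker processes
    # re-import this module, so an unguarded top-level call would recurse.
    squares = multiprocess(square, [1, 2, 3, 4], num_workers=2, context="spawn")
    assert squares == [1, 4, 9, 16]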
Example #2
def map_wrapper(function_item,list_items,other_args=None):
    from dask.distributed import Client
    import dask.bag as db
    c = Client()
    NCORES = len(c.ncores().values())-2
    b0 = db.from_sequence(list_items, npartitions=NCORES)
    if other_args is not None:
        list_items = list(db.map(function_item,b0,other_args).compute())
    else:
        list_items = list(db.map(function_item,b0).compute())
    return list_items
Example #3
def multiprocess(func, parameters, num_workers=None):
    """Run the function with the parameters in parallel using multiprocessing."""
    bag = dask.bag.from_sequence(parameters)

    with dask.set_options(get=dask.multiprocessing.get):
        if num_workers:
            results = bag.map(func).compute(num_workers=num_workers)
        elif hasattr(parameters[0], 'num_workers'):
            results = bag.map(func).compute(num_workers=parameters[0].num_workers)
        else:
            # num_workers defaults to the number of logical processors
            results = bag.map(func).compute()

        return results
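dask.set_options(get=dask.multiprocessing.get) is the older dask API for selecting a scheduler; on current releases the equivalent selection looks roughly like the sketch below (an illustrative variant, not the original project's code):

import dask
import dask.bag


def multiprocess_modern(func, parameters, num_workers=None):
    """Same behaviour as the snippet above, written against the config-based API."""
    bag = dask.bag.from_sequence(parameters)
    # dask.config.set(scheduler=...) replaced dask.set_options(get=...)
    with dask.config.set(scheduler="processes"):
        if num_workers:
            return bag.map(func).compute(num_workers=num_workers)
        return bag.map(func).compute()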
Example #4
def create_index_dependend_grouped_residual(scheme, parameter, problem_bag,
                                            constraint_labels_and_matrices,
                                            residual_function):
    def penalty_function(problem, labels_and_matrices):

        clp, residual = residual_function(labels_and_matrices.matrix,
                                          problem.data)

        penalty = residual
        if callable(scheme.model.has_additional_penalty_function):
            if scheme.model.has_additional_penalty_function():
                additional_penalty = scheme.model.additional_penalty_function(
                    parameter, labels_and_matrices.clp_label, clp,
                    problem.index)
                penalty = np.concatenate([penalty, additional_penalty])
        return clp, residual, penalty
    penalty_bag = \
        db.map(penalty_function, problem_bag, constraint_labels_and_matrices)

    reduced_clp_labels = constraint_labels_and_matrices.pluck(0)
    reduced_clps = penalty_bag.pluck(0)
    residuals = penalty_bag.pluck(1)
    penalty = dask.delayed(np.concatenate)(penalty_bag.pluck(2))

    return reduced_clp_labels, reduced_clps, residuals, penalty
Example #5
def test_01a_compute_score(dtcpop):
    from neuronunit.optimization import get_neab
    from neuronunit.optimization.optimization_management import dtc_to_rheo
    from neuronunit.optimization.optimization_management import nunit_evaluation
    from neuronunit.optimization.optimization_management import format_test
    #dtcpop = grid_points()
    dtclist = list(map(dtc_to_rheo, dtcpop))
    for d in dtclist:
        assert len(list(d.attrs.values())) > 0
    import dask.bag as db
    b0 = db.from_sequence(dtclist, npartitions=8)
    dtclist = list(db.map(format_test, b0).compute())

    b0 = db.from_sequence(dtclist, npartitions=8)
    dtclist = list(db.map(nunit_evaluation, b0).compute())
    return dtclist
Example #6
def main():  # noqa: D103

    bgen_dir = f'{ukb}/array_imputed'
    output_dir = f'{bgen_dir}/output'

    assert os.path.exists(bgen_dir)
    assert os.path.exists(output_dir)

    # why not retry? nothing else I can do
    dask.config.set({'distributed.scheduler.allowed-failures': 99})

    # Maximum of 10 concurrent downloads per application
    # See here: https://biobank.ctsu.ox.ac.uk/showcase/refer.cgi?id=644
    client = dask.distributed.Client(
        n_workers=10,
        local_directory="/oasis/tscc/scratch/jmargoli"
    )

    jobs = []
    # queue one download job per chromosome (1-22)
    for chrom in range(1,23):
        jobs.append((ukb, chrom, bgen_dir))

    print(f"Number of jobs queued: {len(jobs)}", flush = True)

    bag = dask.bag.from_sequence(jobs)
    downloads = bag.map(download_item)

    client.compute(downloads, retries=99).result()  # wait for the result so the script does not exit before the downloads finish
Example #7
def test_01a_compute_score(dtcpop):
    from neuronunit.optimization import get_neab
    from neuronunit.optimization.optimization_management import dtc_to_rheo
    from neuronunit.optimization.optimization_management import nunit_evaluation
    from neuronunit.optimization.optimization_management import format_test
    #dtcpop = grid_points()
    dtclist = list(map(dtc_to_rheo,dtcpop))
    for d in dtclist:
        assert len(list(d.attrs.values())) > 0
    import dask.bag as db
    b0 = db.from_sequence(dtclist, npartitions=8)
    dtclist = list(db.map(format_test,b0).compute())

    b0 = db.from_sequence(dtclist, npartitions=8)
    dtclist = list(db.map(nunit_evaluation,b0).compute())
    return dtclist
Example #8
        def find_rheobase(self, dtc):

            assert os.path.isfile(
                dtc.model_path), "%s is not a file" % dtc.model_path
            # If this is not the first pass / first generation,
            # then assume the rheobase value found before mutation still holds until proven otherwise.
            # dtc = check_current(model.rheobase,dtc)
            # If it's not true, enter a search with ranges informed by memory.
            cnt = 0
            while dtc.boolean == False:

                #dtc.current_steps = list(filter(lambda cs: cs !=0.0 , dtc.current_steps))
                dtc_clones = [
                    copy.copy(dtc) for i in range(0, len(dtc.current_steps))
                ]
                for i, s in enumerate(dtc.current_steps):
                    dtc_clones[i].ampl = None
                    dtc_clones[i].ampl = dtc.current_steps[i]
                b0 = db.from_sequence(dtc_clones, npartitions=8)
                #dtc_clone = list(map(check_current,dtc_clones))
                dtc_clone = list(db.map(check_current, b0).compute())

                for d in dtc_clone:
                    dtc.lookup.update(d.lookup)
                #print(dtc.lookup)
                dtc = check_fix_range(dtc)
                cnt += 1
                print(cnt, 'cnt value')
                #print(type(dtc.current_steps))
                #print(dtc.current_steps,'this stays small')
            return dtc
Example #9
def create_index_independend_grouped_residual(scheme, parameter, problem_bag,
                                              constraint_labels_and_matrices,
                                              residual_function):

    matrix_labels = problem_bag.pluck(1)\
        .map(lambda group: "".join(problem.dataset for problem in group))

    def penalty_function(matrix_label, problem, labels_and_matrices):

        clp, residual = residual_function(
            labels_and_matrices[matrix_label].matrix, problem.data)

        penalty = residual
        if callable(scheme.model.has_additional_penalty_function):
            if scheme.model.has_additional_penalty_function():
                additional_penalty = scheme.model.additional_penalty_function(
                    parameter, labels_and_matrices[matrix_label].clp_label,
                    clp, problem.index)
                penalty = np.concatenate([penalty, additional_penalty])
        return clp, residual, penalty
    penalty_bag = \
        db.map(penalty_function, matrix_labels, problem_bag, constraint_labels_and_matrices)

    reduced_clp_label = {
        label: constraint_labels_and_matrices[label].clp_label
        for label in constraint_labels_and_matrices
    }
    reduced_clps = penalty_bag.pluck(0)
    residuals = penalty_bag.pluck(1)
    penalty = dask.delayed(np.concatenate)(penalty_bag.pluck(2))

    return reduced_clp_label, reduced_clps, residuals, penalty
Example #10
    def test_grid_dimensions(self):
        from neuronunit.optimization.model_parameters import model_params
        provided_keys = list(model_params.keys())
        USE_CACHED_GS = False
        from neuronunit.optimization import exhaustive_search
        from neuronunit.optimization.optimization_management import map_wrapper
        import dask.bag as db
        npoints = 2
        nparams = 3
        for i in range(1,10):
            for j in range(1,10):
                grid_points = exhaustive_search.create_grid(npoints = i, nparams = j)
                b0 = db.from_sequence(grid_points[0:2], npartitions=8)
                dtcpop = list(db.map(exhaustive_search.update_dtc_grid,b0).compute())
                self.assertEqual(i*j,len(dtcpop))
                self.assertNotEqual(dtcpop,None)
                dtcpop_compare = map_wrapper(exhaustive_search.update_dtc_grid,grid_points[0:2])
                self.assertNotEqual(dtcpop_compare,None)
                self.assertEqual(len(dtcpop_compare),len(dtcpop))
                for i,j in enumerate(dtcpop):
                    for k,v in dtcpop_compare[i].attrs.items():
                        print(k,v,i,j)
                        self.assertEqual(j.attrs[k],v)

        return True
Example #11
    def test_grid_dimensions(self):
        from neuronunit.optimization.model_parameters import model_params
        provided_keys = list(model_params.keys())
        USE_CACHED_GS = False
        from neuronunit.optimization import exhaustive_search
        from neuronunit.optimization.optimization_management import map_wrapper
        import dask.bag as db
        npoints = 2
        nparams = 3
        for i in range(1, 10):
            for j in range(1, 10):
                grid_points = exhaustive_search.create_grid(npoints=i,
                                                            nparams=j)
                b0 = db.from_sequence(grid_points[0:2], npartitions=8)
                dtcpop = list(
                    db.map(exhaustive_search.update_dtc_grid, b0).compute())
                self.assertEqual(i * j, len(dtcpop))
                self.assertNotEqual(dtcpop, None)
                dtcpop_compare = map_wrapper(exhaustive_search.update_dtc_grid,
                                             grid_points[0:2])
                self.assertNotEqual(dtcpop_compare, None)
                self.assertEqual(len(dtcpop_compare), len(dtcpop))
                for i, j in enumerate(dtcpop):
                    for k, v in dtcpop_compare[i].attrs.items():
                        print(k, v, i, j)
                        self.assertEqual(j.attrs[k], v)

        return True
Example #12
def map_wrapper(function_item, list_items):
    from dask.distributed import Client
    import dask.bag as db
    c = Client()
    NCORES = len(c.ncores().values())
    b0 = db.from_sequence(list_items, npartitions=NCORES)
    list_items = list(db.map(function_item, b0).compute())
    return list_items
Example #13
 def calculate_descriptors(self,
                           molecules: List[Molecule]) -> List[Molecule]:
     molecules = bag.map(self.descriptor,
                         bag.from_sequence(molecules)).compute()
     molecules = [
         molecule for molecule in molecules
         if all(1.0 > property > 0.0 for property in molecule.descriptor)
     ]
     return molecules
Example #14
 def cas(self):
     grid = db.from_sequence(self.files, npartitions=8)
     urlDats = list(db.map(convert_and_score, grid).compute())
     urlDats = list(filter(lambda url: len(list(url)) > 3, urlDats))
     urlDats = list(filter(lambda url: len(list(url.keys())) > 3, urlDats))
     urlDats = list(
         filter(lambda url: str('penalty') in url.keys(), urlDats))
     if type(self.urlDats) is not type(None):
         urlDats.extend(self.urlDats)
     return urlDats
Example #15
File: delay.py  Project: sssangha/RAiDER
def _get_rays_d(lengths, stepSize, start_positions, scaled_look_vecs, Nproc=2):
    import dask.bag as db
    L = db.from_sequence(lengths)
    S = db.from_sequence(start_positions)
    Sv = db.from_sequence(scaled_look_vecs)
    Ss = db.from_sequence([stepSize] * len(lengths))

    # zip the bags so each task receives one (length, start_position, look_vec, step_size) tuple
    data = db.zip(L, S, Sv, Ss)

    positions_l = db.map(helper, data)
    return positions_l.compute()
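db.zip pairs the equally partitioned bags element-wise, so helper receives one (length, start_position, look_vec, step_size) tuple per ray. A toy illustration of the zip-then-map pattern (toy_helper and the values are made up):

import dask.bag as db


def toy_helper(item):
    length, start = item  # each element of the zipped bag is one tuple
    return start, length


if __name__ == "__main__":
    lengths = db.from_sequence([10.0, 20.0])
    starts = db.from_sequence([(0.0, 0.0), (1.0, 1.0)])
    print(db.map(toy_helper, db.zip(lengths, starts)).compute())
    # -> [((0.0, 0.0), 10.0), ((1.0, 1.0), 20.0)]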
Example #16
def run_grid(npoints,nparams,provided_keys=None):
    # not all models will produce scores, since models with rheobase <0 are filtered out.
    from neuronunit.optimization.optimization_management import nunit_evaluation
    from neuronunit.optimization.optimization_management import update_dtc_pop
    from neuronunit.optimization.optimization_management import dtc_to_rheo

    grid_points = create_grid(npoints=npoints, nparams=nparams, provided_keys=provided_keys)
    import dask.bag as db
    b = db.from_sequence(grid_points)
    dtcpop = list(db.map(update_dtc_pop, b).compute())
    print(dtcpop)
    # The rheobase search has to be mapped serially for now, since a parallel map is already embedded in its functionality;
    # this can probably be bypassed in the future by using zeromq's Client (or by using ipyparallel's core module/code base more directly).
    dtcpop = list(map(dtc_to_rheo, dtcpop))
    print(dtcpop)

    filtered_dtcpop = list(filter(lambda dtc: dtc.rheobase['value'] > 0.0, dtcpop))
    b = db.from_sequence(filtered_dtcpop)
    dtcpop = list(db.map(nunit_evaluation, b).compute())
    dtcpop = list(filter(lambda dtc: type(dtc.scores['RheobaseTestP']) is not type(None), dtcpop))

    return dtcpop
Example #17
def test_01a_compute_score(dtcpop, tests):
    from neuronunit.optimization import get_neab
    from neuronunit.optimization.optimization_management import dtc_to_rheo
    from neuronunit.optimization.optimization_management import nunit_evaluation
    from neuronunit.optimization.optimization_management import format_test
    from itertools import repeat
    #dtcpop = grid_points()
    rheobase_test = tests[0][0][0]

    xargs = list(zip(dtcpop, repeat(rheobase_test), repeat('NEURON')))
    dtclist = list(map(dtc_to_rheo, xargs))

    #dtclist = list(map(dtc_to_rheo,dtcpop))
    for d in dtclist:
        assert len(list(d.attrs.values())) > 0
    import dask.bag as db
    b0 = db.from_sequence(dtclist, npartitions=8)
    dtclist = list(db.map(format_test, b0).compute())

    b0 = db.from_sequence(dtclist, npartitions=8)
    dtclist = list(db.map(nunit_evaluation, b0).compute())
    return dtclist
Example #18
    def dask_map(self, f, x: list) -> list:
        """A mapping function for Dask.  Used for multithreading.

        Args:
            f: Any function.
            x (list): A list of inputs to be sequentially passed
                to that function.

        Returns:
            list: A list of outputs from that function.
        """
        x = db.from_sequence(x, npartitions=self.npartitions)
        return db.map(f, x).compute()
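A usage sketch for dask_map; ParallelMapper below is an illustrative stand-in that restates the method so the example is self-contained, since the method only needs an npartitions attribute on its host object:

import dask.bag as db


class ParallelMapper:
    """Illustrative host class: dask_map only reads self.npartitions."""

    def __init__(self, npartitions=4):
        self.npartitions = npartitions

    def dask_map(self, f, x: list) -> list:
        bag = db.from_sequence(x, npartitions=self.npartitions)
        return db.map(f, bag).compute()


if __name__ == "__main__":
    print(ParallelMapper().dask_map(lambda v: v + 1, [1, 2, 3]))  # -> [2, 3, 4]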
Example #19
def grid_points():
    npoints = 2
    nparams = 10
    from neuronunit.optimization.model_parameters import model_params
    provided_keys = list(model_params.keys())
    USE_CACHED_GS = False
    from neuronunit.optimization import exhaustive_search
    grid_points = exhaustive_search.create_grid(npoints = npoints,nparams = nparams)
    import dask.bag as db
    b0 = db.from_sequence(grid_points[0:2], npartitions=8)
    dtcpop = list(db.map(exhaustive_search.update_dtc_grid,b0).compute())
    assert dtcpop is not None
    return dtcpop
Example #20
def grid_points():
    npoints = 2
    nparams = 10
    from neuronunit.optimization.model_parameters import model_params
    provided_keys = list(model_params.keys())
    USE_CACHED_GS = False
    from neuronunit.optimization import exhaustive_search
    grid_points = exhaustive_search.create_grid(npoints=npoints,
                                                nparams=nparams)
    import dask.bag as db
    b0 = db.from_sequence(grid_points[0:2], npartitions=8)
    dtcpop = list(db.map(exhaustive_search.update_dtc_grid, b0).compute())
    assert dtcpop is not None
    return dtcpop
Example #21
def get_bmarks():

    xkcd_self_sufficient = str('http://splasho.com/upgoer5/library.php')
    high_standard = str(
        'https://elifesciences.org/download/aHR0cHM6Ly9jZG4uZWxpZmVzY2llbmNlcy5vcmcvYXJ0aWNsZXMvMjc3MjUvZWxpZmUtMjc3MjUtdjIucGRm/elife-27725-v2.pdf?_hash=WA%2Fey48HnQ4FpVd6bc0xCTZPXjE5ralhFP2TaMBMp1c%3D'
    )
    the_science_of_writing = str(
        'https://cseweb.ucsd.edu/~swanson/papers/science-of-writing.pdf')
    pmeg = str(
        'http://www.elsewhere.org/pomo/'
    )  # Note this is so obfuscated, even the english language classifier rejects it.
    links = [xkcd_self_sufficient, high_standard, the_science_of_writing, pmeg]
    royal = '../BenchmarkCorpus/royal.txt'
    klpd = '../BenchmarkCorpus/planning_document.txt'
    klpdf = open(klpd)
    strText = klpdf.read()
    urlDat = {'link': 'local_resource'}

    klpdfp = text_proc(strText, urlDat, WORD_LIM=100)
    grid = db.from_sequence(links, npartitions=8)
    urlDats = list(db.map(process, grid).compute())
    urlDats.append(klpdfp)
    print(urlDats)

    klpdr = open(royal)
    strText = klpdr.read()
    urlDat = {'link': 'local_resource_royal'}

    klpdfr = text_proc(strText, urlDat, WORD_LIM=100)
    print(klpdfr)
    grid = db.from_sequence(links, npartitions=8)
    urlDats = list(db.map(process, grid).compute())
    urlDats.append(klpdfp)
    urlDats.append(klpdfr)

    with open('benchmarks.p', 'wb') as f:
        pickle.dump(urlDats, f)
    return urlDats
Example #22
def pmap(function, inputs, multiple=False, predicate=None):
    # type: (Callable[[Any], Any], Iterable[Iterable[Any]], bool, Callable[[Any], Any]) -> Iterable[Any]
    """
    Do a parallel map of the given :code:`function` on the given :code:`inputs` and optionally
    filter its results with :code:`predicate`. This is a simple wrapper for `dask`_ and works
    like :func:`map` but in parallel.

    .. code-block:: python

      fun = lambda d: SHA1.new(d).hexdigest()
      inputs = [b'1234', b'5678', b'9101', b'1121']
      assert (pmap(fun, inputs) ==
          ['7110eda4d09e062aa5e4a390b0a572ac0d2c0220',
           '2abd55e001c524cb2cf6300a89ca6366848a77d5',
           'f5a6fe40024c28967a354e591bb9fa21b784bf00',
           '784e9240155834852dff458a730cceb50229df32'])

      predicate = lambda d: d.endswith('0')
      assert (pmap(fun, inputs, predicate=predicate) ==
          ['7110eda4d09e062aa5e4a390b0a572ac0d2c0220',
           'f5a6fe40024c28967a354e591bb9fa21b784bf00'])

    .. _`dask`: https://docs.dask.org/en/latest/

    :param function:    An arbitrary function that's mapped to the :code:`inputs`.
    :param inputs:      Inputs for :code:`function`. Pass multiple if your function
                        takes multiple inputs.
    :param multiple:    Specifies whether :code:`inputs` contains multiple inputs or not.
                        If you want to e.g. pass two lists :code:`xs` and :code:`ys` to
                        :code:`function = lambda x,y: x + y`, you can pass
                        :code:`inputs = [xs, ys]` and `multiple=True` to interpret
                        :code:`inputs` as inputs for multiple arguments.
    :param predicate:   An optional filter function to filter the results of the
                        computation. If none is passed, all results will be returned.
    :returns:           The results of applying :code:`function` to :code:`inputs`.

    .. CAUTION::
       :code:`predicate` needs to be passed as a **keyword argument**, otherwise it will
       be treated as an input parameter to :code:`function`!
    """
    if not multiple:
        inputs = [inputs]
    promises = parallel.map(function,
                            *[parallel.from_sequence(i) for i in inputs])
    if not predicate:
        return list(promises)
    else:
        return list(promises.filter(predicate))
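A sketch of the multiple=True mode described in the docstring, assuming parallel is dask.bag (as the body suggests) and that pmap is in scope; the lambdas and inputs are illustrative:

xs = [1, 2, 3]
ys = [10, 20, 30]

if __name__ == "__main__":
    # two argument lists, zipped element-wise into a two-argument function
    sums = pmap(lambda x, y: x + y, [xs, ys], multiple=True)
    assert sums == [11, 22, 33]

    # predicate is passed as a keyword argument, per the CAUTION note above
    doubled_evens = pmap(lambda x: x * 2, xs, predicate=lambda v: v % 2 == 0)
    assert doubled_evens == [2, 4, 6]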
Example #23
    def cas(self):
        # Do in parallel as it is 2018

        pgrid = db.from_sequence(self.files, npartitions=8)
        urlDats = list(db.map(self.convert_and_score, pgrid).compute())
        # just kidding: a serial map is often still needed for debugging, regardless of the parallel speed-up.
        # urlDats = list(map(self.convert_and_score,self.files))
        urlDats = [url for url in urlDats if type(url) is not type(None)]
        # urlDats = list(filter(lambda url: type(url) != None, urlDats))
        urlDats = list(filter(lambda url: len(list(url)) > 3, urlDats))

        urlDats = list(filter(lambda url: len(list(url.keys())) > 3, urlDats))
        # urlDats = list(filter(lambda url: str('penalty') in url.keys(), urlDats))
        if type(self.urlDats) is not type(None):
            urlDats.extend(self.urlDats)
        return urlDats
Example #24
def predict(atm,
            series,
            dev_length,
            val_length,
            HP,
            week,
            mon,
            horizon,
            cpoint=0.05):

    #Predict for development dataset
    fitted_model, model1 = train(series[0:dev_length],
                                 series[0:dev_length],
                                 HP,
                                 week,
                                 mon,
                                 cpoint=cpoint)
    model1['pred_day'] = 'day 0'
    model1['tid'] = atm
    rmse1 = model1.groupby([model1.ds.dt.month,
                            'pred_day']).se.mean().agg(np.sqrt).reset_index()

    dbseries = db.from_sequence(
        [series[i:i + dev_length] for i in range(val_length)])
    dbforecast = db.from_sequence([
        series[i + dev_length:i + dev_length + horizon][['ds', 'y']]
        for i in range(val_length)
    ])

    dbmaster = db.map(train,
                      dbseries,
                      dbforecast,
                      HP,
                      week,
                      mon,
                      cpoint=cpoint)

    modelf = dbmaster.compute()
    model = pd.concat([modelf[i][1] for i in range(len(modelf))])
    model['tid'] = atm
    rmse = model.groupby([model.ds.dt.month,
                          'pred_day']).se.mean().agg(np.sqrt).reset_index()
    print(f'\nATM:{atm}  RMSE:{rmse.se.mean()}')
    return modelf[val_length - 1][0], pd.concat(
        [model1, model]), pd.concat([rmse1, rmse])
Example #25
 def test_map_wrapper(self):
     npoints = 2
     nparams = 3
     from neuronunit.optimization.model_parameters import model_params
     provided_keys = list(model_params.keys())
     USE_CACHED_GS = False
     from neuronunit.optimization import exhaustive_search
     from neuronunit.optimization.optimization_management import map_wrapper
     grid_points = exhaustive_search.create_grid(npoints = npoints,nparams = nparams)
     b0 = db.from_sequence(grid_points[0:2], npartitions=8)
     dtcpop = list(db.map(exhaustive_search.update_dtc_grid,b0).compute())
     assert dtcpop is not None
     dtcpop_compare = map_wrapper(exhaustive_search.update_dtc_grid,grid_points[0:2])
     for i,j in enumerate(dtcpop):
         for k,v in dtcpop_compare[i].attrs.items():
             print(k,v,i,j)
             self.assertEqual(j.attrs[k],v)
     return True
Example #26
def update_dtc_pop(pop, td=None, backend=None):
    '''
    Inputs: a population of genes/alleles, the population size MU, and an optional rheobase value guess.
    Outputs: a population of genes/alleles and a population of individual object shells, i.e. pickleable containers for gene attributes.
    Rationale: not every gene value will result in a model for which a rheobase is found, in which case that gene is discarded. To
    compensate for losses in gene population size, more gene samples must be tested for a successful return from a rheobase search.
    If the tests are successful, these newly sampled individuals are appended to the population, and their attributes are mapped onto
    corresponding virtual model objects.
    '''

    import copy
    import numpy as np
    import dask.bag as db
    from deap import base
    toolbox = base.Toolbox()
    pop = [toolbox.clone(i) for i in pop]

    def transform(ind):
        import dask.bag as db
        from neuronunit.optimization.data_transport_container import DataTC
        dtc = DataTC()
        import neuronunit
        LEMS_MODEL_PATH = str(
            neuronunit.__path__[0]) + str('/models/NeuroML2/LEMS_2007One.xml')
        if backend is not None:
            dtc.backend = backend
        else:
            dtc.backend = 'NEURON'

        dtc.attrs = {}
        for i, j in enumerate(ind):
            dtc.attrs[str(td[i])] = j
        dtc.evaluated = False
        return dtc

    if len(pop) > 1:
        b = db.from_sequence(pop, npartitions=8)
        dtcpop = list(db.map(transform, b).compute())

    else:
        # In this case pop is not really a population but an individual
        # but parsimony of naming variables
        # suggests not to change the variable name to reflect this.
        dtcpop = [transform(pop)]
    return dtcpop
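An illustrative call of update_dtc_pop (assumes neuronunit and deap are installed; the gene values and the attribute names in td are placeholders, not a real parameter set):

genes = [[0.02, 0.2, -65.0], [0.03, 0.25, -60.0]]

if __name__ == "__main__":
    dtcpop = update_dtc_pop(genes, td=['a', 'b', 'c'], backend='NEURON')
    for dtc in dtcpop:
        print(dtc.backend, dtc.attrs)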
Example #27
File: fi.py  Project: vrhaynes/neuronunit
 def find_rheobase(self, dtc):
     import dask.bag as db
     assert os.path.isfile(dtc.model_path), "%s is not a file" % dtc.model_path
     # If this is not the first pass / first generation,
     # then assume the rheobase value found before mutation still holds until proven otherwise.
     # dtc = check_current(model.rheobase,dtc)
     # If it's not true, enter a search with ranges informed by memory.
     cnt = 0
     while dtc.boolean == False:
         dtc_clones = [ dtc for s in dtc.current_steps ]
         b0 = db.from_sequence(dtc.current_steps, npartitions=8)
         b1 = db.from_sequence(dtc_clones, npartitions=8)
         dtcpop = list(db.map(check_current,b0,b1).compute())
         for dtc_clone in dtcpop:
             dtc.lookup.update(dtc_clone.lookup)
         dtc = check_fix_range(dtc)
         cnt += 1
     return dtc
Example #28
def grid_points():
    npoints = 2
    nparams = 10
    from neuronunit.optimization.model_parameters import model_params
    provided_keys = list(model_params.keys())
    USE_CACHED_GS = False
    electro_path = 'pipe_tests.p'
    import pickle
    assert os.path.isfile(electro_path) == True
    with open(electro_path, 'rb') as f:
        electro_tests = pickle.load(f)
    from neuronunit.optimization import exhaustive_search
    grid_points = exhaustive_search.create_grid(npoints=npoints,
                                                nparams=nparams)
    import dask.bag as db
    b0 = db.from_sequence(grid_points[0:2], npartitions=8)
    dtcpop = list(db.map(exhaustive_search.update_dtc_grid, b0).compute())
    assert dtcpop is not None
    return dtcpop
Example #29
def main():  # noqa: D103

    tscc_vcf_dir = f'{ukb}/../../resources/datasets/ukbiobank/exome/fe_crams'
    vcf_dir = tscc_vcf_dir
    bulk_floc = f'{ukb}/exome/fe_cram.bulk'

    assert os.path.exists(vcf_dir)
    assert os.path.exists(bulk_floc)

    current_files = set(os.listdir(tscc_vcf_dir))

    # why not retry? nothing else I can do
    dask.config.set({'distributed.scheduler.allowed-failures': 99})

    # Maximum of 10 concurrent downloads per application
    # See here: https://biobank.ctsu.ox.ac.uk/showcase/refer.cgi?id=644
    client = dask.distributed.Client(
        n_workers=10, local_directory="/oasis/tscc/scratch/jmargoli")

    jobs = []
    # queue one download job per bulk-file entry that is not already on disk
    with open(bulk_floc) as bulk_file:
        for line in bulk_file:
            sample_ID, field_ID = line.split()
            if field_ID == '23163_0_0':
                suffix = 'cram'
            elif field_ID == '23164_0_0':
                suffix = 'cram.crai'
            file_name = f"{sample_ID}_{field_ID}.{suffix}"
            if file_name in current_files:
                continue
            jobs.append((ukb, sample_ID, field_ID, vcf_dir))

    print(f"Number of jobs queued: {len(jobs)}", flush=True)

    bag = dask.bag.from_sequence(jobs)
    downloads = bag.map(download_item)

    client.compute(downloads, retries=99).result()  # wait for the result so the script does not exit before the downloads finish
Example #30
    def kull(pop):
        dtcpop = list(update_dtc_pop(pop, td))
        dtcpop = list(map(dtc_to_rheo, dtcpop))
        dtcpop = list(filter(lambda dtc: dtc.rheobase['value'] > 0.0, dtcpop))
        while len(dtcpop) < len(pop):
            dtcpop.append(dtcpop[0])
        dtcpop = list(map(format_test, dtcpop))
        b = db.from_sequence(dtcpop, npartitions=8)
        dtcpop = list(db.map(nunit_evaluation, b, error_criterion).compute())
        dtcpop = list(
            filter(
                lambda dtc: not isinstance(dtc.scores['RheobaseTestP'],
                                           type(None)), dtcpop))
        dtcpop = list(
            filter(lambda dtc: not type(None) in (list(dtc.scores.values())),
                   dtcpop))
        dtcpop = list(
            filter(
                lambda dtc: not any(numpy.isinf(x)
                                    for x in list(dtc.scores.values())), dtcpop))

        return dtcpop
Example #31
    def apply_for_each_run_dir(self, action, client, status=Status.ENCODED):
        """
        For each run in this Campaign's run list, apply the specified action
        (an object of type Action)

        Parameters
        ----------
        action : the action to be applied to each run directory
            The function to be applied to each run directory. func() will
            be called with the run directory path as its only argument.
        client : a Dask client associated with a cluster you want to
            run your jobs on.

        Returns
        -------
        """
        run_dirs = []
        for run_id, run_data in self.campaign_db.runs(
                status=status, app_id=self._active_app['id']):
            run_dirs.append(run_data['run_dir'])
        bag = dask.bag.from_sequence(run_dirs)
        future = client.compute(bag.map(action.act_on_dir))
        future.result()
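A hedged usage sketch for apply_for_each_run_dir; PrintAction is a placeholder action object, and the campaign instance is assumed to already exist (its construction is project-specific and omitted here):

import dask.distributed


class PrintAction:
    """Placeholder action: act_on_dir(path) is the only attribute the method above uses."""

    def act_on_dir(self, run_dir):
        print("processing", run_dir)


if __name__ == "__main__":
    client = dask.distributed.Client(n_workers=2)
    # campaign stands for an already-populated instance of the surrounding
    # Campaign class; constructing one is omitted here.
    campaign.apply_for_each_run_dir(PrintAction(), client)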
Example #32
def test_bag_map():
    b = db.from_sequence(range(100), npartitions=10)
    b2 = db.from_sequence(range(100, 200), npartitions=10)
    x = b.compute()
    x2 = b2.compute()

    def myadd(a=1, b=2, c=3):
        return a + b + c

    assert db.map(myadd, b).compute() == list(map(myadd, x))
    assert db.map(myadd, a=b).compute() == list(map(myadd, x))
    assert db.map(myadd, b, b2).compute() == list(map(myadd, x, x2))
    assert db.map(myadd, b, 10).compute() == [myadd(i, 10) for i in x]
    assert db.map(myadd, 10, b=b).compute() == [myadd(10, b=i) for i in x]

    sol = [myadd(i, b=j, c=100) for (i, j) in zip(x, x2)]
    assert db.map(myadd, b, b=b2, c=100).compute() == sol

    sol = [myadd(i, c=100) for (i, j) in zip(x, x2)]
    assert db.map(myadd, b, c=100).compute() == sol

    x_sum = sum(x)
    sol = [myadd(x_sum, b=i, c=100) for i in x2]
    assert db.map(myadd, b.sum(), b=b2, c=100).compute() == sol

    sol = [myadd(i, b=x_sum, c=100) for i in x2]
    assert db.map(myadd, b2, b.sum(), c=100).compute() == sol

    sol = [myadd(a=100, b=x_sum, c=i) for i in x2]
    assert db.map(myadd, a=100, b=b.sum(), c=b2).compute() == sol

    a = dask.delayed(10)
    assert db.map(myadd, b, a).compute() == [myadd(i, 10) for i in x]
    assert db.map(myadd, b, b=a).compute() == [myadd(i, b=10) for i in x]

    # Mismatched npartitions
    fewer_parts = db.from_sequence(range(100), npartitions=5)
    with pytest.raises(ValueError):
        db.map(myadd, b, fewer_parts)

    # No bags
    with pytest.raises(ValueError):
        db.map(myadd, b.sum(), 1, 2)

    # Unequal partitioning
    unequal = db.from_sequence(range(110), npartitions=10)
    with pytest.raises(ValueError):
        db.map(myadd, b, unequal, c=b2).compute()
    with pytest.raises(ValueError):
        db.map(myadd, b, b=unequal, c=b2).compute()
Example #33
def build_index(use_bag=False):
    """
    An experiment is a collection of outputNNN directories.  Each directory
    represents the output of a single job submission script. These directories
    are created by the *payu* tool.

    This function creates and/or updates an index cache of variables names
    found in all NetCDF4 files.

    We can also examine the .nc files directly to infer their contents.
    for each .nc file, get variables -> dimensions

    .ncfile, varname, dimensions, chunksize

    """

    # Build index of all NetCDF files found in directories to search.

    ncfiles = []
    runs_available = []

    print('Finding runs on disk...', end='')
    for directoryToSearch in directoriesToSearch:
        #print('Searching {}'.format(directoryToSearch))

        # find all subdirectories
        results = subprocess.check_output(['find', directoryToSearch, '-maxdepth', '3', '-type', 'd',
            '-name', 'output???'])

        results = [s for s in results.decode('utf-8').split()]
        runs_available.extend(results)
    print('found {} run directories'.format( len(runs_available)))

    #ncfiles.extend(results)
#
#    results = subprocess.check_output(['find', directoryToSearch, '-name', '*.nc'])
#
#    print('Found {} .nc files'.format(len(ncfiles)))

    # We can persist this index by storing it in a sqlite database placed in a centrally available location.

    # The use of the `dataset` module hides the details of working with SQL directly.

    # In this database is a single table listing all variables in NetCDF4 seen previously.
    print('Using database {}'.format(database_url))
    print('Querying database...', end='')

    db = dataset.connect(database_url)

    # find list of all run directories
    r = db.query('SELECT DISTINCT rootdir, configuration, experiment, run FROM ncfiles')

    runs_already_seen = [os.path.join(*row.values())
                         for row in r]

    print('runs already indexed: {}'.format(len(runs_already_seen)))

    runs_to_index = list(set(runs_available) - set(runs_already_seen))

    if len(runs_to_index) == 0:
        print("No new runs found.")
        return

    print('{} new run directories found including...'.format(len(runs_to_index)))

    for i in range(min(3, len(runs_to_index))):
        print(runs_to_index[i])
    if len(runs_to_index) > 3:
        print('...')

    print('Finding files on disk...')
    ncfiles = []
    for run in tqdm.tqdm_notebook(runs_to_index, leave=True):
        results = subprocess.check_output(['find', run, '-name', '*.nc'])
        results = [s for s in results.decode('utf-8').split()]

        ncfiles.extend(results)

    IPython.display.clear_output(wait=True)
    
    # NetCDF files found on disk not seen before:
    #files_to_add = set(ncfiles) - set(files_already_seen)

    files_to_add = ncfiles

    print('Files found but not yet indexed: {}'.format(len(files_to_add)))

    # For these new files, we can determine their configuration, experiment, and run.
    # Using NetCDF4 to get list of all variables in each file.

    # output* directories
    # match the parent and grandparent directory to configuration/experiment
    find_output = re.compile(r'(.*)/([^/]*)/([^/]*)/(output\d+)/.*\.nc')

    # determine general pattern for ncfile names
    find_basename_pattern = re.compile(r'(?P<root>[^\d]+)(?P<index>__\d+_\d+)?(?P<indexice>\.\d+\-\d+)?(?P<ext>\.nc)')

    def index_variables(ncfile):

        matched = find_output.match(ncfile)
        if matched is None:
            return []

        if not os.path.exists(ncfile):
            return []

        basename = os.path.basename(ncfile)
        m = find_basename_pattern.match(basename)
        if m is None:
            basename_pattern = basename
        else:
            basename_pattern = (m.group('root') + (r'__\d+_\d+' if m.group('index') else '')
                                + (r'.\d+-\d+' if m.group('indexice') else '') + m.group('ext'))

        try:
            with netCDF4.Dataset(ncfile) as ds:
                ncvars = [ {'ncfile': ncfile,
                   'rootdir': matched.group(1),
                   'configuration': matched.group(2),
                   'experiment' : matched.group(3),
                   'run' : matched.group(4),
                   'basename' : basename,
                   'basename_pattern' : basename_pattern,
                   'variable' : v.name,
                   'dimensions' : str(v.dimensions),
                   'chunking' : str(v.chunking()),
                   } for v in ds.variables.values()]
        except Exception:
            print('Exception occurred while trying to read {}'.format(ncfile))
            ncvars = []

        return ncvars

    if len(files_to_add) == 0:
        print("No new .nc files found.")
        return True

    print('Indexing new .nc files...')

    if use_bag:
        with distributed.Client() as client:
            bag = dask.bag.from_sequence(files_to_add)
            bag = bag.map(index_variables).flatten()

            futures = client.compute(bag)
            progress(futures, notebook=False)

            ncvars = futures.result()
    else:
        ncvars = []
        for file_to_add in tqdm.tqdm_notebook(files_to_add, leave=False):
            ncvars.extend(index_variables(file_to_add))
        IPython.display.clear_output()
        
    print('')
    print('Found {} new variables'.format(len(ncvars)))

    print('Saving results in database...')
    db['ncfiles'].insert_many(ncvars)

    print('Indexing complete.')

    return True
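The use_bag branch above maps index_variables over a bag of file names and then flattens the per-file record lists into a single bag. A toy illustration of that map-then-flatten pattern (fake_index and the file names are made up):

import dask.bag


def fake_index(ncfile):
    # stand-in for index_variables(): one record per "variable" in the file
    return [{'ncfile': ncfile, 'variable': name} for name in ('temp', 'salt')]


if __name__ == "__main__":
    records = dask.bag.from_sequence(['a.nc', 'b.nc']).map(fake_index).flatten().compute()
    print(len(records))  # 4 records: 2 files x 2 variables each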
Example #34
File: test_bag.py  Project: postelrich/dask
def test_bag_map():
    b = db.from_sequence(range(100), npartitions=10)
    b2 = db.from_sequence(range(100, 200), npartitions=10)
    x = b.compute()
    x2 = b2.compute()

    def myadd(a=1, b=2, c=3):
        return a + b + c

    assert db.map(myadd, b).compute() == list(map(myadd, x))
    assert db.map(myadd, a=b).compute() == list(map(myadd, x))
    assert db.map(myadd, b, b2).compute() == list(map(myadd, x, x2))
    assert db.map(myadd, b, 10).compute() == [myadd(i, 10) for i in x]
    assert db.map(myadd, 10, b=b).compute() == [myadd(10, b=i) for i in x]

    sol = [myadd(i, b=j, c=100) for (i, j) in zip(x, x2)]
    assert db.map(myadd, b, b=b2, c=100).compute() == sol

    sol = [myadd(i, c=100) for (i, j) in zip(x, x2)]
    assert db.map(myadd, b, c=100).compute() == sol

    x_sum = sum(x)
    sol = [myadd(x_sum, b=i, c=100) for i in x2]
    assert db.map(myadd, b.sum(), b=b2, c=100).compute() == sol

    sol = [myadd(i, b=x_sum, c=100) for i in x2]
    assert db.map(myadd, b2, b.sum(), c=100).compute() == sol

    sol = [myadd(a=100, b=x_sum, c=i) for i in x2]
    assert db.map(myadd, a=100, b=b.sum(), c=b2).compute() == sol

    a = dask.delayed(10)
    assert db.map(myadd, b, a).compute() == [myadd(i, 10) for i in x]
    assert db.map(myadd, b, b=a).compute() == [myadd(i, b=10) for i in x]

    # Mismatched npartitions
    fewer_parts = db.from_sequence(range(100), npartitions=5)
    with pytest.raises(ValueError):
        db.map(myadd, b, fewer_parts)

    # No bags
    with pytest.raises(ValueError):
        db.map(myadd, b.sum(), 1, 2)

    # Unequal partitioning
    unequal = db.from_sequence(range(110), npartitions=10)
    with pytest.raises(ValueError):
        db.map(myadd, b, unequal, c=b2).compute()
    with pytest.raises(ValueError):
        db.map(myadd, b, b=unequal, c=b2).compute()