Example #1
def main(_):
    # Load the image as a chunked dask array and connect to the scheduler
    data = da.from_array(np.array(Image.open(r'dota2.jpg')),
                         chunks=(600, 400, 3))
    client = Client(args.address)
    client.upload_file('calcov.py')

    temp3 = np.zeros((3, 3))
    temp3[0, :] = [0.062467, 0.125000, 0.062467]
    temp3[1, :] = [0.125000, 0.250131, 0.125000]
    temp3[2, :] = [0.062467, 0.125000, 0.062467]

    D = []
    B = []
    for i in range(args.queue):
        D.append(np.array(data + i * 10))
        B.append(temp3 + 0.05)

    future = client.map(calcov.calCov, B, D)
    result = [[np.array(_[0]), str(_[1]), str(_[2])]
              for _ in client.gather(future)]

    shutil.rmtree(r'./data', ignore_errors=True)
    os.mkdir(r'./data')
    for i, (arr, timestamp, worker) in enumerate(result):
        # str.strip() removes characters, not a prefix, so use replace() to drop 'tcp://'
        name = worker.replace('tcp://', '')
        new_im = Image.fromarray(arr)
        new_im.save('./data/result_%s_%s_(%s).jpg' % (i, timestamp, name))
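
The calcov.calCov function uploaded to the workers is not shown here. Below is a hypothetical sketch of a compatible worker function, assuming it smooths each colour channel with the 3x3 kernel and returns the processed image together with a timestamp and the worker address (none of this is confirmed by the snippet):

# calcov.py -- hypothetical sketch, not part of the original example
import time
import numpy as np
from scipy.ndimage import convolve
from distributed import get_worker

def calCov(kernel, data):
    # smooth every colour channel of the image with the 3x3 kernel
    smoothed = np.stack(
        [convolve(data[..., c].astype(float), kernel) for c in range(data.shape[-1])],
        axis=-1).astype(np.uint8)
    # return the processed image, a filename-safe timestamp and the worker address
    return smoothed, time.strftime('%H-%M-%S'), get_worker().address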
Example #2
def test_run_multiple_computational_sidecar_dask(
    event_loop: asyncio.AbstractEventLoop,
    dask_client: Client,
    ubuntu_task: ServiceExampleParam,
    mocker: MockerFixture,
):
    NUMBER_OF_TASKS = 50

    mocker.patch(
        "simcore_service_dask_sidecar.computational_sidecar.core.get_integration_version",
        autospec=True,
        return_value=ubuntu_task.integration_version,
    )
    futures = [
        dask_client.submit(
            run_computational_sidecar,
            ubuntu_task.docker_basic_auth,
            ubuntu_task.service_key,
            ubuntu_task.service_version,
            ubuntu_task.input_data,
            ubuntu_task.output_data_keys,
            ubuntu_task.log_file_url,
            ubuntu_task.command,
            resources={},
        ) for _ in range(NUMBER_OF_TASKS)
    ]

    results = dask_client.gather(futures)

    # check that each task produced exactly the expected data, no less and no more
    for output_data in results:
        for k, v in ubuntu_task.expected_output_data.items():
            assert k in output_data
            assert output_data[k] == v
Example #3
def main():
    #define parallel mcmc wrapper
    def parallel_mcmc(_):
        return (mcmc(initial_parameters=epa_0,
                     proposer=normal_prop,
                     param2res=param2res,
                     costfunction=costfunction,
                     nsimu=5000))

    # check job resources to initialize dask workers
    num_threads = int(
        environ.get('SLURM_CPUS_PER_TASK', environ.get('OMP_NUM_THREADS', 1)))
    initialize(interface='ib0', nthreads=num_threads)
    client = Client()

    #run 10 chains
    [[c_form1, j_form1], [c_form2, j_form2], [c_form3, j_form3],
     [c_form4, j_form4], [c_form5, j_form5], [c_form6, j_form6],
     [c_form7, j_form7], [c_form8, j_form8], [c_form9, j_form9],
     [c_form10,
      j_form10]] = client.gather(client.map(parallel_mcmc, range(0, 10)))

    # print chain 5 output as a test
    formal_c_path = dataPath.joinpath('chain5_pmcmc_c.csv')
    formal_j_path = dataPath.joinpath('chain5_pmcmc_j.csv')
    pd.DataFrame(c_form5).to_csv(formal_c_path, sep=',')
    pd.DataFrame(j_form5).to_csv(formal_j_path, sep=',')
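
The ten-way unpacking above is error-prone to edit. Below is a sketch of the same gather written as a loop that writes every chain (the original saves only chain 5), assuming the same (C chain, J chain) return structure from mcmc:

# Loop-based sketch equivalent to the unpacking above (writes all ten chains)
chains = client.gather(client.map(parallel_mcmc, range(10)))
for i, (c_form, j_form) in enumerate(chains, start=1):
    pd.DataFrame(c_form).to_csv(dataPath.joinpath(f'chain{i}_pmcmc_c.csv'), sep=',')
    pd.DataFrame(j_form).to_csv(dataPath.joinpath(f'chain{i}_pmcmc_j.csv'), sep=',')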
Example #4
class LocalDaskDistributor(DistributorBaseClass):
    """
    Distributor using a local dask cluster and inproc communication.
    """
    def __init__(self, n_workers):
        """
        Initiates a LocalDaskDistributor instance.

        Parameters
        ----------
        n_workers : int
            How many workers should the local dask cluster have?
        """

        super().__init__()
        import tempfile

        from distributed import Client, LocalCluster

        # attribute .local_dir_ is the path where the local dask workers store temporary files
        self.local_dir_ = tempfile.mkdtemp()
        cluster = LocalCluster(n_workers=n_workers,
                               processes=False,
                               local_dir=self.local_dir_)

        self.client = Client(cluster)
        self.n_workers = n_workers

    def distribute(self, func, partitioned_chunks, kwargs):
        """
        Calculates the features in a parallel fashion by distributing the map command to the dask workers on a local
        machine

        Parameters
        ----------
        func : Callable
            Function to send to each worker.
        partitioned_chunks : List
            List of data chunks, each chunk is processed by one worker
        kwargs : Dict
            Parameters for the map function
        Returns
        -------
        List
            The result of the calculation as a list - each item should be the result of the application of func
            to a single element.
        """

        if isinstance(partitioned_chunks, Iterable):
            # since dask 2.0.0 client map no longer accepts iterables
            partitioned_chunks = list(partitioned_chunks)
        result = self.client.gather(
            self.client.map(partial(func, **kwargs), partitioned_chunks))
        return result

    def close(self):
        """
        Closes the connection to the local Dask Scheduler
        """
        self.client.close()
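
A minimal usage sketch of this distributor; my_func, the chunks and the keyword argument below are illustrative only:

# Hypothetical usage of LocalDaskDistributor (sketch)
def my_func(chunk, factor=1):
    return [x * factor for x in chunk]

distributor = LocalDaskDistributor(n_workers=2)
results = distributor.distribute(my_func, [[1, 2], [3, 4]], {"factor": 10})
# results == [[10, 20], [30, 40]]
distributor.close()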
Example #5
class ClusterDaskDistributor(DistributorBaseClass):
    """
    Distributor using a dask cluster, meaning that the calculation is spread over a cluster
    """
    def __init__(self, address):
        """
        Sets up a distributor that connects to a Dask Scheduler to distribute the calculation of the features

        :param address: the IP address and port number of the Dask Scheduler
        :type address: str
        """

        from distributed import Client

        self.client = Client(address=address)

    def calculate_best_chunk_size(self, data_length):
        """
        Uses the number of dask workers in the cluster (during execution time, meaning when you start the extraction)
        to find the optimal chunk_size.

        :param data_length: A length which defines how many calculations there need to be.
        :type data_length: int
        """
        n_workers = len(self.client.scheduler_info()["workers"])
        chunk_size, extra = divmod(data_length, n_workers * 5)
        if extra:
            chunk_size += 1
        return chunk_size

    def distribute(self, func, partitioned_chunks, kwargs):
        """
        Calculates the features in a parallel fashion by distributing the map command to the dask workers on a cluster

        :param func: the function to send to each worker.
        :type func: callable
        :param partitioned_chunks: The list of data chunks - each element is again
            a list of chunks - and should be processed by one worker.
        :type partitioned_chunks: iterable
        :param kwargs: parameters for the map function
        :type kwargs: dict of string to parameter

        :return: The result of the calculation as a list - each item should be the result of the application of func
            to a single element.
        """
        if isinstance(partitioned_chunks, Iterable):
            # since dask 2.0.0 client map no longer accepts iterables
            partitioned_chunks = list(partitioned_chunks)
        result = self.client.gather(
            self.client.map(partial(func, **kwargs), partitioned_chunks))
        return [item for sublist in result for item in sublist]

    def close(self):
        """
        Closes the connection to the Dask Scheduler
        """
        self.client.close()
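
calculate_best_chunk_size aims for roughly five chunks per worker and rounds up when the division is not exact. A small worked example, assuming the scheduler reports 4 workers:

# Worked example of the chunk-size rule (assumes 4 workers)
chunk_size, extra = divmod(103, 4 * 5)   # divmod(103, 20) -> (5, 3)
if extra:
    chunk_size += 1
assert chunk_size == 6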
Example #6
def dask_evaluate(outputs):
    utils.port_increment += 2
    scheduler_port = 8786 + utils.port_increment
    diagnostics_port = 8787 + utils.port_increment

    cluster = LocalCluster(n_workers=1, threads_per_worker=10, nanny=False,
                           scheduler_port=scheduler_port, diagnostics_port=diagnostics_port)
    client = Client(cluster)
    futures = client.persist(outputs)
    return client.gather(futures)
Example #7
class ClusterDaskDistributor(DistributorBaseClass):
    """
    Distributor using a dask cluster, meaning that the calculation is spread over a cluster
    """

    def __init__(self, address):
        """
        Sets up a distributor that connects to a Dask Scheduler to distribute the calculation of the features

        :param address: the IP address and port number of the Dask Scheduler
        :type address: str
        """

        from distributed import Client

        self.client = Client(address=address)

    def calculate_best_chunk_size(self, data_length):
        """
        Uses the number of dask workers in the cluster (during execution time, meaning when you start the extraction)
        to find the optimal chunk_size.

        :param data_length: A length which defines how many calculations there need to be.
        :type data_length: int
        """
        n_workers = len(self.client.scheduler_info()["workers"])
        chunk_size, extra = divmod(data_length, n_workers * 5)
        if extra:
            chunk_size += 1
        return chunk_size

    def distribute(self, func, partitioned_chunks, kwargs):
        """
        Calculates the features in a parallel fashion by distributing the map command to the dask workers on a cluster

        :param func: the function to send to each worker.
        :type func: callable
        :param partitioned_chunks: The list of data chunks - each element is again
            a list of chunks - and should be processed by one worker.
        :type partitioned_chunks: iterable
        :param kwargs: parameters for the map function
        :type kwargs: dict of string to parameter

        :return: The result of the calculation as a list - each item should be the result of the application of func
            to a single element.
        """

        result = self.client.gather(self.client.map(partial(func, **kwargs), partitioned_chunks))
        return [item for sublist in result for item in sublist]

    def close(self):
        """
        Closes the connection to the Dask Scheduler
        """
        self.client.close()
Example #8
def main():
    # get command-line arguments controlling the launch
    threads = 1
    workers = 8
    for x in sys.argv[1:]:
        if x.find("threads") > -1:
            z = x.split("=")
            threads = int(z[1])
        if x.find("workers") > -1:
            z = x.split("=")
            workers = int(z[1])


    # launch with either threads and/or workers specified (0 = default)
    if threads == 0 and workers == 0:
        print("launching with default workers and threads")
        cluster = LocalCluster()
    if threads == 0 and workers != 0:
        print("launching %d workers, default threads" % (workers))
        cluster = LocalCluster(n_workers=workers)
    if threads != 0 and workers == 0:
        print("launching %d threads, default workers" % (threads))
        cluster = LocalCluster(threads_per_worker=threads)
    if threads != 0 and workers != 0:
        print("launching %d workers with %d threads" % (workers, threads))
        cluster = LocalCluster(n_workers=workers, threads_per_worker=threads)
    print(cluster)
    client = Client(cluster)
    print(client)

    # do serial
    # NOTE: it is possible to launch an asynchronous client
    # but here we just do serial synchronous.  See:
    # https://distributed.dask.org/en/latest/asynchronous.html
    result = []
    print("   pid  Start T")
    for i in range(0, 5):
        j = 2
        result.append(client.submit(test, i, j).result())
    print(result)
    print(Counter(result))
    #do parallel
    n = 15
    np.random.seed(1234)
    x = np.random.random(n) * 20
    #set to uniform nonzero to get uniform run times for each task
    x = np.ones(n) * 10
    print(x)
    print("   pid  Start T")
    L = client.map(test, range(n), x)
    mylist = client.gather(L)
    pids = []
    for m in mylist:
        x = m.split()[0]
        pids.append(x)
        print(m)
    pids = sorted(set(pids))
    print(len(pids), pids)
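
The test function submitted and mapped above is defined elsewhere. Below is a hypothetical sketch that matches the '   pid  Start T' header and the m.split()[0] parsing, assuming the second argument controls how long each task sleeps:

# Hypothetical sketch of test(i, x) -- not part of the original example
import os
import time

def test(i, x):
    start = time.time()
    time.sleep(x)                     # x sets the task duration in seconds
    return "%d %.2f" % (os.getpid(), start)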
Example #9
class LocalDaskDistributor(DistributorBaseClass):
    """
    Distributor using a local dask cluster and inproc communication.
    """
    def __init__(self, n_workers):
        """

        Initiates a LocalDaskDistributor instance.

        :param n_workers: How many workers should the local dask cluster have?
        :type n_workers: int
        """

        from distributed import LocalCluster, Client
        import tempfile

        # attribute .local_dir_ is the path where the local dask workers store temporary files
        self.local_dir_ = tempfile.mkdtemp()
        cluster = LocalCluster(n_workers=n_workers,
                               processes=False,
                               local_dir=self.local_dir_)

        self.client = Client(cluster)
        self.n_workers = n_workers

    def distribute(self, func, partitioned_chunks, kwargs):
        """
        Calculates the features in a parallel fashion by distributing the map command to the dask workers on a local
        machine

        :param func: the function to send to each worker.
        :type func: callable
        :param partitioned_chunks: The list of data chunks - each element is again
            a list of chunks - and should be processed by one worker.
        :type partitioned_chunks: iterable
        :param kwargs: parameters for the map function
        :type kwargs: dict of string to parameter

        :return: The result of the calculation as a list - each item should be the result of the application of func
            to a single element.
        """

        if isinstance(partitioned_chunks, Iterable):
            # since dask 2.0.0 client map no longer accepts iterables
            partitioned_chunks = list(partitioned_chunks)
        result = self.client.gather(
            self.client.map(partial(func, **kwargs), partitioned_chunks))
        return [item for sublist in result for item in sublist]

    def close(self):
        """
        Closes the connection to the local Dask Scheduler
        """
        self.client.close()
Example #10
def run(spec: dict, scheduler: str):
    class CompleteDaskJob:
        def __init__(self, message: str = ""):
            self.message = message

    class InvalidDaskJob():
        def __init__(self, message: str = ""):
            self.message = message

    class DaskQueryJob():
        def __init__(self, job_spec: dict):
            self.query_string = job_spec.get("query_string")
            self.database = job_spec.get("database")
            self.output_path = job_spec.get("output_path")

        def run_job(self) -> Union[CompleteDaskJob, InvalidDaskJob]:
            # df: DataFrame = dd.read_sql_table(self.query_string)
            if self.output_path:
                # df.to_parquet(self.output_path)
                return CompleteDaskJob(
                    "Job to query via Dask successfully queued to scheduler")
            else:
                return InvalidDaskJob(
                    "Output path required for Dask implementation of table query"
                )

    dask_job = DaskQueryJob(spec)
    mode = "async"

    if scheduler == "local":
        client = Client()
        dask_job.run_job()
    else:
        dask.config.set({'distributed.scheduler.allowed-failures': 50})
        client = Client(scheduler)
        future = client.submit(dask_job.run_job)
        if mode == "sync":
            client.gather(future)
        else:
            fire_and_forget(future)
Example #11
class LocalDaskDistributor(DistributorBaseClass):
    """
    Distributor using a local dask cluster and inproc communication.
    """

    def __init__(self, n_workers):
        """

        Initiates a LocalDaskDistributor instance.

        :param n_workers: How many workers should the local dask cluster have?
        :type n_workers: int
        """

        from distributed import LocalCluster, Client
        import tempfile

        # attribute .local_dir_ is the path where the local dask workers store temporary files
        self.local_dir_ = tempfile.mkdtemp()
        cluster = LocalCluster(n_workers=n_workers, processes=False, local_dir=self.local_dir_)

        self.client = Client(cluster)
        self.n_workers = n_workers

    def distribute(self, func, partitioned_chunks, kwargs):
        """
        Calculates the features in a parallel fashion by distributing the map command to the dask workers on a local
        machine

        :param func: the function to send to each worker.
        :type func: callable
        :param partitioned_chunks: The list of data chunks - each element is again
            a list of chunks - and should be processed by one worker.
        :type partitioned_chunks: iterable
        :param kwargs: parameters for the map function
        :type kwargs: dict of string to parameter

        :return: The result of the calculation as a list - each item should be the result of the application of func
            to a single element.
        """
        result = self.client.gather(self.client.map(partial(func, **kwargs), partitioned_chunks))
        return [item for sublist in result for item in sublist]

    def close(self):
        """
        Closes the connection to the local Dask Scheduler
        """
        self.client.close()
Example #12
def main():
    """
    Use the Dask distributed client to run a function in parallel.
    """
    client = Client(n_workers=8)

    numbers = [3, 4, 5, 8, 12, 18, 25]
    futures = []

    for n in numbers:
        a = client.submit(adder, n)
        futures.append(a)

    results = client.gather(futures)
    print(results)

    client.close()
Example #13
def test_use_with_dask():
    try:
        import dask
        import dask.distributed
        from distributed import Client
    except ImportError:
        import warnings

        warnings.warn("Dask and/or Distributed are not installed")
        return
    with open(f"{CURRENT_DIR}/test-ogusa-remote.json") as f:
        remote_outputs = json.loads(f.read())
    outputs = cs_storage.read(remote_outputs["outputs"])

    c = Client()
    futures = c.map(cs_storage.screenshot, outputs["renderable"])
    results = c.gather(futures)
    for result in results:
        assert isinstance(result, bytes)
Example #14
def main():
    from argparse import ArgumentParser

    parser = ArgumentParser()
    #parser.add_argument('min_num', type=int)
    #parser.add_argument('max_num', type=int)
    args = parser.parse_args()

    num_threads = int(
        environ.get('SLURM_CPUS_PER_TASK', environ.get('OMP_NUM_THREADS', 1)))
    initialize(interface='ib0', nthreads=num_threads)
    client = Client()

    min_num = 10
    max_num = 100
    start_time = datetime.now()
    num_primes = sum(
        client.gather(client.map(slow_is_prime, range(min_num, max_num + 1))))
    end_time = datetime.now()

    print(f'{num_primes} primes between {min_num} and {max_num} '
          f'[{end_time - start_time}]')
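
slow_is_prime is defined elsewhere; below is a hypothetical sketch of a deliberately naive primality test whose boolean results sum to the prime count above:

# Hypothetical sketch of slow_is_prime -- not part of the original example
def slow_is_prime(n):
    if n < 2:
        return False
    return all(n % k != 0 for k in range(2, n))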
Example #15
class DaskClient(Thread):
    def __init__(self, clientUrl, clientId, daqObjectGenerator, resultQ):
        Thread.__init__(self, name='DaskClient-%s' % clientId)
        self.client = Client(clientUrl)
        self.clientId = clientId
        self.daqObjectGenerator = daqObjectGenerator
        self.resultQ = resultQ
        self.idQ = Queue()
        self.remoteIdQ = self.client.scatter(self.idQ)
        self.generatorQ = self.client.map(self.daqObjectGenerator.generate,
                                          self.remoteIdQ)
        self.pvQ = self.client.gather(self.generatorQ)
        self.nGenerated = 0
        self.event = Event()

    def putTask(self, objectId):
        #t0 = time.time()
        self.idQ.put(objectId)
        #t1 = time.time()
        #dt = t1-t0
        #print('PUSH TASK: %s' % dt)
        #self.event.set()

    def getPv(self, timeout=None):
        #t0 = time.time()
        pv = self.pvQ.get(timeout=timeout)
        #t1 = time.time()
        #dt = t1-t0
        #print('GET PV: %s' % dt)
        return pv

    def run(self):
        print('STARTING THREAD, CLIENT ID: %s' % self.clientId)
        while True:
            pv = self.pvQ.get(timeout=None)
            self.nGenerated += 1
            #print('GOT PV , CLIENT ID %s: %s' % (self.clientId, pv['ArrayId']))
            #print('CLIENT ID %s: N GENERATED=%s' % (self.clientId, self.nGenerated))
            self.resultQ.put((pv, self.clientId))
Example #16
def dask_compute_grid(ddclient=None, func=None, **kwargs):
    temp_cluster = False
    completed = []
    
    if ddclient is None:
        print('creating local dask distributed cluster...')
        ddclient = Client()
        temp_cluster = True
    
    try:
        print('cluster dashboard available at: ' + dask_get_ddclient_dashboard_address(ddclient))
        from IPython.display import display
        display(ddclient)
        tfunc = make_return_tuple(func)
        kwargs_list = ([(k, i) for i in v] for k, v in kwargs.items())
        
        # tuple of cartesian products of {{(arg_name, arg_val) | arg_val in arg_vals} | arg_name in arg_names}
        cart_prod_tup = product(*kwargs_list)
        cart_prod_dicts = [dict(i) for i in cart_prod_tup]

        print('submitting {} jobs to cluster...'.format(len(cart_prod_dicts)))
        futures = [ddclient.submit(tfunc, **kwargs) for kwargs in cart_prod_dicts]

        print('computing jobs...')
        completed = ddclient.gather(futures)

        print('computation done')
    
    finally:
        if temp_cluster:
            print('shutting down cluster...')
            ddclient.close()
    
    print('done')
    return completed
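
A hedged usage sketch of dask_compute_grid: the keyword lists are expanded into a Cartesian grid and func is submitted once per combination. The train function below is illustrative, and the exact shape of each completed item depends on the make_return_tuple helper, which is not shown:

# Hypothetical usage of dask_compute_grid (sketch)
def train(lr, depth):
    return lr * depth

completed = dask_compute_grid(func=train, lr=[0.1, 0.01], depth=[2, 4, 8])
# submits 2 * 3 = 6 jobs, one per (lr, depth) combination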
Example #17
def test_distributed_handler_distributed(values, expected_values):
    cluster = LocalCluster(processes=False)

    with DistributedHandler(cluster.scheduler_address) as handler:
        futures = handler.client.map(lambda x: x + 1, values)
        handler_map_results = handler.gather(futures)

    with DistributedHandler(cluster.scheduler_address) as handler:
        handler_batched_results = handler.batched_map(lambda x: x + 1, values)

    client = Client(cluster)
    futures = client.map(lambda x: x + 1, values)

    distributed_results = client.gather(futures)

    handler_map_results = set(handler_map_results)
    handler_batched_results = set(handler_batched_results)
    distributed_results = set(distributed_results)

    assert (handler_map_results == handler_batched_results
            and handler_map_results == distributed_results)

    cluster.close()
Example #18
class ClusterDaskDistributor(DistributorBaseClass):
    """
    Distributor using a dask cluster, meaning that the calculation is spread over a cluster
    """
    def __init__(self, address):
        """
        Sets up a distributor that connects to a Dask Scheduler to distribute the calculation of the features

        Parameters
        ----------
        address : str
            The ip address and port number of the Dask Scheduler
        """

        super().__init__()
        from distributed import Client

        self.client = Client(address=address)

    def calculate_best_chunk_size(self, data_length):
        """
        Uses the number of dask workers in the cluster (during execution time, meaning when you start the extraction)
        to find the optimal chunk_size.

        Parameters
        ----------
        data_length: int
            A length which defines how many calculations there need to be.
        """

        n_workers = len(self.client.scheduler_info()["workers"])
        chunk_size, extra = divmod(data_length, n_workers * 5)
        if extra:
            chunk_size += 1
        return chunk_size

    def distribute(self, func, partitioned_chunks, kwargs):
        """
        Calculates the features in a parallel fashion by distributing the map command to the dask workers on a cluster

        Parameters
        ----------
        func : Callable
            Function to send to each worker.
        partitioned_chunks : List
            List of data chunks, each chunk is processed by one worker
        kwargs : Dict
            Parameters for the map function
        Returns
        -------
        List
            The result of the calculation as a list - each item should be the result of the application of func
            to a single element
        """

        if isinstance(partitioned_chunks, Iterable):
            # since dask 2.0.0 client map no longer accepts iterables
            partitioned_chunks = list(partitioned_chunks)
        result = self.client.gather(
            self.client.map(partial(func, **kwargs), partitioned_chunks))
        return result

    def close(self):
        """
        Closes the connection to the Dask Scheduler
        """
        self.client.close()
Example #19
        a.append(url)
    return a

def get_url(r):
    url = 'https://s3.amazonaws.com/cloudydap/bytestream/'+r['md5']
    return url

def compute(url):
    # print url
    response = urllib2.urlopen(url)
    buf = response.read()
    # print len(buf)
    dec = zlib.decompressobj(32+zlib.MAX_WBITS)
    unzipped = dec.decompress(buf)
    # print len(unzipped)
    # Pick a specific point
    a = unzipped[1]+unzipped[13104]+unzipped[26208]+unzipped[39312]
    # print struct.unpack('<f', a)
    return struct.unpack('<f', a)

# a = search("PRECCU AND chunk_position:\[0,0,0\] AND filename:MERRA2_100*")
a = search("PRECCU AND chunk_position:\[0,91,288\] AND filename:MERRA2_100*")
# a = search("PRECCU AND chunk_position:\[0,0,0\] AND filename:*tavgM_2d_int_*")
# search("PRECCU AND chunk_position:\[0,91,288\] AND filename: MERRA2_400.tavgM_2d_int_Nx.201507.nc4")


c = Client('localhost:8786')
m = c.map(compute, a)
x = c.gather(m)
print(x)
Example #20
    arg_parser.add_argument('--n',
                            type=int,
                            default=100,
                            help='number of terms in sum')
    arg_parser.add_argument('--verbose',
                            action='store_true',
                            help='give verbose output')
    options = arg_parser.parse_args()
    client = Client('{0}:{1}'.format(options.scheduler,
                                     options.scheduler_port))
    if options.verbose:
        print('Client: {0}'.format(str(client)), flush=True)
    futures = client.map(square, range(options.n))
    total = client.submit(sum, futures)
    expected_total = (options.n - 1) * options.n * (2 * options.n - 1) // 6
    print('sum_i=0..99 i^2 = {0:d}, expected {1:d}'.format(
        total.result(), expected_total))
    futures = client.map(get_hostname, range(options.n))
    process_locations = client.gather(futures)
    if options.verbose:
        print('task placement:')
        print('\t' + '\n\t'.join(process_locations))
    count = dict()
    for process_location in process_locations:
        _, _, hostname = process_location.split()
        if hostname not in count:
            count[hostname] = 0
        count[hostname] += 1
    for hostname, nr_tasks in count.items():
        print('{0:d} tasks on {1}'.format(nr_tasks, hostname))
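
square and get_hostname are defined elsewhere; below is a hypothetical sketch consistent with the parsing above, where the third whitespace-separated field of each result is the hostname:

# Hypothetical sketches -- not part of the original example
import os
import socket

def square(i):
    return i * i

def get_hostname(i):
    return '{0} {1} {2}'.format(i, os.getpid(), socket.gethostname())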
Example #21
with np.load("../dask_fft_data_s0000.npz") as df:
    num_channels, num_fft = df["fft_data"].shape
    print(num_channels, num_fft)

    fft_data = da.from_array(df["fft_data"], chunks=(1, num_fft))
    fft_data = dask_client.persist(fft_data)  # keep a handle to the persisted array

# Calculate the crosspower using the array interface
res1 = (fft_data[:2, :] * fft_data[-2:, :].conj()).mean(axis=1)
print("type res1 = ", type(res1))
res2 = da.arctan2(res1.real, res1.imag).real
print("type res2 = ", type(res2))
print("result res2 = ", res2.compute())


# Calculate the crosspower using the distributed interface
def cross_phase(ft_data, ch1, ch2):

    _tmp1 = (ft_data[ch1, :] * ft_data[ch2, :].conj()).mean().compute()
    print("** crosspower: type(tmp1) =", type(_tmp1))
    _tmp2 = np.arctan2(_tmp1.real, _tmp1.imag).real
    #_tmp2 = _tmp1.real + _tmp1.imag

    return (_tmp2)


res_d = dask_client.submit(cross_phase, fft_data, 1, 6)
print("type resd = ", type(res_d))
print("results resd = ", dask_client.gather(res_d))

# End of file test_crossphase.py
Example #22
# Set up scheduler
s = Scheduler(loop=loop)
s.start()

#Set up Workers
w = Worker('comet-14-02.sdsc.edu', loop=loop)
w.start(0)

# Set up client
client = Client('comet-14-02.sdsc.edu:8786')


def chunks(l, n):
    for i in range(0, len(l), n):
        yield l[i:i + n]


#pprint.pprint(list(chunks(range(0, 255), 64)))
output = []
y = list(chunks(range(0, 255), 64))
#print y[0]

for ix in y:
    a = client.map(sum, ix)
    output.append(a)

total = client.submit(sum, output)
# Future objects have no visualize(); fetch the value with result() instead
print(total.result())
client.gather(total)
Example #23
def parallel_calculate_chunks(chunks,
                              features,
                              approximate,
                              training_window,
                              verbose,
                              save_progress,
                              entityset,
                              n_jobs,
                              no_unapproximated_aggs,
                              cutoff_df_time_var,
                              target_time,
                              pass_columns,
                              dask_kwargs=None):
    from distributed import Client, LocalCluster, as_completed
    from dask.base import tokenize

    client = None
    cluster = None
    try:
        if 'cluster' in dask_kwargs:
            cluster = dask_kwargs['cluster']
        else:
            diagnostics_port = None
            if 'diagnostics_port' in dask_kwargs:
                diagnostics_port = dask_kwargs['diagnostics_port']
                del dask_kwargs['diagnostics_port']

            workers = n_jobs_to_workers(n_jobs)
            workers = min(workers, len(chunks))
            cluster = LocalCluster(n_workers=workers,
                                   threads_per_worker=1,
                                   diagnostics_port=diagnostics_port,
                                   **dask_kwargs)
            # if the cluster has a bokeh port, notify the user of an unexpected port number
            if diagnostics_port is not None:
                if hasattr(cluster, 'scheduler') and cluster.scheduler:
                    info = cluster.scheduler.identity()
                    if 'bokeh' in info['services']:
                        msg = "Dashboard started on port {}"
                        print(msg.format(info['services']['bokeh']))

        client = Client(cluster)
        # scatter the entityset
        # denote future with leading underscore
        start = time.time()
        es_token = "EntitySet-{}".format(tokenize(entityset))
        if es_token in client.list_datasets():
            print("Using EntitySet persisted on the cluster as dataset %s" %
                  (es_token))
            _es = client.get_dataset(es_token)
        else:
            _es = client.scatter([entityset])[0]
            client.publish_dataset(**{_es.key: _es})

        # save features to a tempfile and scatter it
        pickled_feats = cloudpickle.dumps(features)
        _saved_features = client.scatter(pickled_feats)
        client.replicate([_es, _saved_features])
        end = time.time()
        scatter_time = end - start
        scatter_string = "EntitySet scattered to workers in {:.3f} seconds"
        print(scatter_string.format(scatter_time))

        # map chunks
        # TODO: consider handling task submission dask kwargs
        _chunks = client.map(calculate_chunk,
                             chunks,
                             features=_saved_features,
                             entityset=_es,
                             approximate=approximate,
                             training_window=training_window,
                             profile=False,
                             verbose=False,
                             save_progress=save_progress,
                             no_unapproximated_aggs=no_unapproximated_aggs,
                             cutoff_df_time_var=cutoff_df_time_var,
                             target_time=target_time,
                             pass_columns=pass_columns)

        feature_matrix = []
        iterator = as_completed(_chunks).batches()
        if verbose:
            pbar_str = ("Elapsed: {elapsed} | Remaining: {remaining} | "
                        "Progress: {l_bar}{bar}| "
                        "Calculated: {n}/{total} chunks")
            pbar = make_tqdm_iterator(total=len(_chunks), bar_format=pbar_str)
        for batch in iterator:
            results = client.gather(batch)
            for result in results:
                feature_matrix.append(result)
                if verbose:
                    pbar.update()
        if verbose:
            pbar.close()
    except Exception:
        raise
    finally:
        if 'cluster' not in dask_kwargs and cluster is not None:
            cluster.close()
        if client is not None:
            client.close()

    return feature_matrix
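
The scatter/publish_dataset block above lets later calls reuse an EntitySet that is already held by the cluster instead of re-serialising it. Below is a minimal standalone sketch of the same pattern with illustrative names:

# Minimal sketch of the publish-and-reuse pattern (illustrative names)
from distributed import Client

client = Client()
token = "my-dataset-v1"
if token in client.list_datasets():
    data_future = client.get_dataset(token)            # already on the cluster
else:
    data_future = client.scatter([{"some": "large object"}])[0]
    client.publish_dataset(**{token: data_future})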
Example #24
from distributed import Client
import time

client = Client("192.168.0.106:8786")
client.restart()

from funcs import create_dirs, get_dirs, add_flag

futures = client.map(create_dirs, range(100))
flags = client.submit(get_dirs, futures)
result = client.gather(flags)
print(result)
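
The funcs module imported above is not shown; below is a hypothetical sketch of create_dirs and get_dirs consistent with these calls (add_flag is omitted). Note that client.submit resolves the list of futures, so get_dirs receives the list of created paths:

# funcs.py -- hypothetical sketch, not part of the original example
import os

def create_dirs(i):
    path = os.path.join("/tmp", "dask_demo", "dir_%03d" % i)
    os.makedirs(path, exist_ok=True)
    return path

def get_dirs(paths):
    # receives the resolved results of the mapped create_dirs futures
    return sorted(paths)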
Example #25
class Distributed(object):
    '''
    Distributed objects represent SUMMA configurations where
    there are multiple GRU/HRU which are expected to be run
    in parallel.

    Currently only supports GRU based parallelization.

    Attributes
    ----------
    executable:
        Path to the SUMMA executable
    manager:
        FileManager object
    num_workers:
        Number of parallel workers to use
    chunk_args:
        List of dictionaries containing ``startGRU`` and ``countGRU`` values
    simulations:
        Dictionary of run names and Simulation objects
    '''
    def __init__(self,
                 executable: str,
                 filemanager: str,
                 num_workers: int = 1,
                 threads_per_worker: int = OMP_NUM_THREADS,
                 chunk_size: int = None,
                 num_chunks: int = None,
                 scheduler: str = None,
                 client: Client = None):
        """
        Initialize a new distributed object

        Parameters
        ----------
        executable:
            Path to the SUMMA executable
        filemanager:
            Path to the file manager
        num_workers:
            Number of workers to use for parallel runs
        threads_per_worker:
            Number of threads each worker has
        chunk_size:
            Number of GRU per job
            (cannot be used with num_chunks)
        num_chunks:
            How many jobs to split the run into
            (Cannot be used with chunk_size)
        scheduler:
            Not used currently
        """
        self._status = 'Initialized'
        self.executable = executable
        self.manager_path = Path(os.path.abspath(
            os.path.realpath(filemanager)))
        self.manager = FileManager(self.manager_path.parent,
                                   self.manager_path.name)
        self.simulations: Dict[str, Simulation] = {}
        self.submissions: List = []
        self.num_workers: int = num_workers
        # Try to get a client, and if none exists then start a new one
        if client:
            self._client = client
            workers = len(self._client.get_worker_logs())
            if workers <= self.num_workers:
                self._client.cluster.scale(workers)
        else:
            try:
                self._client = get_client()
                # Start more workers if necessary:
                workers = len(self._client.get_worker_logs())
                if workers <= self.num_workers:
                    self._client.cluster.scale(workers)
            except ValueError:
                self._client = Client(n_workers=self.num_workers,
                                      threads_per_worker=threads_per_worker)
        self.chunk_args = self._generate_args(chunk_size, num_chunks)
        self._generate_simulation_objects()

    def _generate_simulation_objects(self):
        """
        Create each of the required simulation objects
        """
        for argdict in self.chunk_args:
            start = argdict['startGRU']
            stop = argdict['startGRU'] + argdict['countGRU'] - 1
            name = f"g{start}-{stop}"
            self.simulations[name] = Simulation(self.executable,
                                                self.manager_path, False)

    def _generate_args(self, chunk_size: int = None, num_chunks: int = None):
        '''
        Generate the arguments that will be used to start multiple
        runs from the base ``self.simulation``
        '''
        assert not (chunk_size and num_chunks), \
            "Only specify at most one of `chunk_size` or `num_chunks`!"
        start, stop = 0, 0
        sim_size = len(self.manager.local_attributes['gru'])
        if not (chunk_size or num_chunks):
            chunk_size = 12
        if chunk_size:
            sim_truncated = (chunk_size - 1) * (sim_size // (chunk_size - 1))
            starts = np.arange(1, sim_truncated + 1, chunk_size).astype(int)
            stops = np.append(starts[1:], sim_size + 1)
            chunks = np.vstack([starts, stops]).T
        elif num_chunks:
            chunk_size = np.ceil(sim_size / num_chunks).astype(int)
            starts = np.arange(1, sim_size, chunk_size)
            stops = np.append(starts[1:], sim_size + 1)
            chunks = np.vstack([starts, stops]).T
        return [{
            'startGRU': start,
            'countGRU': stop - start
        } for start, stop in chunks]

    def start(self, run_option: str, prerun_cmds: List = None):
        """
        Start running the ensemble members.

        Parameters
        ----------
        run_option:
            The run type. Should be either 'local' or 'docker'
        prerun_cmds:
            A list of preprocessing commands to run
        """
        for idx, (name, sim) in enumerate(self.simulations.items()):
            kwargs = self.chunk_args[idx]
            self.submissions.append(
                self._client.submit(_submit, sim, name, run_option,
                                    prerun_cmds, kwargs))

    def run(self, run_option: str, prerun_cmds=None, monitor: bool = True):
        """
        Run the ensemble

        Parameters
        ----------
        run_option:
            Where to run the simulation. Can be ``local`` or ``docker``
        prerun_cmds:
            A list of shell commands to run before running SUMMA
        monitor:
            Whether to halt operation until runs are complete
        """
        self.start(run_option, prerun_cmds)
        if monitor:
            return self.monitor()
        else:
            return True

    def monitor(self):
        """
        Halt computation until submitted simulations are complete
        """
        simulations = self._client.gather(self.submissions)
        for s in simulations:
            self.simulations[s.run_suffix] = s

    def merge_output(self):
        pass
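
The module-level _submit helper used by start() is not shown; below is a heavily hedged sketch of what it might look like, assuming the Simulation object exposes a run method that accepts these arguments (the method name and signature are assumptions, not confirmed by the snippet):

# Hypothetical sketch of _submit -- the Simulation API here is assumed
def _submit(sim, name, run_option, prerun_cmds, kwargs):
    sim.run(run_option, run_suffix=name, prerun_cmds=prerun_cmds, **kwargs)
    return sim   # monitor() reads run_suffix from the gathered Simulation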
Example #26
    arg_parser.add_argument('--scheduler', help='scheduler host')
    arg_parser.add_argument('--scheduler_port', default='8786',
                            help='scheduler port to use')
    arg_parser.add_argument('--n', type=int, default=100,
                            help='number of terms in sum')
    arg_parser.add_argument('--verbose', action='store_true',
                            help='give verbose output')
    options = arg_parser.parse_args()
    client = Client('{0}:{1}'.format(options.scheduler,
                                     options.scheduler_port))
    if options.verbose:
        print('Client: {0}'.format(str(client)), flush=True)
    futures = client.map(square, range(options.n))
    total = client.submit(sum, futures)
    expected_total = (options.n - 1)*options.n*(2*options.n - 1)//6
    print('sum_i=0..99 i^2 = {0:d}, expected {1:d}'.format(total.result(),
                                                           expected_total))
    futures = client.map(get_hostname, range(options.n))
    process_locations = client.gather(futures)
    if options.verbose:
        print('task placement:')
        print('\t' + '\n\t'.join(process_locations))
    count = dict()
    for process_location in process_locations:
        _, _, hostname = process_location.split()
        if hostname not in count:
            count[hostname] = 0
        count[hostname] += 1
    for hostname, nr_tasks in count.items():
        print('{0:d} tasks on {1}'.format(nr_tasks, hostname))
Example #27
class Ensemble(object):
    '''
    Ensembles represent multiple SUMMA configurations based on
    changing the decisions or parameters of a given run.

    Attributes
    ----------
    executable:
        Path to the SUMMA executable
    filemanager: (optional)
        Path to the file manager
    configuration:
        Dictionary of runs, along with settings
    num_workers:
        Number of parallel workers to use
    simulations:
        Dictionary of run names and Simulation objects
    '''
    def __init__(self,
                 executable: str,
                 configuration: dict,
                 filemanager: str = None,
                 num_workers: int = 1,
                 threads_per_worker: int = OMP_NUM_THREADS,
                 scheduler: str = None,
                 client: Client = None):
        """
        Create a new Ensemble object. The API mirrors that of the
        Simulation object.
        """
        self._status = 'Initialized'
        self.executable: str = executable
        self.filemanager: str = filemanager
        self.configuration: dict = configuration
        self.num_workers: int = num_workers
        self.simulations: dict = {}
        self.submissions: list = []
        # Try to get a client, and if none exists then start a new one
        if client:
            self._client = client
            workers = len(self._client.get_worker_logs())
            if workers <= self.num_workers:
                self._client.cluster.scale(workers)
        else:
            try:
                self._client = get_client()
                # Start more workers if necessary:
                workers = len(self._client.get_worker_logs())
                if workers <= self.num_workers:
                    self._client.cluster.scale(workers)
            except ValueError:
                self._client = Client(n_workers=self.num_workers,
                                      threads_per_worker=threads_per_worker)
        self._generate_simulation_objects()

    def _generate_simulation_objects(self):
        """
        Create a mapping of configurations to the simulation objects.
        """
        if self.filemanager:
            for name, config in self.configuration.items():
                self.simulations[name] = Simulation(self.executable,
                                                    self.filemanager, False)
        else:
            for name, config in self.configuration.items():
                assert config['file_manager'] is not None, \
                    "No filemanager found in configuration or Ensemble!"
                self.simulations[name] = Simulation(self.executable,
                                                    config['file_manager'],
                                                    False)

    def _generate_coords(self):
        """
        Generate the coordinates that can be used to merge the output
        of the ensemble runs into a single dataset.
        """
        decision_dims = ChainDict()
        manager_dims = ChainDict()
        parameter_dims = ChainDict()
        for name, conf in self.configuration.items():
            for k, v in conf.get('decisions', {}).items():
                decision_dims[k] = v
            for k, v in conf.get('file_manager', {}).items():
                manager_dims[k] = v
            #for k, v in conf.get('parameters', {}).items():
            #    parameter_dims[k] = v
            for k, v in conf.get('trial_parameters', {}).items():
                parameter_dims[k] = v
        return {
            'decisions': decision_dims,
            'managers': manager_dims,
            'parameters': parameter_dims
        }

    def merge_output(self):
        """
        Open and merge all of the output datasets from the ensemble
        run into a single dataset.
        """
        nc = self._generate_coords()
        new_coords = (list(nc.get('decisions', {})) +
                      list(nc.get('parameters', {})))
        decision_tuples = [
            tuple(n.split('++')[1:-1]) for n in self.configuration.keys()
        ]
        for i, t in enumerate(decision_tuples):
            decision_tuples[i] = tuple(
                (float(l.split('=')[-1]) if '=' in l else l for l in t))
        decision_names = [
            '++'.join(tuple(n.split('++')[1:-1]))
            for n in self.configuration.keys()
        ]
        if sum([len(dt) for dt in decision_tuples]) == 0:
            raise NameError("Simulations in the ensemble do not share all"
                            " common decisions! Please use `open_output`"
                            " to retrieve the output of this Ensemble")
        for i, t in enumerate(decision_names):
            decision_names[i] = '++'.join(l.split('=')[0] for l in t)
        new_idx = pd.MultiIndex.from_tuples(decision_tuples, names=new_coords)
        out_file_paths = [
            s.get_output_files() for s in self.simulations.values()
        ]
        out_file_paths = [fi for sublist in out_file_paths for fi in sublist]
        full = xr.open_mfdataset(out_file_paths,
                                 concat_dim='run_number',
                                 combine='nested')
        merged = full.assign_coords(run_number=decision_names)
        merged['run_number'] = new_idx
        merged = merged.unstack('run_number')
        return merged

    def start(self, run_option: str, prerun_cmds: list = None):
        """
        Start running the ensemble members.

        Parameters
        ----------
        run_option:
            The run type. Should be either 'local' or 'docker'
        prerun_cmds:
            A list of preprocessing commands to run
        """
        for n, s in self.simulations.items():
            # Sleep calls are to ensure writeout happens
            config = self.configuration[n]
            self.submissions.append(
                self._client.submit(_submit, s, n, run_option, prerun_cmds,
                                    config))

    def run(self, run_option: str, prerun_cmds=None, monitor: bool = True):
        """
        Run the ensemble

        Parameters
        ----------
        run_option:
            Where to run the simulation. Can be ``local`` or ``docker``
        prerun_cmds:
            A list of shell commands to run before running SUMMA
        monitor:
            Whether to halt operation until runs are complete
        """
        self.start(run_option, prerun_cmds)
        if monitor:
            return self.monitor()
        else:
            return True

    def map(self, fun, args, include_sims=True, monitor=True):
        for n, s in self.simulations.items():
            config = self.configuration[n]
            if include_sims:
                all_args = (s, n, *args, {'config': config})
            else:
                all_args = (*args, {'config': config})
            self.submissions.append(self._client.submit(fun, *all_args))
        if monitor:
            return self.monitor()
        else:
            return True

    def monitor(self):
        """
        Halt computation until submitted simulations are complete
        """
        simulations = self._client.gather(self.submissions)
        for s in simulations:
            self.simulations[s.run_suffix] = s

    def summary(self):
        """
        Show the user information about ensemble status
        """
        success, error, other = [], [], []
        for n, s in self.simulations.items():
            if s.status == 'Success':
                success.append(n)
            elif s.status == 'Error':
                error.append(n)
            else:
                other.append(n)
        return {'success': success, 'error': error, 'other': other}

    def rerun_failed(self,
                     run_option: str,
                     prerun_cmds=None,
                     monitor: bool = True):
        """
        Try to re-run failed simulations.

        Parameters
        ----------
        run_option:
            Where to run the simulation. Can be ``local`` or ``docker``
        prerun_cmds:
            A list of shell commands to run before running SUMMA
        monitor:
            Whether to halt operation until runs are complete
        """
        run_summary = self.summary()
        self.submissions = []
        for n in run_summary['error']:
            config = self.configuration[n]
            s = self.simulations[n]
            s.reset()
            self.submissions.append(
                self._client.submit(_submit, s, n, run_option, prerun_cmds,
                                    config))
        if monitor:
            return self.monitor()
        else:
            return True
Example #28
def preprocessing_script():
    """
    This script will process all the hybridization folders combined in a
    processing folder. The input parameters are passed using argparse.

    Parameters:
    -----------
    
    scheduler: string
        tcp address of the dask.distributed scheduler (ex. tcp://192.168.0.4:7003). 
        default = False. If False the process will run on the local computer using nCPUs-1

    path: string
        Path to the processing directory


    """


    # Inputs of the function
    parser = argparse.ArgumentParser(description='Preprocessing script')
    parser.add_argument('-scheduler', default=False, help='dask scheduler address ex. tcp://192.168.0.4:7003')
    parser.add_argument('-path', help='processing directory')
    args = parser.parse_args()
    
    # Directory to process
    processing_directory = args.path
    # Dask scheduler address
    scheduler_address = args.scheduler
    
    if scheduler_address:
        # Start dask client on server or cluster
        client=Client(scheduler_address)

    else:
        # Start dask client on the local machine. It will use all the available
        # cores minus 1

        # number of core to use
        ncores = multiprocessing.cpu_count()-1
        cluster = LocalCluster(n_workers=ncores)
        client=Client(cluster)

    # Subdirectories of the processing_directory that need to be skipped for the
    # analysis
    blocked_directories = ['_logs']

    # Starting logger
    utils.init_file_logger(processing_directory)
    logger = logging.getLogger()

    # Determine the operating system running the code
    os_windows, add_slash = utils.determine_os()

    # Check trailing slash in the processing directory
    processing_directory=utils.check_trailing_slash(processing_directory,os_windows)

    # Get a list of the hybridization to process
    processing_hyb_list = next(os.walk(processing_directory))[1]

    # Remove the blocked directories from the directories to process
    processing_hyb_list = [el for el in processing_hyb_list if el not in blocked_directories ]

    for processing_hyb in processing_hyb_list:
    
        # Determine the hyb number from the name
        hybridization_number = processing_hyb.split('_hyb')[-1]
        hybridization = 'Hybridization' + hybridization_number
        hyb_dir = processing_directory + processing_hyb + add_slash
        
        # Parse the Experimental metadata file (serial)
        experiment_infos,image_properties, hybridizations_infos, \
        converted_positions, microscope_parameters =\
        utils.experimental_metadata_parser(hyb_dir)
        
        # Parse the configuration file 
        flt_rawcnt_config = utils.filtering_raw_counting_config_parser(hyb_dir)
        
        
        # ----------------- .nd2 FILE CONVERSION ------------------------------

        # Create the temporary subdirectory tree (serial)
        tmp_dir_path, tmp_gene_dirs=utils.create_subdirectory_tree(hyb_dir,\
                    hybridization,hybridizations_infos,processing_hyb,suffix='tmp',add_slash=add_slash)

        # Get the list of the nd2 files to process inside the directory
        files_list = glob.glob(hyb_dir+processing_hyb+'_raw_data'+add_slash+'*.nd2')

        # Get the list of genes that are analyzed in the current hybridization
        gene_list = list(hybridizations_infos[hybridization].keys())

        # Organize the files to process in a list whose order matches the gene_list
        # for parallel processing
        organized_files_list = [f for gene in gene_list for f in files_list if gene+'.nd2' in f  ]
        organized_tmp_dir_list = [f for gene in gene_list for f in tmp_gene_dirs if gene in f  ]

        # Each .nd2 file will be processed by a worker running on a different node
        # Get the addresses of one process/node to use for conversion
        node_addresses = utils.identify_nodes(client)
        workers_conversion = [list(el.items())[0][1] for key,el in node_addresses.items()]

        # Run the conversion
        futures_processes=client.map(io.nd2_to_npy,gene_list,organized_files_list,
                                    tmp_gene_dirs,processing_hyb=processing_hyb,
                                    use_ram=flt_rawcnt_config['use_ram'],
                                    max_ram=flt_rawcnt_config['max_ram'],
                                    workers=workers_conversion)
        client.gather(futures_processes)

        

        # ---------------------------------------------------------------------
        
        
        # ----------------- FILTERING AND RAW COUNTING ------------------------
        
        # Create directories 

        # Create the directory where to save the filtered images
        suffix = 'filtered_png'
        filtered_png_img_dir_path, filtered_png_img_gene_dirs = \
                utils.create_subdirectory_tree(hyb_dir,hybridization,hybridizations_infos,
                            processing_hyb,suffix,add_slash,analysis_name=flt_rawcnt_config['analysis_name'])

        suffix = 'filtered_npy'
        filtered_img_dir_path, filtered_img_gene_dirs = \
                utils.create_subdirectory_tree(hyb_dir,hybridization,hybridizations_infos,
                            processing_hyb,suffix,add_slash,analysis_name=flt_rawcnt_config['analysis_name'])

        # Create the directory where to save the counting
        suffix = 'counting'
        counting_dir_path, counting_gene_dirs = \
            utils.create_subdirectory_tree(hyb_dir,hybridization,hybridizations_infos,processing_hyb,
                            suffix,add_slash,flt_rawcnt_config['skip_tags_counting'],
                            flt_rawcnt_config['skip_genes_counting'],
                            analysis_name=flt_rawcnt_config['analysis_name'])


        if flt_rawcnt_config['illumination_correction']:

            # Create the directory where to save the counting
            suffix = 'illumination_funcs'
            illumination_func_dir_path, illumination_func_gene_dirs = \
                utils.create_subdirectory_tree(hyb_dir,hybridization,hybridizations_infos,processing_hyb,
                                                suffix,add_slash,analysis_name=flt_rawcnt_config['analysis_name'])

            # Loop through channels and calculate illumination
            for gene in hybridizations_infos[hybridization].keys():
                
                flist_img_to_filter=glob.glob(hyb_dir+processing_hyb+'_tmp/'+processing_hyb+'_'+gene+'_tmp/*.npy')

                logger.debug('Create average image for gene %s', gene)

                # Chunking the image list
                num_chunks = sum(list(client.ncores().values()))
                chunked_list = utils.list_chunking(flist_img_to_filter,num_chunks)

                # Scatter the images sublists to process in parallel
                futures = client.scatter(chunked_list)

                # Create dask processing graph
                output = []
                for future in futures:
                    ImgMean = delayed(utils.partial_image_mean)(future)
                    output.append(ImgMean)
                ImgMean_all = delayed(sum)(output)
                ImgMean_all = ImgMean_all/float(len(futures))

                # Compute the graph
                ImgMean = ImgMean_all.compute()

                logger.debug('Create illumination function for gene %s',gene)
                # Create illumination function
                Illumination=filters.gaussian(ImgMean,sigma=(20,300,300))

                # Normalization of the illumination
                Illumination_flat=np.amax(Illumination,axis=0)
                Illumination_norm=Illumination_flat/np.amax(Illumination_flat)

                logger.debug('Save illumination function for gene %s',gene)
                # Save the illumination function
                illumination_path = [ill_path for ill_path in illumination_func_gene_dirs if gene in ill_path][0]
                illumination_fname=illumination_path+gene+'_illumination_func.npy'
                np.save(illumination_fname,Illumination_norm,allow_pickle=False)  

                # Broadcast the illumination function to all the workers and keep
                # the returned future so the tasks reuse the scattered copy
                Illumination_norm_scattered = client.scatter(Illumination_norm, broadcast=True)

                logger.debug('Filtering %s',gene)
                # Filtering and counting
                futures_processes = client.map(counting.filtering_and_counting_ill_correction, flist_img_to_filter,
                                illumination_function=Illumination_norm_scattered,
                                filtered_png_img_gene_dirs=filtered_png_img_gene_dirs,
                                filtered_img_gene_dirs=filtered_img_gene_dirs,
                                counting_gene_dirs=counting_gene_dirs, plane_keep=flt_rawcnt_config['plane_keep'],
                                min_distance=flt_rawcnt_config['min_distance'], stringency=flt_rawcnt_config['stringency'],
                                skip_genes_counting=flt_rawcnt_config['skip_genes_counting'],
                                skip_tags_counting=flt_rawcnt_config['skip_tags_counting'])
                client.gather(futures_processes)
               

        else:
            for gene in hybridizations_infos[hybridization].keys():
                flist_img_to_filter=glob.glob(hyb_dir+processing_hyb+'_tmp/'+processing_hyb+'_'+gene+'_tmp/*.npy')
                # filtering
                logger.debug('Filtering without illumination correction %s',gene)

                futures_processes = client.map(counting.filtering_and_counting, flist_img_to_filter,
                                        filtered_png_img_gene_dirs=filtered_png_img_gene_dirs,
                                        filtered_img_gene_dirs=filtered_img_gene_dirs,
                                        counting_gene_dirs=counting_gene_dirs,
                                        plane_keep=flt_rawcnt_config['plane_keep'],
                                        min_distance=flt_rawcnt_config['min_distance'],
                                        stringency=flt_rawcnt_config['stringency'],
                                        skip_genes_counting=flt_rawcnt_config['skip_genes_counting'],
                                        skip_tags_counting=flt_rawcnt_config['skip_tags_counting'])

                client.gather(futures_processes)
                
        # ---------------------------------------------------------------------
        
        # # ----------------- COMBINE THE FILTERED DATA IN .ppf.hdf5 ------------------------
        # # Combine the filter data in one single .ppf for each hybridization
        # # This step will run in serial mode and will not need to shuffle data
        # #  between cores because everything is on the common file system

        # logger.debug('Create .ppf.hdf5 file')

        # # Create the ppf.hdf5 file that contains the filtered data in uint16
        # preprocessing_file_path = hdf5_utils.hdf5_create_preprocessing_file(hybridizations_infos,processing_hyb,
        #                                 hybridization,flt_rawcnt_config['analysis_name'], hyb_dir,converted_positions,image_properties)

        # logger.debug('Write the .npy filtered files into the .ppf file')
        # # Load and write the .npy tmp images into the hdf5 file

        # # open the hdf5 file
        # with h5py.File(preprocessing_file_path) as f_hdl:
        #     # Loop through each gene
        #     for gene in hybridizations_infos[hybridization].keys():

        #         logger.debug('Writing %s images in .ppf.hdf5',gene)
        #         # list of the files to transfer
        #         filtered_gene_dir = [fdir for fdir in filtered_img_gene_dirs if gene in fdir][0]
        #         filtered_files_list = glob.glob(filtered_gene_dir+'*.npy')

        #         # loop through the list of file
        #         for f_file in filtered_files_list:
        #             pos = f_file.split('/')[-1].split('_')[-1].split('.')[0]
        #             f_hdl[gene]['FilteredData'][pos][:] =np.load(f_file)
        #             f_hdl.flush()
        
        # # ---------------------------------------------------------------------
        
        # # ----------------- STITCHING ------------------------
        # # Load the stitching parameters from the .yaml file

        # # Stitch the image in 2D or 3D (3D need more work/testing)
        # nr_dim = flt_rawcnt_config['nr_dim']

        # # Estimated overlapping between images according to the Nikon software
        # est_overlap = image_properties['Overlapping_percentage']

        # # Number of peaks to use for the alignment
        # nr_peaks = flt_rawcnt_config['nr_peaks']

        # # Determine if the coords need to be flipped

        # y_flip = flt_rawcnt_config['y_flip']

        # # Method to use for blending
        # # can be 'linear' or 'non linear'
        # # The methods that performs the best is the 'non linear'

        # blend = flt_rawcnt_config['blend']

        # # Reference gene for stitching
        # reference_gene = flt_rawcnt_config['reference_gene']

        # pixel_size = image_properties['PixelSize']

        # # Get the list of the filtered files of the reference gene
        # filtered_gene_dir = [gene_dir for gene_dir in filtered_img_gene_dirs if reference_gene in gene_dir][0]
        # filtered_files_list = glob.glob(filtered_gene_dir+'*.npy')

        # # Create pointer of the hdf5 file that will store the stitched reference image
        # # for the current hybridization
        # # Writing
        # tile_file_base_name = flt_rawcnt_config['analysis_name']+'_'+ processing_hyb
        # data_name   = (tile_file_base_name
        #                 + '_' + reference_gene
        #                 + '_stitching_data')

        # stitching_file_name = tile_file_base_name + '.sf.hdf5'
        # stitching_file= h5py.File(hyb_dir+stitching_file_name,'w',libver='latest')  # replace with 'a' as soon as you fix the error


        # # Determine the tiles organization
        # tiles, contig_tuples, nr_pixels, z_count, micData = stitching.get_pairwise_input_npy(image_properties,converted_positions, hybridization,
        #                         est_overlap = est_overlap, y_flip = False, nr_dim = 2)



        # # Align the tiles 
        # futures_processes=client.map(pairwisesingle.align_single_pair_npy,contig_tuples,
        #                             filtered_files_list=filtered_files_list,micData=micData, 
        #                         nr_peaks=nr_peaks)

        # # Gather the futures
        # data = client.gather(futures_processes)


        # # In this case the order of the returned contingency tuples is with
        # # the order of the input contig_tuples

        # # P_all = [el for data_single in data for el in data_single[0]]
        # P_all =[data_single[0] for data_single in data ]
        # P_all = np.array(P_all)
        # P_all = P_all.flat[:]
        # covs_all = [data_single[1] for data_single in data]
        # alignment = {'P': P_all,
        #             'covs': covs_all}


        # # Calculates a shift in global coordinates for each tile (global
        # # alignment) and then applies these shifts to the  corner coordinates
        # # of each tile and returns and saves these shifted corner coordinates.
        # joining = stitching.get_place_tile_input(hyb_dir, tiles, contig_tuples,
        #                                             micData, nr_pixels, z_count,
        #                                             alignment, data_name,
        #                                             nr_dim=nr_dim)

        # # Create the hdf5 file structure
        # stitched_group, linear_blending, blend =  hdf5preparation.create_structures_hdf5_stitched_ref_gene_file_npy(stitching_file, joining, nr_pixels,
        #                                 reference_gene, blend = 'non linear')

        # # Fill the hdf5 containing the stitched image with empty data and
        # # create the blending mask
        # stitched_group['final_image'][:]= np.zeros(joining['final_image_shape'],dtype=np.float64)
        # if blend is not None:
        #     # make mask
        #     stitched_group['blending_mask'][:] = np.zeros(joining['final_image_shape'][-2:],dtype=np.float64)
        #     tilejoining.make_mask(joining, nr_pixels, stitched_group['blending_mask'])

            
        # # Create the subdirectory used to save the blended tiles
        # suffix = 'blended_tiles'
        # blended_tiles_directory = utils.create_single_directory(hyb_dir,reference_gene, hybridization,processing_hyb,suffix,add_slash,
        #                                 analysis_name=flt_rawcnt_config['analysis_name'])

        # # Get the directory with the filtered npy images of the reference_gene to use for stitching
        # stitching_files_dir = [npy_dir for npy_dir in filtered_img_gene_dirs if reference_gene in npy_dir][0]


        # # Create the tmp directory where to save the masks
        # suffix = 'masks'
        # masked_tiles_directory = utils.create_single_directory(hyb_dir,reference_gene, hybridization,processing_hyb,suffix,add_slash,
        #                                 analysis_name=flt_rawcnt_config['analysis_name'])

        # # Create and save the mask files
        # for corn_value,corner_coords in joining['corner_list']:
        #     if not(np.isnan(corner_coords[0])):
        #         cur_mask = stitched_group['blending_mask'][int(corner_coords[0]):int(corner_coords[0]) + int(nr_pixels),
        #                             int(corner_coords[1]):int(corner_coords[1]) + int(nr_pixels)]

        #         fname = masked_tiles_directory + flt_rawcnt_config['analysis_name'] +'_'+processing_hyb+'_'+reference_gene+'_masks_joining_pos_'+str(corn_value)
        #         np.save(fname,cur_mask)


        # # Blend all the tiles and save them in a directory
        # futures_processes = client.map(tilejoining.generate_blended_tile_npy,joining['corner_list'],
        #                             stitching_files_dir = stitching_files_dir,
        #                             blended_tiles_directory = blended_tiles_directory,
        #                             masked_tiles_directory = masked_tiles_directory,
        #                             analysis_name = flt_rawcnt_config['analysis_name'],
        #                             processing_hyb = processing_hyb,reference_gene = reference_gene,
        #                             micData = micData,tiles = tiles,nr_pixels=nr_pixels,
        #                             linear_blending=linear_blending)



        # _ = client.gather(futures_processes)


        # # Write the stitched image
        # tilejoining.make_final_image_npy(joining, stitching_file, blended_tiles_directory, tiles,reference_gene, nr_pixels)

        # # close the hdf5 file
        # stitching_file.close()


        # # Delete the directories with blended tiles and masks
        # shutil.rmtree(blended_tiles_directory)
        # shutil.rmtree(masked_tiles_directory)

        # ----------------- DELETE FILES ------------------------
        # Do not delete the *.npy files here: they can still be used to
        # create the final images with the stitching-related functions

    client.close()
Example #29
import os
import pickle

from distributed import Client
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score


def do(param):
    with open(f'{os.environ["HOME"]}/dataset.pkl', 'rb') as fin:
        dataset = pickle.load(fin)
    Xs, ys, Xst, yst = dataset

    criterion, n_estimators, max_features, max_depth = param
    model = RandomForestClassifier(n_estimators=n_estimators,
                                   criterion=criterion,
                                   max_features=max_features,
                                   max_depth=max_depth)
    model.fit(Xs, ys)
    ysp = model.predict(Xst)
    acc = accuracy_score(yst, ysp)
    print(acc)
    return [acc, list(param)]


params = []
for cri in ['gini', 'entropy']:
    for n_esti in range(5, 15):
        for max_features in range(10, 20):
            for max_depth in range(4, 20):
                params.append((cri, n_esti, max_features, max_depth))
client = Client()  # assumed: the original snippet connects to an existing scheduler not shown here
L = client.map(do, params)

ga = client.gather(L)

import json
json.dump(ga, open('ga.json', 'w'), indent=2)
print(ga)
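
Each `do` call above reloads `dataset.pkl` from the home directory on the worker. When the data fits in memory, scattering it once through the scheduler avoids those repeated reads; the sketch below is a variant under that assumption (the name `do_scattered` and the pre-loaded `dataset` object are illustrative, not part of the original snippet):

def do_scattered(param, dataset):
    # dataset is delivered by the scheduler; no per-task pickle.load needed
    Xs, ys, Xst, yst = dataset
    criterion, n_estimators, max_features, max_depth = param
    model = RandomForestClassifier(n_estimators=n_estimators,
                                   criterion=criterion,
                                   max_features=max_features,
                                   max_depth=max_depth)
    model.fit(Xs, ys)
    return [accuracy_score(yst, model.predict(Xst)), list(param)]

# dataset_future = client.scatter(dataset, broadcast=True)  # ship the data once
# futures = client.map(do_scattered, params, dataset=dataset_future)
# results = client.gather(futures)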
Example #30
                            help='port of the dask scheduler')
    options = arg_parser.parse_args()
    client = Client(f'{options.host}:{options.port:d}')
    if options.implementation == 'python':
        from julia_python import julia_set
    elif options.implementation == 'cython':
        from julia_cython import julia_set
        client.register_worker_callbacks(init_pyx)
    elif options.implementation == 'cython_omp':
        from julia_cython_omp import julia_set
        client.register_worker_callbacks(init_omp_pyx)
    else:
        msg = '{0} version not implemented\n'
        sys.stderr.write(msg.format(options.implementation))
        sys.exit(1)

    domain = init_julia((options.re_min, options.re_max),
                        (options.im_min, options.im_max),
                        (options.n_re, options.n_im))
    domains = np.array_split(domain, options.partitions)
    iterations = np.array_split(
        np.zeros(options.n_re * options.n_im, dtype=np.int32),
        options.partitions)
    start_time = time.time()
    futures = client.map(julia_set, domains, iterations)
    results = client.gather(futures)
    end_time = time.time()
    print('compute time = {0:.6f} s'.format(end_time - start_time))
    np.savetxt('julia.txt',
               np.concatenate(results).reshape(options.n_re, options.n_im))
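
`julia_python.julia_set` is imported but not shown in this fragment. A minimal escape-time sketch matching the call signature implied by `client.map(julia_set, domains, iterations)` could look like the following; the constant `c` and the iteration cap of 255 are assumptions, not values from the original module:

def julia_set(domain, iterations, c=-0.622772 + 0.42193j, max_iter=255):
    # domain: 1-D array of complex sample points; iterations: int32 buffer of the same length
    for i, z in enumerate(domain):
        n = 0
        while abs(z) <= 2.0 and n < max_iter:
            z = z*z + c
            n += 1
        iterations[i] = n
    return iterations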
Example #31
    # -----------
    # monte carlo
    # -----------

    # define output file names
    OUTPUT = init_outputs()
    if CUTOFF < NSMPL:
        init_headers()
    # initialize simulation
    if RESTART:
        STATE = load_samples_restart()
        replica_exchange()
    else:
        if DASK:
            STATE = CLIENT.gather(init_samples())
        else:
            STATE = init_samples()
    # loop through to number of samples that need to be collected
    for STEP in tqdm(range(NSMPL)):
        if VERBOSE and DASK:
            client_info()
        # generate samples
        STATE[:] = gen_samples()
        # generate mc parameters
        if (STEP + 1) > CUTOFF:
            # write data
            write_outputs()
        if DASK:
            # gather results from cluster
            STATE[:] = CLIENT.gather(STATE)
Example #32
###Aux channels###
##################

chunk = 16384
pad = 256

# Find the data
#cache1=find_raw_frames(ifo, st1, st1+dur)
#cache2=find_raw_frames(ifo, st2, st2+dur)

# Connect to Dask scheduler
client = Client(args.address)

for t1, t2 in chunk_segments(segs, chunk, pad):
    print('Getting chunk', t1, t2)

    # Set up the channel list
    params_list = [(chan, ifo, t1, t2) for chan in channels]  # add st1, st2, dur for the PSD comparison tool

    # Run jobs on the cluster and return results
    jobs = client.map(aux_feat_get, params_list)
    result = client.gather(jobs)

    # Write out the results
    #Will sort the results by how much difference in the PSD there is
    #result.sort(key=lambda x: x[1], reverse=True)

    with open('results_of_aux_%u-%u.dat' % (t1, (t2 - t1)), 'wb') as fout:
        pickle.dump(result, fout)
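
`chunk_segments` is not defined in this fragment; judging from its use above it splits the analysis segments into fixed-length chunks while keeping a `pad`-second margin inside each segment. A rough sketch under that assumption (the exact padding convention of the original helper is a guess):

def chunk_segments(segs, chunk, pad):
    # yield (t1, t2) spans of length `chunk`, staying `pad` seconds away from segment edges
    for seg_start, seg_end in segs:
        t1 = seg_start + pad
        while t1 + chunk <= seg_end - pad:
            yield t1, t1 + chunk
            t1 += chunk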
# dask client
from distributed import Client
from os.path import join
from math import ceil

import numpy as np

from thredds_configuration import file_list_url, data_request, data_folder, thredds_servers
from dask_configuration import dask_scheduler_url
from thredds_utils import list_thredds_folder, compute_url_to_thredds_server_map, compute_avg_func

array_list = []
file_list = list_thredds_folder(file_list_url)

# connect to dask
client = Client(dask_scheduler_url)

url_list = []
for f in file_list:
    url_list.append(data_request + "/" + data_folder + "/" + f +
                    "?time1[0],Temperature_surface[0][0:360][0:719]")

# allocate url to threads servers
server_url_mapping = compute_url_to_thredds_server_map(url_list,
                                                       thredds_servers)
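# `compute_url_to_thredds_server_map` presumably spreads the request URLs over the
# available THREDDS servers so each task gets both a server and a URL to hit; a toy
# round-robin sketch of that behaviour (an assumption, not the real helper) would be:
#
#     from itertools import cycle
#     def round_robin_server_map(urls, servers):
#         return [(server, url) for server, url in zip(cycle(servers), urls)]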

# launch the dask computation and collect results
avg_results_status = client.map(compute_avg_func, server_url_mapping)
avg_results = client.gather(avg_results_status)

final_avg = np.mean(avg_results)

print(final_avg)