Example No. 1
def run_indri(args, output, overwrite_threads=False):
    import os
    import time
    from subprocess import Popen, PIPE
    from dask.distributed import Variable, get_client, get_worker
    # get_loadinfo is a helper defined elsewhere in the source module.

    cancel = Variable('cancel', get_client())
    if cancel.get():
        return ('canceled', get_worker().address, 0, get_loadinfo())

    start = time.time()
    if overwrite_threads:
        processes = len(os.sched_getaffinity(0)) - 1
        args = (args[0], '-threads={}'.format(processes), *args[1:])

    with Popen(args, stdout=PIPE, stderr=PIPE) as proc:
        content = []
        for line in proc.stdout:
            content.append(line)
            # Only poll the shared cancel flag every 1000 lines to limit overhead.
            if len(content) % 1000 != 0:
                continue
            if cancel.get():
                proc.kill()
                return ('killed', get_worker().address, time.time() - start,
                        get_loadinfo())

    with open(output, 'wb') as f:
        f.writelines(content)

    return ('completed', get_worker().address, time.time() - start,
            get_loadinfo())
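
For context, a client-side driver for the shared cancel flag might look like the following sketch; the scheduler address and the Indri command line are illustrative, since only the worker-side function appears above.

from dask.distributed import Client, Variable

client = Client('tcp://scheduler:8786')  # hypothetical address

# run_indri polls this flag between batches of 1000 output lines.
cancel = Variable('cancel', client)
cancel.set(False)

future = client.submit(run_indri, ('IndriRunQuery', 'params.xml'), 'run.txt')
# Later, flip the flag to ask running tasks to stop:
# cancel.set(True)
print(future.result())  # (status, worker address, seconds elapsed, load info)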
Example No. 2
class ClusterShareMemory(ShareMemory):
    """Share Memory for dask cluster."""
    def __init__(self, name):
        from dask.distributed import Variable
        self.var = Variable(name, client=ShareMemoryClient().client)

    def put(self, value):
        """Put value into shared data."""
        # Store the value's literal string form so get() can rebuild it.
        self.var.set(str(value))

    def get(self):
        """Get value from shared data."""
        import ast
        # TODO: get() blocks when the variable holds no data yet;
        # the timeout keeps it from waiting forever.
        return ast.literal_eval(self.var.get(timeout=2))

    def delete(self):
        """Delete data according to name."""
        self.var.delete()

    def close(self):
        """Close Share Memory."""
        ShareMemoryClient().close()
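
ShareMemoryClient is a wrapper defined elsewhere in the source project; stripped down to the underlying dask API, the Variable round-trip looks roughly like this sketch (local in-process cluster for illustration only):

import ast
from dask.distributed import Client, Variable

client = Client(processes=False)  # in-process cluster, illustration only

var = Variable('shared-config', client=client)
var.set(str({'lr': 0.01, 'epochs': 10}))     # store the literal string form
print(ast.literal_eval(var.get(timeout=2)))  # {'lr': 0.01, 'epochs': 10}
var.delete()
client.close()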
Example No. 3
def run_indri_cluster(scheduler, indri, params, runs, overwrite):
    client = Client(scheduler)
    available_workers = get_worker_load(client)
    ntasks = len(params)
    for w in available_workers:
        logging.info('{:<27} {:<22}'.format(w[0], format_loadavg(w[1:])))
    logging.info('{} tasks in total'.format(ntasks))
    logging.info('{} workers in total'.format(len(available_workers)))

    # Shared flag polled by run_indri on the workers (see Example No. 1).
    cancel = Variable('cancel', client)
    cancel.set(False)

    def signal_handler(sig, frame):
        cancel.set(True)
        logging.info(
            'CTRL-C received. It may take a while to kill running tasks.')

    signal.signal(signal.SIGINT, signal_handler)

    indri_args = [(str(indri.resolve()), str(p.resolve())) for p in params]
    fp_runs = [str(r.resolve()) for r in runs]
    overwrite = [overwrite] * len(runs)
    schedule_loop(client, ntasks, cancel, runs, indri_args, fp_runs, overwrite)
Example No. 4
def __init__(self, Client, Ssize, rank, arrays, deisa_arrays_dtype):
    self.client = Client
    self.rank = rank
    # Worker addresses published by the scheduler-side setup (see Example No. 5).
    listw = Variable("workers").get()
    if Ssize > len(listw):  # more processes than workers
        self.workers = [listw[rank % len(listw)]]
    else:  # more workers than processes
        k = len(listw) // Ssize
        self.workers = listw[rank * k:rank * k + k]
    self.arrays = arrays
    for ele in self.arrays:
        self.arrays[ele]["dtype"] = str(deisa_arrays_dtype[ele])
        self.arrays[ele]["timedim"] = self.arrays[ele]["timedim"][0]
        # Block coordinates of this rank in each dimension of the global array.
        self.position = [
            self.arrays[ele]["starts"][i] // self.arrays[ele]["subsizes"][i]
            for i in range(len(self.arrays[ele]["sizes"]))
        ]
    if rank == 0:
        # Publish the metadata once; assumes a perfect domain decomposition.
        Queue("Arrays").put(self.arrays)
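
The worker-assignment arithmetic above is easy to check in isolation; a small sketch with a made-up four-worker list:

# Hypothetical worker list, to illustrate the mapping used in __init__.
listw = ['tcp://w0', 'tcp://w1', 'tcp://w2', 'tcp://w3']

def workers_for(rank, Ssize):
    if Ssize > len(listw):   # more processes than workers: wrap around
        return [listw[rank % len(listw)]]
    k = len(listw) // Ssize  # more workers than processes: split evenly
    return listw[rank * k:rank * k + k]

print(workers_for(0, 2))  # ['tcp://w0', 'tcp://w1']
print(workers_for(1, 2))  # ['tcp://w2', 'tcp://w3']
print(workers_for(5, 8))  # ['tcp://w1'] (ranks wrap around)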
Example No. 5
def __init__(self, Sworker, scheduler_info):
    with open(scheduler_info) as f:
        s = json.load(f)
    self.adr = s["address"]
    # msgpack does not serialize large messages, so restrict the serializers.
    self.client = Client(self.adr, serializers=['dask', 'pickle'])
    dask.config.set({
        "distributed.deploy.lost-worker-timeout": 60,
        "distributed.worker.memory.spill": 0.97,
        "distributed.worker.memory.target": 0.95,
        "distributed.worker.memory.terminate": 0.99
    })
    self.workers = [
        comm.get_address_host_port(i, strict=False)
        for i in self.client.scheduler_info()["workers"].keys()
    ]
    # Busy-wait until all Sworker workers have connected.
    while len(self.workers) != Sworker:
        self.workers = [
            comm.get_address_host_port(i, strict=False)
            for i in self.client.scheduler_info()["workers"].keys()
        ]
    # Publish the worker list for client processes (see Example No. 4).
    Variable("workers").set(self.workers)
Example No. 6
def run_test_with_timeout(
    test_config: TestConfig,
    incoming_state: dict,
    hostnames: List[str],
    duration: Optional[int] = 15,
) -> dict:
    """
    Calls run_test with a timeout and signals run_test to end gracefully when the timeout elapses

    Args:
        test_config: Config of test to run
        incoming_state: Initial state to run actions/asserts in
        hostnames: List of runner hostnames
        duration: Timeout in seconds; None or a negative value runs the test without a timeout

    Returns:
        New state after running actions and asserts
    """
    if duration is None or duration < 0:
        return run_test(test_config, incoming_state, hostnames)

    # NOTE: Use a dask cluster scheduler?
    client = get_client()

    # NOTE: may improve way of doing this
    timeout_signal_name = f"keep-going-{uuid.uuid4()}"
    keep_going = Variable(timeout_signal_name)
    keep_going.set(True)

    run_test_task: Future = client.submit(
        run_test,
        test_config=test_config,
        incoming_state=incoming_state,
        hostnames=hostnames,
        timeout_signal_name=timeout_signal_name,
    )

    LOGGER.debug("Test duration config: %d seconds", duration)

    def distributed_timeout():
        # A timeout task left over from a previous test cannot be canceled, so
        # it may keep running and end a later test early. The keep_going
        # Variable gives it a signal to return once its own test has finished.
        end_time = datetime.now() + timedelta(seconds=duration)
        while datetime.now() <= end_time and keep_going.get():
            time.sleep(test_config.get("secondsBetweenCycles", 1))

    timeout_task: Future = client.submit(distributed_timeout)

    # Wait for either test or timeout to finish
    # Return test result if it finishes first
    # End test if timeout finishes first and return state
    start = datetime.now()
    wait([run_test_task, timeout_task], return_when="FIRST_COMPLETED")
    end = datetime.now()

    LOGGER.debug("Test %s took %d seconds", test_config["name"], (end - start).seconds)

    if run_test_task.done():
        keep_going.set(False)
        return run_test_task.result()
    elif timeout_task.done():
        LOGGER.debug(timeout_task)
        LOGGER.info("Test %s timed out", test_config["name"])
        # NOTE: add timed out to summary?
        # Signal the test to end gracefully, then block for its partial state.
        keep_going.set(False)
        return run_test_task.result()
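
The race between a work task and a timeout task generalizes beyond tests; a minimal sketch of the same pattern (the flag name and task bodies are illustrative, not from the source):

import time
import uuid
from dask.distributed import Client, Variable, wait

client = Client(processes=False)  # in-process cluster, illustration only

flag_name = f"keep-going-{uuid.uuid4()}"
keep_going = Variable(flag_name)
keep_going.set(True)

def work():
    # Poll the shared flag so the task can end gracefully.
    while Variable(flag_name).get():
        time.sleep(0.1)
    return "stopped"

def timer(seconds=1):
    time.sleep(seconds)

work_task = client.submit(work)
timer_task = client.submit(timer)

# Whichever finishes first, lower the flag so the other side winds down.
wait([work_task, timer_task], return_when="FIRST_COMPLETED")
keep_going.set(False)
print(work_task.result())  # "stopped"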
Example No. 7
def run_test(
    test_config: TestConfig,
    incoming_state: dict,
    hostnames: List[str],
    timeout_signal_name: Optional[str] = None,
) -> dict:
    """
    Runs actions and asserts in provided test and returns new state with finished actions/asserts

    Args:
        test_config: test configuration to run
        incoming_state: Initial state of test (does not modify)
        hostnames: Addresses of runners to run actions/asserts on
        timeout_signal_name: Optional Dask variable to check if test has timed out so it can end gracefully

    Returns:
        New state after running actions and asserts
    """
    actions = test_config.get("actions", [])
    asserts = test_config.get("asserts", [])

    default_cycles = get_default_cycles(actions, asserts)

    remaining_cycles = test_config.get("cycles", default_cycles)
    completed_cycles = 0
    # NOTE: possibly use infinite default dict
    state = defaultdict(dict, incoming_state)

    # Validate test before running
    action_names = []
    assert_names = []

    for action in actions:
        assert (
            "type" in action
        ), f"Action in test '{test_config['name']}' is missing property 'type'"

        action_name = action.get("name")

        if action_name is None:
            action_name = create_item_name(action["type"], action_names)

        # NOTE: sets action name if not set
        action["name"] = action_name
        action_names.append(action_name)

    for asrt in asserts:
        assert (
            "type" in asrt
        ), f"Assert in test '{test_config['name']}' is missing property 'type'"

        assert_name = asrt.get("name")

        if assert_name is None:
            assert_name = create_item_name(asrt["type"], assert_names)

        # NOTE: sets assert name if not set
        asrt["name"] = assert_name
        assert_names.append(assert_name)

    assert hostnames, "Must have at least one host to run tests"
    assert len(set(action_names)) == len(
        action_names
    ), "Action names if specified must be unique"
    assert len(set(assert_names)) == len(
        assert_names
    ), "Assert names if specified must be unique"

    start_time = datetime.now()

    # Stop when remaining_cycles reaches 0, or when the test had asserts and none remain
    while continue_running(
        asserts, remaining_cycles, state[test_config["name"]].get("asserts", {})
    ):
        # Check if running with a timeout and break if timeout has signaled
        if timeout_signal_name is not None:
            keep_going = Variable(timeout_signal_name, client=get_client())

            if not keep_going.get():
                break

        # NOTE: exceptions thrown in actions/asserts cause rest of test to exit
        action_distribution_strategy = test_config.get(
            "actionDistributionStrategy", "parallel"
        )

        if actions:
            assert action_distribution_strategy in [
                "parallel",
                "series",
            ], f"actionDistributionStrategy must be 'parallel' or 'series', got '{action_distribution_strategy}'"

            if action_distribution_strategy == "series":
                run_actions_func = run_actions_series
            else:
                run_actions_func = run_actions_parallel

            state[test_config["name"]]["actions"] = run_actions_func(
                actions,
                state,
                test_config["name"],
                hostnames,
                test_config.get("secondsBetweenActions", 0),
            )

        assert_distribution_strategy = test_config.get(
            "assertDistributionStrategy", "series"
        )

        if asserts:
            assert assert_distribution_strategy in [
                "parallel",
                "series",
            ], f"assertDistributionStrategy must be 'parallel' or 'series', got '{assert_distribution_strategy}'"

            if assert_distribution_strategy == "parallel":
                run_asserts_func = run_asserts_parallel
            else:
                run_asserts_func = run_asserts_series

            state[test_config["name"]]["asserts"] = run_asserts_func(
                asserts,
                state,
                test_config["name"],
                hostnames,
                test_config.get("secondsBetweenAsserts", 0),
            )

        remaining_cycles -= 1
        completed_cycles += 1

        # Wait between cycles if test is to continue running
        if continue_running(
            asserts, remaining_cycles, state[test_config["name"]].get("asserts", {})
        ):
            time.sleep(test_config.get("secondsBetweenCycles", 1))

    remaining_asserts = get_remaining_asserts(
        asserts, state[test_config["name"]].get("asserts", {})
    )

    state[test_config["name"]]["summary"] = TestSummary(
        description=test_config.get("description"),
        completed_cycles=completed_cycles,
        remaining_asserts=[asrt["name"] for asrt in remaining_asserts],
        error=None,
        duration=(datetime.now() - start_time).seconds,
    )

    return state
Example No. 8
        brake.set(False)
        return None

    ####### start the Sankoff algorithm here #######
    print('starting Sankoff')
    # scale cluster
    # scatter the blank tree and row index for each process
    # remote_tree = client.scatter(tree)

    remote_index = client.scatter(IDindex)

    inq = Queue('inq')
    outq = Queue('outq')
    lock = Lock('x')

    # Variable takes a name; the value is assigned with set().
    stopiter = Variable('stopiter')
    stopiter.set(False)
    brake = Variable('brake')
    brake.set(True)


    saver_started = False
    workers_started = False

    # start workers
    for workers in range(NCORE * ncpu):
        w = client.submit(calculate_small_parsimony, inq=None, outq=None,
                          stopiter=stopiter, treefile=treefile,
                          bootstrap_replicates=bootstrap_replicates,
                          matfile=alnfile + '.h5', row_index=remote_index,
                          iolock=lock, verbose=False)
        fire_and_forget(w)

    s = client.submit(collect_futures, queue=None, stopiter=stopiter,
                      brake=brake, runName=runName, nucleotides_only=False)
    saver_started = True
    fire_and_forget(s)
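
A minimal sketch of a fire-and-forget worker pool driven by a shared stop flag; the queue names mirror the code above, but the worker body and sentinel protocol are illustrative:

from dask.distributed import Client, Queue, Variable, fire_and_forget

client = Client(processes=False)  # in-process cluster, illustration only

inq = Queue('inq')
outq = Queue('outq')
stopiter = Variable('stopiter')
stopiter.set(False)

def worker():
    # Consume items until a None sentinel (or the stop flag) arrives.
    while True:
        item = Queue('inq').get()
        if item is None or Variable('stopiter').get():
            return
        Queue('outq').put(item * 2)

# pure=False so four identical submissions are not deduplicated.
for _ in range(4):
    fire_and_forget(client.submit(worker, pure=False))

inq.put(21)
print(outq.get())   # 42
stopiter.set(True)
for _ in range(4):
    inq.put(None)   # wake blocked workers so they can exit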
Example No. 9
    v = np.zeros(P)
    max_iterations = P * 10

    # In cluster mode, pass the address:port of the scheduler
    client = Client()

    # Read a text file into a Dask bag (analogous to a Spark RDD)
    input_bag = db.read_text(file_s)

    # Bags carry no row index, so build a second bag of indices to zip with.
    # count() is lazy, so compute it before constructing the range.
    l = db.from_sequence(range(input_bag.count().compute()), npartitions=1)

    S = input_to_rowmatrix(input_bag, l, True)

    # Global/broadcast variables, shared with workers as dask Variables
    _U_ = Variable('_U_')
    _UU_ = Variable('_UU_')
    _I_ = Variable('_I_')
    _VI_ = Variable('_VI_')
    
    file_D = os.path.join(args['dictionary'], "{}_D.txt".format(args["prefix"]))
    file_z = os.path.join(args['output'], "{}_z.txt".format(args["prefix"]))

    # Start the loop!
    for m in range(M):
        print('M: ' + str(m))
        seed = np.random.randint(max_iterations + 1, high=4294967295)
        np.random.seed(seed)
        u_old = np.random.random(T)
        num_iterations = 0
        delta = 2 * epsilon
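
The _U_-style Variables act as broadcast slots that workers read by name; a sketch of that pattern (the array contents are illustrative):

import numpy as np
from dask.distributed import Client, Variable

client = Client(processes=False)  # in-process cluster, illustration only

# Broadcast a vector to all workers through a named Variable.
_U_ = Variable('_U_')
_U_.set(np.ones(4).tolist())  # keep the payload simple and serializable

def uses_broadcast(x):
    # Workers look up the current value by name.
    u = np.array(Variable('_U_').get())
    return float(x * u.sum())

print(client.submit(uses_broadcast, 2).result())  # 8.0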