Example #1
    async def test_should_correctly_init_parallel_pipeline_with_hooks_in_dict(
            self, ray_context):

        self.prepare_basic_hooks_test_modules_factory()

        callable_1_counter = Queue()
        callable_2_counter = Queue()

        def callable_1():
            callable_1_counter.put(1)

        def callable_2():
            callable_2_counter.put(2)

        config_file = self.get_config_file(
            'correct_exemplary_config_with_groups.yaml')

        with open(config_file) as config_fh:
            config = config_fh.read()
            pipeline = await ConfigReader.read(
                config,
                ModuleFactory,
                after_created={
                    "g1": [callable_1, callable_2, callable_2],
                    "g2": [callable_1],
                    "g3": []
                })

        assert len(pipeline.modules) == 3
        assert callable_1_counter.qsize() == 3
        assert callable_2_counter.qsize() == 4
Example #2

from ray.util.queue import Queue

# ray_start_regular_shared is a pytest fixture from Ray's own test suite that
# provides a running Ray instance shared across tests.
def test_qsize(ray_start_regular_shared):

    q = Queue()

    items = list(range(10))
    size = 0

    assert q.qsize() == size

    for item in items:
        q.put(item)
        size += 1
        assert q.qsize() == size

    for item in items:
        assert q.get() == item
        size -= 1
        assert q.qsize() == size
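
The test above exercises qsize() from the driver only. Because ray.util.queue.Queue is backed by an actor, the same queue handle can be passed into remote tasks, and qsize() reflects items enqueued in other processes. A minimal sketch of that (the producer task here is illustrative, not part of Ray's test suite):

import ray
from ray.util.queue import Queue

ray.init()

@ray.remote
def producer(queue, items):
    # Runs in a separate worker process; the Queue handle is serializable
    for item in items:
        queue.put(item)

q = Queue()
ray.get(producer.remote(q, list(range(5))))
assert q.qsize() == 5  # Puts from the remote task are visible to the driver

ray.shutdown()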
Example #3
    async def test_should_call_hooks_in_groups(self, ray_context):
        builder = ParallelPipeline()

        callable_1_counter = Queue()
        callable_2_counter = Queue()

        def callable_1():
            callable_1_counter.put(1)

        def callable_2():
            callable_2_counter.put(2)

        builder.add_group(
            builder.Group('g1', after_created=[callable_1, callable_2]))
        builder.add_group(builder.Group('g2', after_created=[callable_2]))
        builder.add_module(ModuleA('m1', group='g1'))
        builder.add_module(
            ModuleB('m2', group='g2').depends_on(builder.get_module('m1')))
        pipeline = await builder.build()
        assert isinstance(pipeline, ParallelPipeline.Runtime)
        assert len(pipeline.groups) == 2
        assert {g.name for g in pipeline.groups} == {'g1', 'g2'}
        assert callable_1_counter.qsize() == 1
        assert callable_2_counter.qsize() == 2
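
Examples #1 and #3 rely on the same counting trick: a plain integer counter would not be shared with whatever process ends up running the hooks, so each hook pushes a token onto an actor-backed Ray queue and the test asserts on qsize(). The pattern in isolation, as a minimal sketch (assuming ray.init() has already run):

from ray.util.queue import Queue

calls = Queue()  # Actor-backed, so it works as a cross-process call counter

def hook():
    calls.put(1)  # Record one invocation

hook()
hook()
assert calls.qsize() == 2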
Example #4

import numpy as np
import pandas as pd

import ray
from ray.util.queue import Queue

# get_kfold_indices, clean_kfold_indices, printv, RayWorker, and RayActor are
# assumed to be defined elsewhere in this module.

class RayHandler:
    def __init__(self,
                 fc_data,
                 behav_data,
                 behav,
                 covars,
                 n_perm=0,
                 **ray_kwargs):
        self.behav_data = behav_data  # For adding kfold_indices

        ray.shutdown()  # Make sure Ray is only initialised once
        self.ray_info = ray.init(**ray_kwargs)

        self.in_queue = Queue()
        self.out_queue = Queue()
        self.status_queue = Queue()
        self.report_queue = Queue()

        self.status_dict = {}
        self.actors_list = []

        # Keep result dictionaries class-wide so results can be added on the fly
        # and remain available later (e.g. if a get_*_results function is called
        # too early)
        self.fselection_results = {}
        self.fselection_results[-1] = {}  # Sub-dict for original (non-permuted) data
        self.prediction_results = {}

        self.data_dict = {}
        self.data_dict['behav'] = behav
        self.data_dict['covars'] = covars
        self.data_dict['n_perm'] = n_perm
        self.data_dict['data'] = fc_data
        self.data_dict['edges'] = self.data_dict['data'].columns.astype(
            str)  # Save edge columns before adding behavioral columns
        # Pingouin needs all the data (edges, behav, and covars) in a single DataFrame
        if covars:
            self.data_dict['data'][covars] = behav_data[covars]
        if n_perm > 0:
            # It seems more efficient to build a separate DataFrame and concat later;
            # .to_frame() converts the Pandas Series into a DataFrame on the fly
            behav_df = behav_data[behav].to_frame()
            for perm in range(n_perm):
                behav_df["{}-perm-{}".format(
                    behav, perm)] = np.random.permutation(behav_df[behav])
                self.fselection_results[perm] = {}  # Sub-dict for this permutation
            # Consolidate into a new DataFrame to avoid fragmentation (and the
            # corresponding performance warning)
            behav_df = behav_df.copy()
            self.data_dict['data'] = pd.concat(
                [self.data_dict['data'], behav_df], axis=1)
        else:
            self.data_dict['data'][behav] = behav_data[behav]
        self.data_dict['data'].columns = self.data_dict['data'].columns.astype(
            str)

    def add_kfold_indices(self, n_folds, clean=True):
        subject_ids = self.data_dict['data'].index
        kfold_indices = get_kfold_indices(subject_ids, n_folds)
        if clean:
            kfold_indices = clean_kfold_indices(kfold_indices, self.behav_data)
        self.data_dict['kfold_indices'] = kfold_indices
        printv("You need to (re-) upload data after this operation.")

    def upload_data(self):
        # Keeping this out of __init__() allows manipulating the data in-class
        # before uploading
        # TODO: Move this and start_workers() back into __init__()? -> No: permutation
        # and after-the-fact data manipulation require uploading afterwards
        self.data_object = ray.put(self.data_dict)

    def start_workers(self, n_workers):
        printv("Starting {} workers".format(n_workers))
        self.workers = [
            RayWorker.remote(self.data_object, self.in_queue, self.out_queue,
                             self.status_queue) for _ in range(n_workers)
        ]

    def start_actors(self):
        qsize = self.in_queue.qsize()
        printv("Starting actors for {} jobs...".format(qsize))
        self.actors = [
            RayActor.remote(self.data_object, self.in_queue, self.out_queue,
                            self.status_queue) for _ in range(qsize)
        ]

    def start_fselection(self, train_subs, fold, perm):  # OUTDATED
        actor = RayActor.remote(self.data_object,
                                self.in_queue,
                                self.out_queue,
                                self.status_queue,
                                auto_start=False)
        # No need to keep the returned ObjectRef: results are sent to out_queue
        actor.edgewise_pcorr.remote(train_subs, fold, perm)
        self.actors_list.append(actor)

    def submit_fselection(self, train_subs, fold, perm=-1):
        # perm=-1 means original data and is the default
        self.in_queue.put(['fselection', train_subs, fold, perm])

    def submit_prediction(self,
                          mask,
                          kfold_indices_train,
                          kfold_indices_test,
                          fold,
                          perm=-1):
        self.in_queue.put([
            'prediction', mask, kfold_indices_train, kfold_indices_test, fold,
            perm
        ])

    def get_results(self, queue, n=100):
        """
      Common get function utilised by get_{prediction,fselection}_results
      Input: queue to get from, max number of items to get at once
      Output: combined results
      """
        N_total = 0
        results = []
        while not queue.empty():
            N = queue.qsize()
            if N_total < N:
                N_total = N
            if N < n:  # Cap the batch at what remains; fetching in batches also
                n = N  # allows for a simple progress display
            printv("Retrieving results: {} of {}".format(
                len(results) + n, N_total),
                   update=True)
            results.extend(queue.get_nowait_batch(n))
        return results

    def get_fselection_results(self):
        results = self.get_results(self.out_queue)
        n = 1
        N = len(results)
        printv("\n")
        for result in results:
            fold = result[0]
            perm = result[1]
            df = result[2]
            printv("Rearranging result {} of {}".format(n, N), update=True)
            self.fselection_results[perm][fold] = df
            n += 1
        # Results are stored in self.fselection_results rather than returned

    def get_prediction_results(self):
        results = self.get_results(self.out_queue)
        for results_dict in results:
            if results_dict['perm'] not in self.prediction_results:
                self.prediction_results[results_dict['perm']] = pd.DataFrame()
                self.prediction_results[
                    results_dict['perm']]['observed'] = self.data_dict['data'][
                        self.data_dict['behav']]
            for tail in ('pos', 'neg', 'glm'):
                self.prediction_results[results_dict['perm']].loc[
                    results_dict['test_IDs'], [tail]] = results_dict[tail]
        return self.prediction_results

    def status(self, verbose=True):
        N = self.status_queue.qsize()
        status_list_list = self.status_queue.get_nowait_batch(N)
        printv("Retrieving {} items from status queue...".format(N))
        for status_list in status_list_list:
            pid = status_list[0]
            node = status_list[1]
            msg = status_list[2]
            self.status_dict[pid] = {"msg": msg, "node": node}
        n = 1
        for pid, info in self.status_dict.items():
            if info['msg']:  # Only print alive actors (msg is not None)
                print("Actor {} [{}@{}]: {}".format(n, pid, info['node'],
                                                    info['msg']))
                n += 1
        print("\n")
        out_size = self.out_queue.qsize()
        in_size = self.in_queue.qsize()
        print("Jobs done: {}".format(out_size))
        print("Jobs remaining in queue: {}".format(in_size))

        return out_size, in_size

    def terminate(self):
        ray.shutdown()
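
For orientation, a hypothetical driver sequence for RayHandler could look as follows; fc_df, behav_df, the column names, and train_splits are illustrative stand-ins, not definitions from this module:

# Hypothetical usage sketch, not part of the module above
handler = RayHandler(fc_df, behav_df, behav='score', covars=['age'], n_perm=0)
handler.add_kfold_indices(n_folds=10)
handler.upload_data()  # Re-upload after any in-class data manipulation

for fold, train_subs in enumerate(train_splits):  # assumed per-fold subject IDs
    handler.submit_fselection(train_subs, fold)

handler.start_actors()  # One actor per queued job
handler.status()  # Jobs done vs. jobs remaining
handler.get_fselection_results()  # Populates handler.fselection_results in place
handler.terminate()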