Example #1
    def __init__(self,
                 paradigm,
                 datasets=None,
                 random_state=None,
                 n_jobs=1,
                 overwrite=False,
                 error_score='raise',
                 suffix='',
                 hdf5_path=None):
        self.random_state = random_state
        self.n_jobs = n_jobs
        self.error_score = error_score
        self.hdf5_path = hdf5_path

        # check paradigm
        if not isinstance(paradigm, BaseParadigm):
            raise (ValueError("paradigm must be an Paradigm instance"))
        self.paradigm = paradigm

        # if no dataset provided, then we get the list from the paradigm
        if datasets is None:
            datasets = self.paradigm.datasets

        if not isinstance(datasets, list):
            if isinstance(datasets, BaseDataset):
                datasets = [datasets]
            else:
                raise (ValueError("datasets must be a list or a dataset "
                                  "instance"))

        for dataset in datasets:
            if not isinstance(dataset, BaseDataset):
                raise ValueError("datasets must only contain dataset "
                                 "instances")
        rm = []
        for dataset in datasets:
            # fixme, we might want to drop dataset that are not compatible
            valid_for_paradigm = self.paradigm.is_valid(dataset)
            valid_for_eval = self.is_valid(dataset)
            if not valid_for_paradigm:
                log.warning(f"{dataset} not compatible with "
                            "paradigm. Removing this dataset from the list.")
                rm.append(dataset)
            elif not valid_for_eval:
                log.warning(f"{dataset} not compatible with evaluation. "
                            "Removing this dataset from the list.")
                rm.append(dataset)

        for r in rm:
            datasets.remove(r)
        if len(datasets) > 0:
            self.datasets = datasets
        else:
            raise Exception("No datasets left after paradigm "
                            "and evaluation checks")

        self.results = Results(type(self),
                               type(self.paradigm),
                               overwrite=overwrite,
                               suffix=suffix,
                               hdf5_path=self.hdf5_path)
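
A minimal usage sketch follows; the WithinSessionEvaluation, LeftRightImagery and BNCI2014001 names are assumptions about the surrounding moabb package and are not part of the snippet above. The constructor validates the paradigm, drops incompatible datasets and prepares the Results container.

# Hedged usage sketch -- the imported names are assumptions about the moabb
# package layout, not taken from the snippet above.
from moabb.datasets import BNCI2014001
from moabb.evaluations import WithinSessionEvaluation
from moabb.paradigms import LeftRightImagery

paradigm = LeftRightImagery()
evaluation = WithinSessionEvaluation(
    paradigm=paradigm,
    datasets=[BNCI2014001()],  # omit to fall back on paradigm.datasets
    overwrite=False,
    suffix='demo',
)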
Example #2
class Test_Integration(unittest.TestCase):
    def setUp(self):
        self.obj = Results(evaluation_class=DummyEvaluation,
                           paradigm_class=DummyParadigm,
                           suffix='test')

    def tearDown(self):
        path = self.obj.filepath
        if os.path.isfile(path):
            os.remove(path)

    def test_rmanova(self):
        _in = to_result_input(['a', 'b', 'c'], [[d1] * 5, [d1] * 5, [d4] * 5])
        self.obj.add(_in, to_pipeline_dict(['a', 'b', 'c']))
        _in = to_result_input(['a', 'b', 'c'], [[d2] * 5, [d2] * 5, [d3] * 5])
        self.obj.add(_in, to_pipeline_dict(['a', 'b', 'c']))
        df = self.obj.to_dataframe()
        ma.rmANOVA(df)
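
The d1...d4 fixtures are defined elsewhere in the test module; a hypothetical shape, consistent with the result-dict convention documented in BaseEvaluation.evaluate (Example #5), could look like the sketch below. The fake dataset class and every value are illustrative assumptions, not the real fixtures.

# Hypothetical fixture -- the real d1..d4 in the moabb test suite may differ.
class _FakeDataset:
    """Stand-in exposing the attributes the Results container is assumed
    to read (a dataset code and a subject list)."""
    code = 'fake-dataset'
    subject_list = [1]

d1 = {
    'time': 1.0,               # training duration
    'dataset': _FakeDataset(),
    'subject': 1,
    'session': 'session_0',
    'score': 0.9,
    'n_samples': 100,
    'n_channels': 10,
}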
Example #3
    def process(self, pipelines, overwrite=False, suffix=''):
        '''
        Runs tasks on all given datasets.
        '''

        # check pipelines
        if not isinstance(pipelines, dict):
            raise (ValueError("pipelines must be a dict"))

        for _, pipeline in pipelines.items():
            if not isinstance(pipeline, BaseEstimator):
                raise ValueError("pipelines must only contain Pipeline "
                                 "instances")

        results = Results(type(self),
                          type(self.paradigm),
                          overwrite=overwrite,
                          suffix=suffix)

        for dataset in self.datasets:
            log.info('Processing dataset: {}'.format(dataset.code))
            self.preprocess_data(dataset)

            for subject in dataset.subject_list:
                # check if we already have result for this subject/pipeline
                run_pipes = results.not_yet_computed(pipelines, dataset,
                                                     subject)
                if len(run_pipes) > 0:
                    try:
                        res = self.evaluate(dataset, subject, run_pipes)
                        for pipe in res:
                            for r in res[pipe]:
                                message = '{} | '.format(pipe)
                                message += '{} | {} '.format(
                                    r['dataset'].code, r['id'])
                                message += ': Score %.3f' % r['score']
                                log.info(message)
                        results.add(res, pipelines=pipelines)
                    except Exception as e:
                        log.error(e)
                        log.debug(traceback.format_exc())
                        log.warning('Skipping subject {}'.format(subject))
        return results
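
The pipelines argument is a plain dict mapping a name to an sklearn-compatible estimator. A hedged sketch of a common motor-imagery baseline is shown below; CSP and LinearDiscriminantAnalysis are standard library components chosen for illustration, and `evaluation` stands for the instance built in the sketch after Example #1.

# Hedged sketch of a pipelines dict accepted by process(); the chosen
# estimators are illustrative, not prescribed by the snippet above.
from mne.decoding import CSP
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.pipeline import make_pipeline

pipelines = {'CSP+LDA': make_pipeline(CSP(n_components=8),
                                      LinearDiscriminantAnalysis())}
results = evaluation.process(pipelines, overwrite=False, suffix='demo')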
Example #4
    def __init__(self, paradigm, datasets=None, random_state=None, n_jobs=1,
                 overwrite=False, suffix=''):
        self.random_state = random_state
        self.n_jobs = n_jobs

        # check paradigm
        if not isinstance(paradigm, BaseParadigm):
            raise(ValueError("paradigm must be an Paradigm instance"))
        self.paradigm = paradigm

        # if no dataset provided, then we get the list from the paradigm
        if datasets is None:
            datasets = self.paradigm.datasets

        if not isinstance(datasets, list):
            if isinstance(datasets, BaseDataset):
                datasets = [datasets]
            else:
                raise(ValueError("datasets must be a list or a dataset "
                                 "instance"))

        for dataset in datasets:
            if not isinstance(dataset, BaseDataset):
                raise ValueError("datasets must only contain dataset "
                                 "instances")

        for dataset in list(datasets):
            # fixme, we might want to drop dataset that are not compatible
            try:
                self.paradigm.verify(dataset)
                self.verify(dataset)
            except AssertionError:
                log.warning(f"{dataset} not compatible with evaluation or "
                            "paradigm. Removing this dataset from the list.")
                datasets.remove(dataset)

        self.datasets = datasets

        self.results = Results(type(self),
                               type(self.paradigm),
                               overwrite=overwrite,
                               suffix=suffix)
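
The loop above iterates over list(datasets), a copy, because calling remove() on the list being iterated makes the iterator skip the element that follows each removal; the later revisions (Examples #1, #5, #8, #9) solve the same problem by collecting invalid datasets in a separate rm list. A minimal demonstration of the pitfall:

# Removing from a list while iterating over it skips the next element.
items = ['a', 'b', 'c']
for x in items:
    if x == 'a':
        items.remove(x)
# The loop visited only 'a' and 'c'; 'b' was never inspected.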
Example #5
class BaseEvaluation(ABC):
    '''Base class that defines necessary operations for an evaluation.
    Evaluations determine what the train and test sets are and can implement
    additional data preprocessing steps for more complicated algorithms.

    Parameters
    ----------
    paradigm : Paradigm instance
        The paradigm to use.
    datasets : list of Dataset instances
        The list of datasets to run the evaluation on. If None, the list of
        compatible datasets will be retrieved from the paradigm instance.
    random_state: int, RandomState instance or None, default=None
        If not None, can guarantee the same seed for shuffling examples.
    n_jobs: int, default=1
        Number of jobs for fitting of pipeline.
    overwrite: bool, default=False
        If True, overwrite the results.
    suffix: str
        Suffix for the results file.
    '''

    def __init__(self, paradigm, datasets=None, random_state=None, n_jobs=1,
                 overwrite=False, suffix=''):
        self.random_state = random_state
        self.n_jobs = n_jobs

        # check paradigm
        if not isinstance(paradigm, BaseParadigm):
            raise(ValueError("paradigm must be an Paradigm instance"))
        self.paradigm = paradigm

        # if no dataset provided, then we get the list from the paradigm
        if datasets is None:
            datasets = self.paradigm.datasets

        if not isinstance(datasets, list):
            if isinstance(datasets, BaseDataset):
                datasets = [datasets]
            else:
                raise(ValueError("datasets must be a list or a dataset "
                                 "instance"))

        for dataset in datasets:
            if not isinstance(dataset, BaseDataset):
                raise ValueError("datasets must only contain dataset "
                                 "instances")
        rm = []
        for dataset in datasets:
            # fixme, we might want to drop dataset that are not compatible
            valid_for_paradigm = self.paradigm.is_valid(dataset)
            valid_for_eval = self.is_valid(dataset)
            if not valid_for_paradigm:
                log.warning(f"{dataset} not compatible with "
                            "paradigm. Removing this dataset from the list.")
                rm.append(dataset)
            elif not valid_for_eval:
                log.warning(f"{dataset} not compatible with evaluation. "
                            "Removing this dataset from the list.")
                rm.append(dataset)

        for r in rm:
            datasets.remove(r)
        if len(datasets) > 0:
            self.datasets = datasets
        else:
            raise Exception("No datasets left after paradigm "
                            "and evaluation checks")

        self.results = Results(type(self),
                               type(self.paradigm),
                               overwrite=overwrite,
                               suffix=suffix)

    def process(self, pipelines):
        '''Runs all pipelines on all datasets.

        This function will apply all provided pipelines and return a dataframe
        containing the results of the evaluation.

        Parameters
        ----------
        pipelines : dict of pipeline instances
            A dict containing the sklearn pipelines to evaluate.

        Returns
        -------
        results: pd.DataFrame
            A dataframe containing the results.

        '''

        # check pipelines
        if not isinstance(pipelines, dict):
            raise(ValueError("pipelines must be a dict"))

        for _, pipeline in pipelines.items():
            if not isinstance(pipeline, BaseEstimator):
                raise ValueError("pipelines must only contain Pipeline "
                                 "instances")

        for dataset in self.datasets:
            log.info('Processing dataset: {}'.format(dataset.code))
            results = self.evaluate(dataset, pipelines)
            for res in results:
                self.push_result(res, pipelines)

        return self.results.to_dataframe(pipelines=pipelines)

    def push_result(self, res, pipelines):
        message = '{} | '.format(res['pipeline'])
        message += '{} | {} | {}'.format(res['dataset'].code,
                                         res['subject'], res['session'])
        message += ': Score %.3f' % res['score']
        log.info(message)
        self.results.add({res['pipeline']: res}, pipelines=pipelines)

    def get_results(self):
        return self.results.to_dataframe()

    @abstractmethod
    def evaluate(self, dataset, pipelines):
        '''Evaluate results on a single dataset.

        This method returns a generator. Each result item is a dict with
        the following convention::

            res = {'time': duration of the training,
                   'dataset': dataset id,
                   'subject': subject id,
                   'session': session id,
                   'score': score,
                   'n_samples': number of training examples,
                   'n_channels': number of channels,
                   'pipeline': pipeline name}
        '''
        pass

    @abstractmethod
    def is_valid(self, dataset):
        """Verify the dataset is compatible with evaluation.
Example #6
    def setUp(self):
        self.obj = Results(evaluation_class=DummyEvaluation,
                           paradigm_class=DummyParadigm,
                           suffix='test')
Example #7
class Test_Results(unittest.TestCase):
    def setUp(self):
        self.obj = Results(evaluation_class=DummyEvaluation,
                           paradigm_class=DummyParadigm,
                           suffix='test')

    def tearDown(self):
        path = self.obj.filepath
        if os.path.isfile(path):
            os.remove(path)

    def testCanAddSample(self):
        self.obj.add(to_result_input(['a'], [d1]), to_pipeline_dict(['a']))

    def testRecognizesAlreadyComputed(self):
        _in = to_result_input(['a'], [d1])
        self.obj.add(_in, to_pipeline_dict(['a']))
        not_yet_computed = self.obj.not_yet_computed(to_pipeline_dict(['a']),
                                                     d1['dataset'],
                                                     d1['subject'])
        self.assertTrue(len(not_yet_computed) == 0)

    def testCanAddMultiplePipelines(self):
        _in = to_result_input(['a', 'b', 'c'], [d1, d1, d2])
        self.obj.add(_in, to_pipeline_dict(['a', 'b', 'c']))

    def testCanAddMultipleValuesPerPipeline(self):
        _in = to_result_input(['a', 'b'], [[d1, d2], [d2, d1]])
        self.obj.add(_in, to_pipeline_dict(['a', 'b']))
        not_yet_computed = self.obj.not_yet_computed(to_pipeline_dict(['a']),
                                                     d1['dataset'],
                                                     d1['subject'])
        self.assertTrue(len(not_yet_computed) == 0, not_yet_computed)
        not_yet_computed = self.obj.not_yet_computed(to_pipeline_dict(['b']),
                                                     d2['dataset'],
                                                     d2['subject'])
        self.assertTrue(len(not_yet_computed) == 0, not_yet_computed)
        not_yet_computed = self.obj.not_yet_computed(to_pipeline_dict(['b']),
                                                     d1['dataset'],
                                                     d1['subject'])
        self.assertTrue(len(not_yet_computed) == 0, not_yet_computed)

    def testCanExportToDataframe(self):
        _in = to_result_input(['a', 'b', 'c'], [d1, d1, d2])
        self.obj.add(_in, to_pipeline_dict(['a', 'b', 'c']))
        _in = to_result_input(['a', 'b', 'c'], [d2, d2, d3])
        self.obj.add(_in, to_pipeline_dict(['a', 'b', 'c']))
        df = self.obj.to_dataframe()
        self.assertTrue(
            set(np.unique(df['pipeline'])) == set(('a', 'b', 'c')),
            np.unique(df['pipeline']))
        self.assertTrue(df.shape[0] == 6, df.shape[0])
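
to_result_input and to_pipeline_dict are helpers defined elsewhere in the test module; the sketches below are inferred from their call sites and may differ from the real moabb helpers.

# Plausible sketches of the test helpers, inferred from how they are called.
def to_result_input(pipeline_names, results):
    """Map each pipeline name to a result dict or a list of result dicts."""
    return dict(zip(pipeline_names, results))


def to_pipeline_dict(pipeline_names):
    """Map each pipeline name to a placeholder pipeline object."""
    return {name: 'pipeline {}'.format(name) for name in pipeline_names}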
Example #8
class BaseEvaluation(ABC):
    """Base class that defines necessary operations for an evaluation.
    Evaluations determine what the train and test sets are and can implement
    additional data preprocessing steps for more complicated algorithms.

    Parameters
    ----------
    paradigm : Paradigm instance
        The paradigm to use.
    datasets : List of Dataset instances
        The list of datasets to run the evaluation on. If None, the list of
        compatible datasets will be retrieved from the paradigm instance.
    random_state: int, RandomState instance, default=None
        If not None, can guarantee the same seed for shuffling examples.
    n_jobs: int, default=1
        Number of jobs for fitting of pipeline.
    overwrite: bool, default=False
        If true, overwrite the results.
    error_score: "raise" or numeric, default="raise"
        Value to assign to the score if an error occurs in estimator fitting. If set to
        ‘raise’, the error is raised.
    suffix: str
        Suffix for the results file.
    hdf5_path: str
        Specific path for storing the results.
    additional_columns: None
        Additional columns to add to the results.
    return_epochs: bool, default=False
        Use MNE Epochs to train pipelines.
    mne_labels: bool, default=False
        If returning MNE Epochs, use the original dataset labels.
    """

    def __init__(
        self,
        paradigm,
        datasets=None,
        random_state=None,
        n_jobs=1,
        overwrite=False,
        error_score="raise",
        suffix="",
        hdf5_path=None,
        additional_columns=None,
        return_epochs=False,
        mne_labels=False,
    ):
        self.random_state = random_state
        self.n_jobs = n_jobs
        self.error_score = error_score
        self.hdf5_path = hdf5_path
        self.return_epochs = return_epochs
        self.mne_labels = mne_labels

        # check paradigm
        if not isinstance(paradigm, BaseParadigm):
            raise (ValueError("paradigm must be an Paradigm instance"))
        self.paradigm = paradigm

        # check labels
        if self.mne_labels and not self.return_epochs:
            raise (ValueError("mne_labels could only be set with return_epochs"))

        # if no dataset provided, then we get the list from the paradigm
        if datasets is None:
            datasets = self.paradigm.datasets

        if not isinstance(datasets, list):
            if isinstance(datasets, BaseDataset):
                datasets = [datasets]
            else:
                raise (ValueError("datasets must be a list or a dataset " "instance"))

        for dataset in datasets:
            if not isinstance(dataset, BaseDataset):
                raise ValueError("datasets must only contain dataset instances")
        rm = []
        for dataset in datasets:
            # fixme, we might want to drop dataset that are not compatible
            valid_for_paradigm = self.paradigm.is_valid(dataset)
            valid_for_eval = self.is_valid(dataset)
            if not valid_for_paradigm:
                log.warning(
                    f"{dataset} not compatible with "
                    "paradigm. Removing this dataset from the list."
                )
                rm.append(dataset)
            elif not valid_for_eval:
                log.warning(
                    f"{dataset} not compatible with evaluation. "
                    "Removing this dataset from the list."
                )
                rm.append(dataset)

        for r in rm:
            datasets.remove(r)
        if len(datasets) > 0:
            self.datasets = datasets
        else:
            raise Exception(
                "No datasets left after paradigm and evaluation checks"
            )

        self.results = Results(
            type(self),
            type(self.paradigm),
            overwrite=overwrite,
            suffix=suffix,
            hdf5_path=self.hdf5_path,
            additional_columns=additional_columns,
        )

    def process(self, pipelines):
        """Runs all pipelines on all datasets.

        This function will apply all provided pipelines and return a dataframe
        containing the results of the evaluation.

        Parameters
        ----------
        pipelines : dict of pipeline instances
            A dict containing the sklearn pipelines to evaluate.

        Returns
        -------
        results: pd.DataFrame
            A dataframe containing the results.

        """

        # check pipelines
        if not isinstance(pipelines, dict):
            raise (ValueError("pipelines must be a dict"))

        for _, pipeline in pipelines.items():
            if not isinstance(pipeline, BaseEstimator):
                raise ValueError("pipelines must only contain Pipeline instances")

        for dataset in self.datasets:
            log.info("Processing dataset: {}".format(dataset.code))
            results = self.evaluate(dataset, pipelines)
            for res in results:
                self.push_result(res, pipelines)

        return self.results.to_dataframe(pipelines=pipelines)

    def push_result(self, res, pipelines):
        message = "{} | ".format(res["pipeline"])
        message += "{} | {} | {}".format(
            res["dataset"].code, res["subject"], res["session"]
        )
        message += ": Score %.3f" % res["score"]
        log.info(message)
        self.results.add({res["pipeline"]: res}, pipelines=pipelines)

    def get_results(self):
        return self.results.to_dataframe()

    @abstractmethod
    def evaluate(self, dataset, pipelines):
        """Evaluate results on a single dataset.

        This method returns a generator. Each result item is a dict with
        the following convention::

            res = {'time': duration of the training,
                   'dataset': dataset id,
                   'subject': subject id,
                   'session': session id,
                   'score': score,
                   'n_samples': number of training examples,
                   'n_channels': number of channels,
                   'pipeline': pipeline name}
        """
        pass

    @abstractmethod
    def is_valid(self, dataset):
        """Verify the dataset is compatible with evaluation.
Example #9
    def __init__(
        self,
        paradigm,
        datasets=None,
        random_state=None,
        n_jobs=1,
        overwrite=False,
        error_score="raise",
        suffix="",
        hdf5_path=None,
        additional_columns=None,
        return_epochs=False,
        mne_labels=False,
    ):
        self.random_state = random_state
        self.n_jobs = n_jobs
        self.error_score = error_score
        self.hdf5_path = hdf5_path
        self.return_epochs = return_epochs
        self.mne_labels = mne_labels

        # check paradigm
        if not isinstance(paradigm, BaseParadigm):
            raise (ValueError("paradigm must be an Paradigm instance"))
        self.paradigm = paradigm

        # check labels
        if self.mne_labels and not self.return_epochs:
            raise ValueError(
                "mne_labels can only be set with return_epochs")

        # if no dataset provided, then we get the list from the paradigm
        if datasets is None:
            datasets = self.paradigm.datasets

        if not isinstance(datasets, list):
            if isinstance(datasets, BaseDataset):
                datasets = [datasets]
            else:
                raise (ValueError("datasets must be a list or a dataset "
                                  "instance"))

        for dataset in datasets:
            if not isinstance(dataset, BaseDataset):
                raise ValueError("datasets must only contain dataset "
                                 "instances")
        rm = []
        for dataset in datasets:
            valid_for_paradigm = self.paradigm.is_valid(dataset)
            valid_for_eval = self.is_valid(dataset)
            if not valid_for_paradigm:
                log.warning(f"{dataset} not compatible with "
                            "paradigm. Removing this dataset from the list.")
                rm.append(dataset)
            elif not valid_for_eval:
                log.warning(f"{dataset} not compatible with evaluation. "
                            "Removing this dataset from the list.")
                rm.append(dataset)

        for r in rm:
            datasets.remove(r)
        if len(datasets) > 0:
            self.datasets = datasets
        else:
            raise Exception("""No datasets left after paradigm
            and evaluation checks""")

        self.results = Results(
            type(self),
            type(self.paradigm),
            overwrite=overwrite,
            suffix=suffix,
            hdf5_path=self.hdf5_path,
            additional_columns=additional_columns,
        )
Example #10
    def setUp(self):
        self.obj = Results(
            evaluation_class=type(DummyEvaluation(DummyParadigm())),
            paradigm_class=type(DummyParadigm()),
            suffix='test')
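
Passing type(DummyEvaluation(DummyParadigm())) is equivalent to passing DummyEvaluation itself; the instance is built only so the call mirrors Results(type(self), type(self.paradigm), ...) in BaseEvaluation.__init__.

# type() of an instance is its class, so both spellings give Results the
# same evaluation_class and paradigm_class arguments.
assert type(DummyEvaluation(DummyParadigm())) is DummyEvaluation
assert type(DummyParadigm()) is DummyParadigm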