Example #1
 def __init__(self,
              filename,
              X=None,
              topo_view=None,
              y=None,
              load_all=False,
              **kwargs):
     if 'preprocessor' in kwargs:
         if ('fit_preprocessor' not in kwargs or
                 kwargs['fit_preprocessor'] is False):
             self._preprocessor = kwargs['preprocessor']
             kwargs['preprocessor'] = None
     else:
         self._preprocessor = None
     self.load_all = load_all
     if h5py is None:
         raise RuntimeError("Could not import h5py.")
     self._file = h5py.File(filename)
     if X is not None:
         X = self.get_dataset(X, load_all)
     if topo_view is not None:
         topo_view = self.get_dataset(topo_view, load_all)
     if y is not None:
         y = self.get_dataset(y, load_all)
     DenseDesignMatrix.__init__(self,
                                X=X,
                                topo_view=topo_view,
                                y=y,
                                **kwargs)
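
A minimal usage sketch for the constructor above (the signature appears to match pylearn2's HDF5Dataset in pylearn2/datasets/hdf5.py; the file name and dataset keys below are hypothetical):

from pylearn2.datasets.hdf5 import HDF5Dataset

# 'train.h5' is assumed to contain an HDF5 dataset 'X' (design matrix,
# one row per example) and optionally 'y' (targets).
train = HDF5Dataset(filename='train.h5',
                    X='X',
                    y='y',
                    load_all=False)  # keep data on disk and read lazily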
Example #2
 def __init__(self, which_set, data_path=None, 
              term_range=None, target_type='cluster100'):
     """
     which_set: a string specifying which portion of the dataset
         to load. Valid values are 'train', 'valid' or 'test'
     data_path: a string specifying the directory containing the 
         webcluster data. If None (default), use environment 
         variable WEBCLUSTER_DATA_PATH.
     term_range: a tuple for taking only a slice of the available
         terms. Default is to use all 6275. For example, a range of
         (10, 2000) drops the 10 most frequent terms and the
         6275-2000=4275 least frequent terms, where by frequency we
         mean the number of unique documents a term appears in.
     target_type: the type of targets to use. Valid options are 
         'cluster[10,100,1000]'
     """
     self.__dict__.update(locals())
     del self.self
     
     self.corpus_terms = None
     self.doc_info = None
     
     print "loading WebCluster DDM. which_set =", self.which_set
     
     if self.data_path is None:
         self.data_path \
             = string_utils.preprocess('${WEBCLUSTER_DATA_PATH}')
     
     fname = os.path.join(self.data_path, which_set+'_doc_inputs.npy')
     X = np.load(fname)
     if self.term_range is not None:
         X = X[:,self.term_range[0]:self.term_range[1]]
         X = X/X.sum(1).reshape(X.shape[0],1)
     print X.sum(1).mean()
     
     fname = os.path.join(self.data_path, which_set+'_doc_targets.npy')
     # columns: 0:cluster10s, 1:cluster100s, 2:cluster1000s
     self.cluster_hierarchy = np.load(fname)
     
     y = None
     if self.target_type == 'cluster10':
         y = self.cluster_hierarchy[:,0]
     elif self.target_type == 'cluster100':
         y = self.cluster_hierarchy[:,1]
     elif self.target_type == 'cluster1000':
         y = self.cluster_hierarchy[:,2]
     elif self.target_type is None:
         pass
     else:
         raise NotImplementedError()
     
     DenseDesignMatrix.__init__(self, X=X, y=y)
     
     print "... WebCluster ddm loaded"
Example #3
    def __init__(self,
                 patient_id,
                 which_set,
                 list_features,
                 leave_out_seizure_idx_valid,
                 leave_out_seizure_idx_test,
                 data_dir,
                 preictal_sec,
                 use_all_nonictals,
                 preprocessor_dir,
                 n_selected_features=-1,
                 batch_size=None,
                 balance_class=True,
                 axes=('b', 0, 1, 'c'),
                 default_seed=0):

        self.balance_class = balance_class
        self.batch_size = batch_size

        tmp_list_features = np.empty(len(list_features), dtype=object)
        for f_idx in range(len(list_features)):
            tmp_list_features[f_idx] = FeatureList.get_info(
                list_features[f_idx])
        list_features = tmp_list_features

        print 'List of features:'
        for f in list_features:
            print f['feature'] + '.' + f['param']
        print ''

        EpilepsiaeFeatureLoader.__init__(
            self,
            patient_id=patient_id,
            which_set=which_set,
            list_features=list_features,
            leave_out_seizure_idx_valid=leave_out_seizure_idx_valid,
            leave_out_seizure_idx_test=leave_out_seizure_idx_test,
            data_dir=data_dir,
            preictal_sec=preictal_sec,
            use_all_nonictals=use_all_nonictals)
        # Row: samples, Col: features
        raw_X, y = self.load_data()

        if n_selected_features != -1:
            all_rank_df = None
            for f_idx, feature in enumerate(self.list_features):
                rank_df = pd.read_csv(
                    os.path.join(
                        data_dir,
                        patient_id + '/rank_feature_idx_' + feature['param'] +
                        '_' + 'leaveout_' + str(leave_out_seizure_idx_valid) +
                        '_' + str(leave_out_seizure_idx_test) + '.txt'))
                if f_idx == 0:
                    all_rank_df = rank_df
                else:
                    offset_f_idx = 0
                    for i in range(f_idx):
                        offset_f_idx = offset_f_idx + self.list_features[i][
                            'n_features']
                    rank_df['feature_idx'] = rank_df[
                        'feature_idx'].values + offset_f_idx
                    all_rank_df = pd.concat([all_rank_df, rank_df])

            sorted_feature_df = all_rank_df.sort(['D_ADH'], ascending=[0])
            self.selected_feature_idx = sorted_feature_df[
                'feature_idx'][:n_selected_features]
            raw_X = raw_X[:, self.selected_feature_idx]
        else:
            self.selected_feature_idx = np.arange(raw_X.shape[1])

        # Print shape of input data
        print '------------------------------'
        print 'Dataset: {0}'.format(self.which_set)
        print 'Number of samples: {0}'.format(raw_X.shape[0])
        print ' Preictal samples: {0}'.format(self.preictal_samples)
        print ' Nonictal samples: {0}'.format(self.nonictal_samples)
        print ' NaN samples: {0}'.format(self.nan_non_flat_samples)
        print (" Note for 'train' and 'valid_train': the number of samples "
               "will be equal without removing the NaN samples.")
        print 'Number of features: {0}'.format(raw_X.shape[1])
        print '------------------------------'

        # Preprocessing
        if which_set == 'train':
            scaler = preprocessing.StandardScaler()
            # scaler = preprocessing.MinMaxScaler(feature_range=(-1, 1))
            scaler = scaler.fit(raw_X)

            with open(
                    os.path.join(
                        preprocessor_dir,
                        self.patient_id + '_scaler_feature_' +
                        str(self.leave_out_seizure_idx_valid) + '_' +
                        str(self.leave_out_seizure_idx_test) + '.pkl'),
                    'wb') as f:
                pickle.dump(scaler, f)

            preprocessed_X = scaler.transform(raw_X)
        else:
            with open(
                    os.path.join(
                        preprocessor_dir,
                        self.patient_id + '_scaler_feature_' +
                        str(self.leave_out_seizure_idx_valid) + '_' +
                        str(self.leave_out_seizure_idx_test) + '.pkl'),
                    'rb') as f:
                scaler = pickle.load(f)

            preprocessed_X = scaler.transform(raw_X)

        raw_X = None

        if self.which_set == 'train' or self.which_set == 'valid_train':
            # Shuffle the data
            print ''
            print '*** Shuffle data ***'
            print ''
            permute_idx = np.random.permutation(preprocessed_X.shape[0])
            preprocessed_X = preprocessed_X[permute_idx, :]
            y = y[permute_idx, :]

        if self.balance_class and (self.which_set == 'train'
                                   or self.which_set == 'valid_train'):
            self.X_full = preprocessed_X
            self.y_full = y

            (X, y) = self.get_data()
        else:
            # Zero-padding (if necessary)
            if self.batch_size is not None:
                preprocessed_X, y = self.zero_pad(preprocessed_X, y,
                                                  self.batch_size)

            X = preprocessed_X

        # Initialize DenseDesignMatrix
        DenseDesignMatrix.__init__(self, X=X, y=y, axes=axes)
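
The preprocessing branch above follows a common pattern: fit the scaler on the training split only, persist it with pickle, and reload it for the other splits so that every split is scaled identically. A standalone sketch of that round-trip (file name and data are illustrative):

import pickle

import numpy as np
from sklearn import preprocessing

X_train = np.random.randn(100, 8)
scaler = preprocessing.StandardScaler().fit(X_train)
with open('scaler.pkl', 'wb') as f:   # persist the fitted scaler
    pickle.dump(scaler, f)

with open('scaler.pkl', 'rb') as f:   # later, e.g. for the test split
    scaler = pickle.load(f)
X_test = scaler.transform(np.random.randn(20, 8))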
Example #4
    def __init__(self, patient_id, which_set, preprocessor_path, data_dir, transform, window_size, batch_size,
                 specified_files=None, leave_one_out_file=None, axes=('b', 0, 1, 'c'), default_seed=0):
        """
        The CHBMIT dataset customized for leave-one-file-out cross validation.

        Parameters
        ----------
        patient_id : int
            Patient ID.
        which_set : string
            Name specifying which partition of the dataset to load
            (e.g., 'train', 'valid', or 'test'). If not specified, all
            data will be loaded.
        preprocessor_path : string
            File path used to store the scaler for pre-processing the
            EEG data.
        data_dir : string
            Directory that stores the source EEG data.
        transform : string
            How to transform the data ('multiple_channels' |
            'single_channel').
        window_size : int
            Size of each sample.
        batch_size : int
            Batch size, used for zero-padding to make the number of
            samples divisible by the batch size.
        specified_files : dictionary
            Dictionary specifying which files are used for training,
            validation, and testing.
        leave_one_out_file : int
            Index of the withheld file.
        axes : tuple
            Axes of the DenseDesignMatrix.
        default_seed : int, optional
            Seed for the random number generator.

        For preprocessing, see more in
            https://github.com/lisa-lab/pylearn2/blob/master/pylearn2/datasets/preprocessing.py

        For customizing dataset, see more in
            https://github.com/lisa-lab/pylearn2/blob/master/pylearn2/scripts/icml_2013_wrepl/emotions/emotions_dataset.py

        """

        self.patient_id = patient_id
        self.data_dir = data_dir
        self.preprocessor_path = preprocessor_path
        self.window_size = window_size
        self.n_classes = 2
        self.default_seed = default_seed
        self.transform = transform
        self.specified_files = specified_files
        self.leave_one_out_file = leave_one_out_file
        self.batch_size = batch_size

        raw_X, raw_y = self._load_data(which_set=which_set)

        self.raw_X = raw_X
        self.raw_y = raw_y

        # Filter representative channels
        if self.rep_channel_matlab_idx.get(patient_id) is not None:
            # Map the representative MATLAB indices to Python indices.
            # The raw data read from the .mat file has already had inactive
            # channels removed, so match each representative MATLAB index
            # against the MATLAB indices of the channels that were kept,
            # then convert to the corresponding Python index.
            self.rep_channel_python_idx = np.empty(0, dtype=int)
            for ch in self.rep_channel_matlab_idx[patient_id]:
                if ch in self.used_channel_matlab_idx:
                    ch_python_idx = np.where(ch == self.used_channel_matlab_idx)[0]
                    self.rep_channel_python_idx = np.append(self.rep_channel_python_idx, ch_python_idx)
                else:
                    raise Exception('There is no representative channel ' + str(ch) + ' in the input data.')
            assert np.all(self.used_channel_matlab_idx[self.rep_channel_python_idx] ==
                          self.rep_channel_matlab_idx[patient_id])

            raw_X = raw_X[self.rep_channel_python_idx, :]
            self.n_channels = self.rep_channel_python_idx.size

            print 'Used channel MATLAB index:', self.used_channel_matlab_idx
            print 'Representative channel MATLAB index:', self.rep_channel_matlab_idx[patient_id]
            print 'Representative channel Python index:', self.rep_channel_python_idx

        self.sample_shape = [self.window_size, 1, self.n_channels]
        self.sample_size = np.prod(self.sample_shape)

        # Preprocessing
        if which_set == 'train':
            scaler = preprocessing.StandardScaler()
            scaler = scaler.fit(raw_X.transpose())

            with open(self.preprocessor_path, 'wb') as f:
                pickle.dump(scaler, f)

            scaled_X = scaler.transform(raw_X.transpose()).transpose()
        else:
            with open(self.preprocessor_path, 'rb') as f:
                scaler = pickle.load(f)

            scaled_X = scaler.transform(raw_X.transpose()).transpose()

        # Transform data into format usable by the network
        if self.transform == 'multiple_channels':
            X, y, view_converter = self._transform_multi_channel_data(X=scaled_X, y=raw_y)
        elif self.transform == 'single_channel':
            X, y, view_converter = self._transform_single_channel_data(X=scaled_X, y=raw_y)
        else:
            raise Exception('Invalid transform mode.')

        # Zero-padding if the batch size is not compatible
        extra = (batch_size - X.shape[0]) % batch_size
        assert (X.shape[0] + extra) % batch_size == 0
        if extra > 0:
            X = np.concatenate((X, np.zeros((extra, X.shape[1]),
                                            dtype=float)),
                               axis=0)
            y = np.concatenate((y, np.zeros((extra, y.shape[1]),
                                            dtype=int)),
                               axis=0)
        assert X.shape[0] % batch_size == 0
        assert y.size % batch_size == 0

        # Initialize DenseDesignMatrix
        DenseDesignMatrix.__init__(self, X=X, y=y,
                                   view_converter=view_converter,
                                   axes=('b', 0, 1, 'c'))
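
The zero-padding arithmetic above pads the sample count up to the next multiple of batch_size and is a no-op when the count already divides evenly. In isolation (sizes are illustrative):

import numpy as np

batch_size = 20
X = np.ones((103, 4))
extra = (batch_size - X.shape[0]) % batch_size  # (20 - 103) % 20 == 17
if extra > 0:
    X = np.concatenate((X, np.zeros((extra, X.shape[1]))), axis=0)
assert X.shape[0] % batch_size == 0             # 120 % 20 == 0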
Example #5
    def __init__(self, patient_id, which_set, preprocessor_path, data_dir,
                 leave_one_out_seizure, sample_size_second, batch_size,
                 default_seed=0):
        """
        The Epilepsiae dataset customized for leave-one-seizure-out cross validation.

        Parameters
        ----------
        patient_id : int
            Patient ID.
        which_set : string
            Name specifying which partition of the dataset to load
            (e.g., 'train', 'valid', or 'test'). If not specified, all
            data will be loaded.
        preprocessor_path : string
            File path used to store the scaler for pre-processing the
            EEG data.
        data_dir : string
            Directory that stores the source EEG data.
        leave_one_out_seizure : int
            Index of the withheld seizure.
        sample_size_second : int
            Number of seconds used to specify the sample size.
        batch_size : int
            Batch size, used to drop a few samples so that the number of
            samples is divisible by the batch size.
        default_seed : int, optional
            Seed for the random number generator.

        For preprocessing, see more in
            https://github.com/lisa-lab/pylearn2/blob/master/pylearn2/datasets/preprocessing.py

        For customizing dataset, see more in
            https://github.com/lisa-lab/pylearn2/blob/master/pylearn2/scripts/icml_2013_wrepl/emotions/emotions_dataset.py

        """

        # Load data
        files = ['rec_26402102/26402102_0003.mat',
                 'rec_26402102/26402102_0007.mat',
                 'rec_26402102/26402102_0008.mat',
                 'rec_26402102/26402102_0017.mat']
        scalp_channels = np.asarray([u'FP1', u'FP2', u'F3', u'F4', u'C3',
                                     u'C4', u'P3', u'P4', u'O1', u'O2',
                                     u'F7', u'F8', u'T3', u'T4', u'T5',
                                     u'T6', u'FZ', u'CZ', u'PZ'])
        # Get seizure information
        seizure_info = pd.read_table(
            os.path.join(data_dir, 'RECORDS-WITH-SEIZURES.txt'), sep='\t')
        seizure_info['filename'] = seizure_info['filename'].str.replace(
            '.data', '.mat', case=False)

        self.data_dir = data_dir
        self.files = files
        self.seizure_info = seizure_info
        self.filter_channels = scalp_channels
        self.default_seed = default_seed
        self.leave_one_out_seizure = leave_one_out_seizure
        self.batch_size = batch_size

        X, y, n_channels, sample_size = self.load_data(which_set, sample_size_second, batch_size, preprocessor_path)
        self.n_channels = n_channels
        self.sample_size = sample_size

        view_converter = DefaultViewConverter((1, sample_size, 1))
        view_converter.set_axes(axes=['b', 0, 1, 'c'])

        DenseDesignMatrix.__init__(self, X=X, y=y,
                                   view_converter=view_converter,
                                   axes=['b', 0, 1, 'c'])
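
The view converter tells DenseDesignMatrix how to reshape each flat row of X into a (1, sample_size, 1) "image" with ('b', 0, 1, 'c') axes, so that 1-D EEG windows can feed convolutional layers. A small sketch of that mapping (assuming pylearn2's DefaultViewConverter; sizes are illustrative):

import numpy as np
from pylearn2.datasets.dense_design_matrix import DefaultViewConverter

sample_size = 8
converter = DefaultViewConverter((1, sample_size, 1))
converter.set_axes(axes=['b', 0, 1, 'c'])
X = np.arange(2 * sample_size, dtype='float32').reshape(2, sample_size)
topo = converter.design_mat_to_topo_view(X)
print(topo.shape)  # (2, 1, 8, 1): batch, rows, columns, channels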