예제 #1
0
class BData(object):
    """
    Class for BdPy/BrainDecoderToolbox2 data (dataSet and metaData)

    The instance of class `BData` contains `dataSet` and `metaData` as instance
    variables.

    Attributes
    ----------
    dataSet : numpy array (dtype=float)
        Dataset array
    metaData : MetaData object
        Meta-data object
    """


    def __init__(self, file_name=None, file_type=None):
        """
        Create a BData instance, optionally populated from a file

        Parameters
        ----------
        file_name : str, optional
            File which contains BData. When None (default), the instance
            keeps an empty `dataSet` and a newly created `MetaData` object.
        file_type : {'Npy', 'Matlab', 'HDF5'}, optional
            File type handed over to `load` (default: None)
        """

        # Start from an empty 0-by-0 float array and a new meta-data object
        self.dataSet = np.empty((0, 0), dtype=float)
        self.metaData = MetaData()

        # Nothing else to do unless a file was specified
        if file_name is None:
            return

        self.load(file_name, file_type)


    ## Public APIs #####################################################

    def add_dataset(self, x, attribute_key):
        """
        Add `x` to dataSet with attribute meta-data key `attribute_key`

        Parameters
        ----------
        x : array
            Data matrix to be added in dataSet
        attribute_key : str
            Key of attribute meta-data, which specifies the columns containing `x`
        """

        if x.ndim == 1:
            x = x[:, np.newaxis]

        colnum_has = self.dataSet.shape[1] # Num of existing columns in 'dataSet'
        colnum_add = x.shape[1]            # Num of columns to be added

        ## Add 'x' to dataset
        if not self.dataSet.size:
            self.dataSet = x
        else:
            # TODO: Add size check of 'x' and 'self.dataSet'
            self.dataSet = np.hstack((self.dataSet, x))

        ## Add new attribute metadata
        attribute_description = 'Attribute: %s = 1' % attribute_key
        attribute_value = [None for _ in xrange(colnum_has)] + [1 for _ in xrange(colnum_add)]
        self.metaData.set(attribute_key, attribute_value, attribute_description,
                          lambda x, y: np.hstack((y[:colnum_has], x[-colnum_add:])))


    def add_metadata(self, key, value, description='', attribute=None):
        """
        Add meta-data with `key`, `description`, and `value` to metaData

        Parameters
        ----------
        key : str
            Meta-data key
        value : array
            Meta-data array
        description : str, optional
            Meta-data description
        attribute : str, optional
            Meta-data key specifying data attribution. When given, `value`
            is written only to the positions where the meta-data stored under
            `attribute` is non-zero; all other positions are set to None.
        """

        # TODO: Add size check of metaData/value

        if attribute is not None:
            # NaN would become True when cast to bool, so NaNs in the
            # attribute value are replaced with zero before the cast.
            # The builtin `bool` is used because the `np.bool` alias is not
            # available in recent NumPy releases.
            attr_ind = np.asarray(np.nan_to_num(self.metaData.get(attribute, 'value')), dtype=bool)

            # Start from all-None and fill in only the attributed positions
            add_value = np.array([None] * self.metaData.get_value_len())
            add_value[attr_ind] = value
        else:
            add_value = value

        self.metaData.set(key, add_value, description)


    def rename_meatadata(self, key_old, key_new):
        """
        Rename a meta-data key

        Parameters
        ----------
        key_old, key_new : str
            Old and new meta-data keys
        """
        self.metaData[key_new] = self.metaData[key_old]
        del self.metaData[key_old]


    def set_metadatadescription(self, key, description):
        """
        Set description of metadata specified by `key`

        Parameters
        ----------
        key : str
            Meta-data key
        description : str
            New meta-data description
        """

        def keep_second(first, second):
            # Updater handed to `metaData.set`: ignores its first argument
            # and returns the second one unchanged.
            # NOTE(review): presumably the second argument is the value
            # already stored under `key`, so only the description changes --
            # confirm against MetaData.set.
            return second

        # `None` is passed as the value; the updater decides what is stored
        self.metaData.set(key, None, description, keep_second)


    def select_dataset(self, condition, return_index=False, verbose=True):
        """
        Extracts features from dataset based on condition

        `condition` is handed to `FeatureSelector`, whose `rpn` attribute is
        iterated token by token and evaluated with a stack. The value left on
        the stack at the end is used to index the columns of `dataSet`.

        Parameters
        ----------
        condition : str
            Expression specifying feature selection
        return_index : bool, optional
            If True, returns index of selected features (default: False)
        verbose : bool, optional
            If True, display verbose messages (default: True). This argument
            is not referenced in the body of this method.

        Returns
        -------
        array
            Columns of `dataSet` selected by `condition`
        array or list, optional
            Index used for selecting the columns (returned only when
            `return_index` is True)

        Note
        ----

        Operators: | (or), & (and), = (equal), @ (conditional)
        """

        expr_rpn = FeatureSelector(condition).rpn

        stack = []    # Operand stack of the RPN evaluation
        buf_sel = []  # Pending 'top' counts (number of elements to be selected)

        for i in expr_rpn:
            if i == '=':
                # Equality: the right operand is popped first, then the left
                r = stack.pop()
                l = stack.pop()

                # Element-wise comparison of the left operand against 'r'
                stack.append(np.array([n == r for n in l], dtype=bool))

            elif i == 'top':
                # Dirty solution

                # Need fix on handling 'None'

                n = int(stack.pop()) # Num of elements to be selected
                v = stack.pop()

                order = self.__get_order(v)

                # Only the order vector goes on the stack; the count is kept
                # in 'buf_sel' until the selection is actually resolved.
                stack.append(order)
                buf_sel.append(n)

            elif i == '|' or i == '&':
                r = stack.pop()
                l = stack.pop()

                # A non-boolean operand is converted to a boolean index with
                # the most recently buffered count before it is combined.
                if r.dtype != 'bool':
                    # 'r' should be an order vector
                    num_sel = buf_sel.pop()
                    r = self.__get_top_elm_from_order(r, num_sel)
                    #r = np.array([ n < num_sel for n in r ], dtype = bool)

                if l.dtype != 'bool':
                    # 'l' should be an order vector
                    num_sel = buf_sel.pop()
                    l = self.__get_top_elm_from_order(l, num_sel)
                    #l = np.array([ n < num_sel for n in l ], dtype = bool)

                if i == '|':
                    result = np.logical_or(l, r)
                elif i == '&':
                    result = np.logical_and(l, r)

                stack.append(result)

            elif i == '@':
                # FIXME
                # In the current version, the right term of '@' is assumed to
                # be a boolean, and the left is to be an order vector.

                r = stack.pop() # Boolean
                l = stack.pop() # Float

                # Entries where 'r' is False are set to infinity in 'l'
                # before the top elements are taken from it.
                l[~r] = np.inf

                selind = self.__get_top_elm_from_order(l, buf_sel.pop())

                stack.append(np.array(selind))

            else:
                # Operand token
                if isinstance(i, str):
                    if i.isdigit():
                        # 'i' should be a criteria value
                        i = float(i)
                    else:
                        # 'i' should be a meta-data key
                        i = np.array(self.get_metadata(i))

                stack.append(i)

        selected_index = stack.pop()

        # If buf_sel still has an element, `selected_index` should be an order vector.
        # Select N elements based on the order vector.
        if buf_sel:
            num_sel = buf_sel.pop()
            selected_index = [n < num_sel for n in selected_index]

        # get whole dataset
        #data = self.get_dataset()

        # slice dataset based on selected column
        #feature = data[:, np.array(selected_index)]

        if return_index:
            return self.dataSet[:, np.array(selected_index)], selected_index
        else:
            return self.dataSet[:, np.array(selected_index)]


    def get_metadata(self, key):
        """
        Get value of meta-data specified by 'key'
        """

        return self.metaData.get(key, 'value')


    def show_metadata(self):
        """
        Show all the key and description in metaData
        """

        for m in self.metaData:
            print "%s: %s" % (m['key'], m['description'])


    def load(self, load_filename, load_type=None):
        """
        Load 'dataSet' and 'metaData' from a given file
        """

        if load_type is None:
            load_type = self.__get_filetype(load_filename)

        if load_type == "Npy":
            self.__load_npy(load_filename)
        elif load_type == "Matlab":
            self.__load_mat(load_filename)
        elif load_type == "HDF5":
            self.__load_h5(load_filename)
        else:
            raise ValueError("Unknown file type: %s" % (load_type))


    def save(self, file_name, file_type=None):
        """
        Save 'dataSet' and 'metaData' to a file
        """

        if file_type is None:
            file_type = self.__get_filetype(file_name)

        if file_type == "Npy":
            np.save(file_name, {"dataSet": self.dataSet,
                                "metaData": self.metaData})
        elif file_type == "Matlab":
            md_key = []
            md_desc = []
            md_value = []

            for m in self.metaData:
                md_key.append(m['key'])
                md_desc.append(m['description'])

                v_org = m['value']
                v_nan = []

                # Convert 'None' to 'np.nan'
                for v in v_org:
                    if v is None:
                        v_nan.append(np.nan)
                    else:
                        v_nan.append(v)

                md_value.append(v_nan)

            # 'key' and 'description' are saved as cell arrays
            sio.savemat(file_name, {"dataSet" : self.dataSet,
                                    "metaData" : {"key" : np.array(md_key, dtype=np.object),
                                                  "description" : np.array(md_desc, dtype=np.object),
                                                  "value" : md_value}})

        elif file_type == "HDF5":
            self.__save_h5(file_name)

        else:
            raise ValueError("Unknown file type: %s" % (file_type))


    ## Public APIs (obsoleted) #########################################

    def get_dataset(self, key=None):
        """
        Get dataSet from BData object

        When `key` is not given, `get_dataset` returns `dataSet`. When `key` is
        given, `get_dataset` returns data specified by `key`
        """

        if key is None:
            return self.dataSet
        else:
            query = '%s = 1' % key
            return self.select_dataset(query, return_index=False, verbose=False)


    ## Feature selection #######################################################

    def edit_metadatadescription(self, metakey, description):
        """
        Add or edit description of metadata based on key

        This method is obsoleted and will be removed in the future release.
        Use `set_metadatadescription` instead.

        Parameters
        ----------
        metakey : str
            Meta-data key
        description : str
            Meta-data description
        """
        # Plain delegation to the replacement method
        self.set_metadatadescription(key=metakey, description=description)


    def select_feature(self, condition, return_index=False, verbose=True):
        """
        Extracts features from dataset based on condition

        All arguments are passed on to `select_dataset` and its result is
        returned unchanged.

        Parameters
        ----------
        condition : str
            Expression specifying feature selection
        return_index : bool, optional
            If True, returns index of selected features (default: False)
        verbose : bool, optional
            If True, display verbose messages (default: True)

        Returns
        -------
        array
            Selected feature data
        list, optional
            Selected index (only when `return_index` is True)

        Note
        ----

        Operators: | (or), & (and), = (equal), @ (conditional)
        """
        selected = self.select_dataset(condition,
                                       return_index=return_index,
                                       verbose=verbose)
        return selected


    ## Private methods #################################################

    def __get_order(self, v, sort_order='descend'):

        # 'np.nan' comes to the last of an acending series, and thus the top of a decending series.
        # To avoid that, convert 'np.nan' to -Inf.
        v[np.isnan(v)] = -np.inf

        sorted_index = np.argsort(v)[::-1] # Decending order
        order = range(len(v))
        for i, x in enumerate(sorted_index):
            order[x] = i

        return np.array(order, dtype=float)


    def __get_top_elm_from_order(self, order, n):
        """Get a boolean index of top 'n' elements from 'order'"""
        sorted_index = np.argsort(order)
        for i, x in enumerate(sorted_index):
            order[x] = i

        index = np.array([r < n for r in order], dtype=bool)

        return index


    def __save_h5(self, file_name):
        """
        Save data in HDF5 format (*.h5)

        `dataSet` is written to '/dataSet'. The keys, descriptions, and
        values of all meta-data entries are written to '/metaData/key',
        '/metaData/description', and '/metaData/value', respectively.

        Parameters
        ----------
        file_name : str
            File to be written (opened in 'w' mode)
        """

        with h5py.File(file_name, 'w') as h5file:
            # dataSet
            h5file.create_dataset('/dataSet', data=self.dataSet)

            # metaData
            md_keys = [m['key'] for m in self.metaData]
            md_desc = [m['description'] for m in self.metaData]
            # The builtin `float` is used as dtype because the `np.float`
            # alias is not available in recent NumPy releases.
            md_vals = np.array([m['value'] for m in self.metaData], dtype=float)

            h5file.create_group('/metaData')
            h5file.create_dataset('/metaData/key', data=md_keys)
            h5file.create_dataset('/metaData/description', data=md_desc)
            h5file.create_dataset('/metaData/value', data=md_vals)


    def __load_npy(self, load_filename):
        """
        Load dataSet and metaData from Npy file
        """

        dat = np.load(load_filename)
        dicdat = dat.item()

        self.dataSet = dicdat["dataSet"]
        self.metaData = dicdat["metaData"]


    def __load_mat(self, load_filename):
        """
        Load dataSet and metaData from Matlab file

        The file is expected to contain the variables 'dataSet' and
        'metaData'; 'metaData' has the fields 'key', 'description', and
        'value'. Each meta-data entry is added with `add_metadata`.

        Parameters
        ----------
        load_filename : str
            Matlab file to be loaded
        """

        dat = sio.loadmat(load_filename)

        def cell_to_str(cell):
            # Each cell holds its string as the first element. An empty
            # string may be read back as an empty array, which has no first
            # element, so it is mapped to '' instead of raising IndexError.
            return str(cell[0]).strip() if np.size(cell) else ''

        md_keys = [cell_to_str(i) for i in np.asarray(dat["metaData"]['key'][0, 0])[0].tolist()]
        md_descs = [cell_to_str(i) for i in np.asarray(dat["metaData"]['description'][0, 0])[0].tolist()]
        md_values = np.asarray(dat["metaData"]['value'][0, 0])

        self.dataSet = np.asarray(dat["dataSet"])

        for k, v, d in zip(md_keys, md_values, md_descs):
            self.add_metadata(k, v, d)


    def __load_h5(self, load_filename):
        """
        Load dataSet and metaData from HDF5 file

        'dataSet' and the 'key', 'description', and 'value' datasets of the
        'metaData' group are read. Each meta-data entry is added with
        `add_metadata`.

        Parameters
        ----------
        load_filename : str
            HDF5 file to be loaded
        """

        # Open read-only and close the file as soon as everything is read;
        # all reads take place inside the `with` block because the datasets
        # are not accessible once the file is closed.
        with h5py.File(load_filename, 'r') as dat:
            md_keys = dat["metaData"]['key'][:].tolist()
            md_descs = dat["metaData"]['description'][:].tolist()
            md_values = dat["metaData"]['value']

            self.dataSet = np.asarray(dat["dataSet"])

            for k, v, d in zip(md_keys, md_values, md_descs):
                self.add_metadata(k, v, d)


    def __get_filetype(self, file_name):
        """
        Return the type of `file_name` based on the file extension
        """
        _, ext = os.path.splitext(file_name)

        if ext == ".npy":
            file_type = "Npy"
        elif ext == ".mat":
            file_type = "Matlab"
        elif ext == ".h5":
            file_type = "HDF5"
        else:
            raise ValueError("Unknown file extension: %s" % (ext))

        return file_type