Exemplo n.º 1
0
    def subsample(self, index_locations):
        """
        Method to get index locations as a sample object

        :param index_locations: list, tuple, numpy array or integer
                                (Python int or numpy integer) of index locations
        :returns: Samples object holding the selected rows of self.x and self.y
        :raises TypeError: if index_locations is of any other type
        """
        if not isinstance(index_locations,
                          (list, tuple, np.ndarray, int, np.integer)):
            raise TypeError(
                "subsample() method works for list, tuple, numpy array or integer data types only"
            )

        # Create the empty container with warnings silenced only inside this
        # scope, so the caller's warning filters are restored afterwards.
        with warnings.catch_warnings():
            warnings.simplefilter('ignore')
            outsamp = Samples()

        outsamp.x_name = self.x_name
        outsamp.y_name = self.y_name

        # A scalar index is wrapped so that the selection below stays 2-D
        if isinstance(index_locations, (int, np.integer)):
            loc = np.array([index_locations])
        else:
            loc = np.array(index_locations)

        outsamp.x = self.x[loc, :]
        outsamp.y = self.y[loc]

        outsamp.nsamp = outsamp.x.shape[0]
        outsamp.index = np.arange(0, outsamp.nsamp)
        outsamp.nfeat = outsamp.x.shape[1]

        return outsamp
Exemplo n.º 2
0
    def euc_dist(vec1, vec2):
        """
        Compute the Euclidean distance between two vectors

        :param vec1: first vector (anything numpy.array accepts)
        :param vec2: second vector (anything numpy.array accepts)
        :return: scalar
        """
        first = np.array(vec1)
        second = np.array(vec2)
        offset = first - second

        return np.linalg.norm(offset)
Exemplo n.º 3
0
    def difference(self, transpose=False):
        """
        Method to calculate difference from scene center

        Subtracts self.center from every row of self.matrix.

        :param transpose: if True, return the transposed difference matrix
        :return: matrix (numpy.ndarray)
        """
        # Broadcasting subtracts the center from each row in a single
        # vectorized operation (and also copes with a matrix that has no rows)
        diff_matrix = np.array(self.matrix) - self.center

        return diff_matrix.T if transpose else diff_matrix
Exemplo n.º 4
0
    def select(self, index_list):
        """
        Method to select samples based on an index list

        :param index_list: list, tuple or numpy array of row indices.
                           Lists and tuples are de-duplicated before use.
        :return: Samples object containing only the selected rows
        :raises ValueError: if any index is negative or not smaller than self.nsamp
        """
        if isinstance(index_list, (list, tuple)):
            # set() drops duplicates; it is built directly from the input
            # because tuples have no .copy() method
            index_list = np.array(list(set(index_list)))
        else:
            index_list = np.asarray(index_list)

        if index_list.size == 0:
            # an empty array defaults to float dtype, which cannot index rows
            index_list = index_list.astype(int)
        elif (np.max(index_list) >= self.nsamp) or (np.min(index_list) < 0):
            # valid row indices are 0 .. nsamp - 1
            raise ValueError(
                "Index list out of bounds with {} min and/or {} max".format(
                    str(np.min(index_list)), str(np.max(index_list))))

        # Create the empty container with warnings silenced only inside this
        # scope, so the caller's warning filters are restored afterwards.
        with warnings.catch_warnings():
            warnings.simplefilter('ignore')
            samp = Samples()

        samp.x_name = self.x_name
        samp.y_name = self.y_name
        samp.x = self.x[index_list, :]
        samp.y = self.y[index_list]
        samp.nsamp = samp.x.shape[0]
        samp.nfeat = samp.x.shape[1]
        samp.index = np.arange(0, samp.nsamp)

        # `initial` takes part in the reduction: for min() it is an upper
        # bound on the result and for max() a lower bound, so the signs below
        # let the data decide the result while still covering empty selections.
        if np.issubdtype(samp.x.dtype, np.number):
            samp.xmin = samp.x.min(0, initial=self.max_allow_x)
            samp.xmax = samp.x.max(0, initial=-self.max_allow_x)
        if np.issubdtype(samp.y.dtype, np.number):
            samp.ymin = samp.y.min(initial=self.max_allow_y)
            samp.ymax = samp.y.max(initial=-self.max_allow_y)

        return samp
Exemplo n.º 5
0
 def mat_dist(vec1, mat1):
     """
     Method to calculate euclidean distance between a vector and all the vectors in a matrix

     :param vec1: vector
     :param mat1: matrix (numpy array of vectors, one vector per row)
     :return: numpy array of scalars, one distance per row of mat1
     """
     # Row-wise L2 norm of the broadcast difference: one vectorized call
     # instead of a Python-level function call per row
     return np.linalg.norm(np.array(mat1) - np.array(vec1), axis=1)
Exemplo n.º 6
0
    def add_column(self,
                   column_name=None,
                   column_data=None,
                   column_order=None):
        """
        Function to add a column to the samples matrix.
        Column_order keyword is used after appending the column name and data to the right of the matrix
        but if column_data is None, self.x is re-ordered according to column_order

        :param column_name: Name of column to be added
                            (defaults to 'Column_<n>' when column_data is given)
        :param column_data: List of column values to be added (one value per sample)
        :param column_order: List of column indices giving the final column order,
                             e.g. [2, 0, 1] moves the newly appended third column to the front.
                             When column_data is given and the order is missing or its length
                             does not match the number of columns, a warning is issued and the
                             columns are left in place.
        :return: None; self.x, self.x_name, self.columns, self.nvar and self.nfeat
                 are updated in place
        :raises RuntimeError: if neither column_data nor column_order is given
        """
        if column_data is None and column_order is None:
            raise RuntimeError('No argument for add operation')

        if column_data is not None:
            # append the new column on the right-hand side of the matrix
            column_data_ = np.array(column_data)
            self.x = np.hstack((self.x, column_data_[:, np.newaxis]))

            if column_name is None:
                column_name = 'Column_{}'.format(str(len(self.x_name) + 1))

            self.x_name.append(column_name)

            if column_order is None or len(column_order) != self.x.shape[1]:
                warnings.warn('Inconsistent or missing order - ignored')
                column_order = list(range(0, self.x.shape[1]))

        self.x = self.x[:, np.array(column_order)]
        self.x_name = list(self.x_name[i] for i in column_order)

        # keep the column bookkeeping in step with the matrix on both paths
        self.columns = list(range(0, self.x.shape[1]))
        self.nvar = len(self.columns)
        self.nfeat = self.x.shape[1]
Exemplo n.º 7
0
 def cluster_center(self, method='median'):
     """
     Method to determine cluster center of the sample matrix

     The result is stored in self.center and also returned.

     :param method: Type of reducer to use. Options: 'mean', 'median', 'percentile_xx' where xx is 1-99
     :return: Cluster center (vector of column/dimension values)
     :raises ValueError: if self.matrix is None or the reducer is not recognised
     """
     if self.matrix is None:
         raise ValueError("Sample matrix not found")

     if method == 'median':
         center = np.median(self.matrix, axis=0)
     elif method == 'mean':
         center = np.mean(self.matrix, axis=0)
     elif 'percentile' in method:
         # 'percentile_xx' -> strip the keyword and the separator character
         perc = int(method.replace('percentile', '')[1:])
         center = np.percentile(self.matrix, perc, axis=0)
     else:
         raise ValueError("Invalid or no reducer")

     # ravel() gives one value per column for a 2-D ndarray as well as for a
     # numpy.matrix (where the reduction is 1 x n); indexing with [0] would
     # keep only the first column's value for a plain ndarray.
     self.center = np.asarray(center).ravel()

     return self.center
Exemplo n.º 8
0
    def sample_matrix(self):
        """
        Method to convert sample dictionaries to sample matrix

        Builds self.matrix with one row per entry of self.samples and one
        column per entry of self.names; every value is passed through
        Handler.string_to_type.

        :return: None; the matrix (rows as individual samples, columns as
                 dimensions) is stored in self.matrix
        :raises ValueError: if self.samples holds fewer than two samples
        """
        # dimensions of the sample matrix
        nsamp = len(self.samples)

        if nsamp <= 1:
            raise ValueError('Not enough samples to make a matrix object')

        # Loop over the same count that was just validated, so the rows
        # always agree with the samples actually present
        self.matrix = np.array([[
            Handler.string_to_type(self.samples[i][name])
            for name in self.names
        ] for i in range(nsamp)])
Exemplo n.º 9
0
    def select_inverse(self, index_list):
        """
        Method to select samples other than those on the index list

        :param index_list: list, tuple or numpy array of row indices to exclude
        :return: Samples object (result of self.select on the remaining indices)
        :raises ValueError: if any index is negative or not smaller than self.nsamp
        """
        if isinstance(index_list, (list, tuple)):
            # set() drops duplicates; it is built directly from the input
            # because tuples have no .copy() method
            index_list = np.array(list(set(index_list)))
        else:
            index_list = np.asarray(index_list)

        if index_list.size == 0:
            # nothing to exclude: every sample is kept
            return self.select(self.index)

        # valid row indices are 0 .. nsamp - 1
        if (np.max(index_list) >= self.nsamp) or (np.min(index_list) < 0):
            raise ValueError(
                "Index list out of bounds with {} min and/or {} max".format(
                    str(np.min(index_list)), str(np.max(index_list))))

        reverse_indices = self.index[~np.isin(self.index, index_list)]
        return self.select(reverse_indices)
Exemplo n.º 10
0
    def covariance(self, inverse=False):
        """
        Compute the covariance matrix of self.matrix,
        treating rows as samples and columns as dimensions

        :param inverse: if True, return the inverse of the covariance matrix instead
        :return: covariance matrix (numpy.ndarray); with inverse=True the inverted
                 matrix, or None when the inversion raises ValueError
        """
        covar = np.cov(self.matrix, rowvar=False)

        if not inverse:
            return np.array(covar)

        # invert through the singular value decomposition
        left, sing_vals, right_t = np.linalg.svd(covar)

        try:
            sing_inv = np.linalg.inv(np.diag(sing_vals))
            return right_t.T.dot(sing_inv).dot(left.T)
        except ValueError:
            return None
Exemplo n.º 11
0
    def select_features(self, name_list=None):
        """
        Method to return a Samples instance using a selection of feature names

        :param name_list: List of feature names to make a new Samples() instance from.
                          If None, all features in self.x_name are kept.
        :returns: Samples instance
        :raises ValueError: if a name is not present in self.x_name
        """
        if name_list is None:
            name_list = self.x_name

        # work on a private copy so the new instance does not share
        # (and later mutate) the caller's list
        name_list = list(name_list)
        indx_list = [self.x_name.index(name) for name in name_list]

        # dtype=int keeps an empty selection usable as a column index
        samp = Samples(label_colname=self.y_name,
                       x=self.x[:, np.array(indx_list, dtype=int)],
                       y=self.y,
                       x_name=name_list,
                       y_name=self.y_name,
                       weights=self.weights,
                       weights_colname=self.weights_colname,
                       use_band_dict=self.use_band_dict,
                       max_allow_x=self.max_allow_x,
                       max_allow_y=self.max_allow_y)

        samp.csv_file = self.csv_file
        return samp
Exemplo n.º 12
0
    def __init__(self,
                 csv_file=None,
                 label_colname=None,
                 x=None,
                 y=None,
                 x_name=None,
                 y_name=None,
                 weights=None,
                 weights_colname=None,
                 use_band_dict=None,
                 max_allow_x=1e13,
                 max_allow_y=1e13,
                 line_limit=None,
                 remove_null=True,
                 **kwargs):
        """
        :param csv_file: csv file that contains the features (training or validation samples)
        :param label_colname: column in csv file that contains the feature label (output value)
        :param x: 2d array containing features (samples) without the label
        :param y: 1d array of feature labels (same order as x)
        :param x_name: 1d array of feature names (bands).
                       Can be used to select which columns to read from csv  file.
        :param y_name: name of label
        :param weights: sample weights; when None and weights_colname is given,
                        the weights are taken from that csv column
        :param weights_colname: name of the csv column holding the weights
                                (the column is removed from x)
        :param use_band_dict: list of attribute (band) names
                              (NOTE(review): the code indexes it by name like a mapping - confirm)
        :param max_allow_x: Maximum allowed values of x
        :param max_allow_y: Maximum allowed value of y
        :param line_limit: passed on to Handler.read_from_csv
        :param remove_null: if True, csv rows containing a null marker or NaN are dropped
        :param kwargs: optional 'columns' (column indices) and 'ids' (sample IDs)
        :raises ValueError: if label_colname or weights_colname is not a csv column,
                            or weights_colname is given without csv_file
        """
        null_markers = (None, '', ' ', 'null', 'NULL', '<null>', '<NULL>')

        def has_null(row):
            # True if any value in the row is a null marker or a float NaN
            return any(
                (elem in null_markers) or
                (isinstance(elem, (float, np.floating)) and np.isnan(elem))
                for elem in row.values())

        def as_array(values):
            # None and numpy arrays pass through, other iterables are converted
            if values is None or isinstance(values, np.ndarray):
                return values
            return np.array(list(values))

        self.csv_file = csv_file
        self.label_colname = label_colname

        self.x = as_array(x)
        self.x_name = x_name

        self.y = as_array(y)
        self.y_name = y_name

        self.weights = weights
        self.weights_colname = weights_colname
        self.use_band_dict = use_band_dict

        self.index = None
        self.nfeat = None

        self.xmin = None
        self.xmax = None
        self.ymin = None
        self.ymax = None

        self.y_hist = None
        self.y_bin_edges = None
        self.x_hist = None
        self.x_bin_edges = None

        self.max_allow_x = max_allow_x
        self.max_allow_y = max_allow_y

        if csv_file is not None:
            rows = Handler(filename=csv_file).read_from_csv(
                return_dicts=True, line_limit=line_limit)

            feat_names = None
            if label_colname is not None:
                header = list(rows[0])

                # label name doesn't match
                if label_colname not in header:
                    raise ValueError("Label name mismatch.\nAvailable names: " +
                                     ', '.join(header))

                feat_names = [name for name in header if name != label_colname]

            if remove_null:
                rows = [row for row in rows if not has_null(row)]

            if feat_names is None:
                feat_names = list(rows[0].keys())

            # an explicit x_name acts as a column filter (csv order is kept)
            if self.x_name is not None and type(self.x_name) in (list, tuple):
                self.x_name = [
                    elem for elem in feat_names if elem in self.x_name
                ]
            else:
                self.x_name = feat_names

            # build x from the selected names so columns and names agree
            self.x = np.array(
                [[row[name] for name in self.x_name] for row in rows])

            if label_colname is not None:
                self.y = np.array([row[label_colname] for row in rows])
                self.y_name = label_colname

                # if band name dictionary is provided
                # NOTE(review): self.y_name is the label string at this point,
                # so this iterates over its characters; the intended mapping
                # target is unclear - confirm before relying on use_band_dict.
                if use_band_dict is not None:
                    self.y_name = [use_band_dict[b] for b in self.y_name]

        else:
            warnings.warn(
                "Samples class initiated without data file and/or label",
                category=RuntimeWarning,
                stacklevel=1)

        if self.x is not None and self.y is not None:
            if self.y_name is None:
                self.y_name = 'y'
            # fall back to generic names when the given ones cannot match x
            if (self.x_name is None) or \
                    (type(self.x_name) not in (list, tuple)) or \
                    (len(self.x_name) != self.x.shape[1]):
                self.x_name = list('x{}'.format(str(i + 1))
                                   for i in range(self.x.shape[1]))

        if weights is None and weights_colname is not None:
            if csv_file is None:
                raise ValueError("No csv_file specified for weights")

            # the weight column must match a feature name exactly
            if weights_colname not in self.x_name:
                raise ValueError("Weight column name mismatch")

            loc = self.x_name.index(weights_colname)
            self.weights = self.x[:, loc]
            self.x = np.delete(self.x, loc, 1)
            # drop the name as well so x_name stays aligned with x
            self.x_name = [
                name for i, name in enumerate(self.x_name) if i != loc
            ]

        # columns containing data
        self.columns = as_array(kwargs.get('columns'))

        # IDs of samples
        self.ids = kwargs.get('ids')

        if self.x is not None:

            if self.columns is None:
                self.columns = np.arange(0, self.x.shape[1])

            self.nsamp = self.x.shape[0]
            self.nvar = self.x.shape[1]

            self.nfeat = self.x.shape[1]

            # `initial` takes part in the reduction: for min() it is an upper
            # bound on the result and for max() a lower bound, so these signs
            # let the data decide the result while still covering empty arrays.
            if np.issubdtype(self.x.dtype, np.number):
                self.xmin = self.x.min(0, initial=max_allow_x)
                self.xmax = self.x.max(0, initial=-max_allow_x)

            self.index = np.arange(0, self.x.shape[0])

        else:
            self.nsamp = 0
            self.nvar = 0

        if self.y is not None and np.issubdtype(self.y.dtype, np.number):
            self.ymin = self.y.min(initial=max_allow_y)
            self.ymax = self.y.max(initial=-max_allow_y)

        if self.x is not None and self.y is not None:
            # preview: header line followed by at most the first ten samples
            n_head = min(10, self.nsamp, len(self.y))
            head_lines = [
                ' '.join(str(name)
                         for name in list(self.x_name) + [self.y_name])
            ]
            head_lines.extend(
                ' '.join(
                    str(elem_)
                    for elem_ in self.x[i, :].tolist() + [self.y[i]])
                for i in range(n_head))
            self.head = '\n'.join(head_lines)
        else:
            self.head = '<empty>'