def random_partition(self, percentage=75):
    """
    Randomly split the samples into a training set and a validation set.

    :param percentage: Share of the samples (0-100) that goes to the
                       training set (default: 75, i.e. 75% training
                       samples and 25% validation samples)
    :return: Tuple (training sample object, validation sample object)
    """
    ntrn = int((percentage * self.nsamp) / 100.0)

    # draw the training sites without replacement; every remaining site,
    # in its existing order, becomes a validation site
    trn_sites = np.random.choice(self.index, size=ntrn, replace=False)
    val_sites = self.index[~np.isin(self.index, trn_sites)]

    def _subset(sites):
        # Samples() warns when it is created without a data file; silence
        # that locally so the caller's warning filters are restored after
        with warnings.catch_warnings():
            warnings.simplefilter('ignore')
            part = Samples()

        part.x_name = self.x_name
        part.y_name = self.y_name
        part.x = self.x[sites, :]
        part.y = self.y[sites]
        part.nsamp = part.x.shape[0]
        part.index = np.arange(0, part.nsamp)
        part.nfeat = part.x.shape[1]

        # `initial` is an extra candidate value of the reduction: a minimum
        # has to start from the upper bound and a maximum from the lower
        # bound, otherwise the bound itself is always the result
        if np.issubdtype(self.x.dtype, np.number):
            part.xmin = part.x.min(0, initial=self.max_allow_x)
            part.xmax = part.x.max(0, initial=-self.max_allow_x)
        if np.issubdtype(self.y.dtype, np.number):
            part.ymin = part.y.min(initial=self.max_allow_y)
            part.ymax = part.y.max(initial=-self.max_allow_y)
        return part

    trn_samp = _subset(trn_sites)
    val_samp = _subset(val_sites)
    return trn_samp, val_samp
def select(self, index_list):
    """
    Build a new Samples object from the rows named in an index list.

    :param index_list: Row indices to keep. A list or tuple is
                       de-duplicated and sorted; any other value (e.g. a
                       numpy array) is used for indexing as given
    :return: Samples object holding the selected rows
    :raises ValueError: if an index is negative or not smaller than the
                        number of samples
    """
    if isinstance(index_list, (list, tuple)):
        unique = sorted(set(index_list))
        # an empty list would otherwise become a float array, which numpy
        # refuses to use as an index
        index_list = np.array(unique) if unique else np.array([], dtype=int)

    probe = np.asarray(index_list)
    # nothing to range-check for an empty selection, and a boolean mask
    # carries no index values
    if probe.size > 0 and probe.dtype != np.bool_:
        if (np.max(probe) >= self.nsamp) or (np.min(probe) < 0):
            raise ValueError(
                "Index list out of bounds with {} min and/or {} max".format(
                    str(np.min(probe)), str(np.max(probe))))

    # Samples() warns when it is created without a data file; silence that
    # locally so the caller's warning filters are restored afterwards
    with warnings.catch_warnings():
        warnings.simplefilter('ignore')
        samp = Samples()

    samp.x_name = self.x_name
    samp.y_name = self.y_name
    samp.x = self.x[index_list, :]
    samp.y = self.y[index_list]
    samp.nsamp = samp.x.shape[0]
    samp.nfeat = samp.x.shape[1]
    samp.index = np.arange(0, samp.nsamp)

    # `initial` is an extra candidate value of the reduction: a minimum has
    # to start from the upper bound and a maximum from the lower bound
    if np.issubdtype(samp.x.dtype, np.number):
        samp.xmin = samp.x.min(0, initial=self.max_allow_x)
        samp.xmax = samp.x.max(0, initial=-self.max_allow_x)
    if np.issubdtype(samp.y.dtype, np.number):
        samp.ymin = samp.y.min(initial=self.max_allow_y)
        samp.ymax = samp.y.max(initial=-self.max_allow_y)
    return samp
def random_selection(self, num=10):
    """
    Draw a random subset of the samples without replacement.

    :param num: Number of samples to select. When it is not smaller than
                the number of available samples a message is printed and
                all samples are returned in their existing order
    :return: Samples object
    """
    if num >= self.index.shape[0]:
        print('Number larger than population: {} specified for {} samples'.
              format(str(num), str(len(self.index))))
        chosen = self.index
    else:
        chosen = np.random.choice(self.index, size=num, replace=False)

    # Samples() warns when it is created without a data file; silence that
    # locally so the caller's warning filters are restored afterwards
    with warnings.catch_warnings():
        warnings.simplefilter('ignore')
        ran_samp = Samples()

    ran_samp.x_name = self.x_name
    ran_samp.y_name = self.y_name
    ran_samp.x = self.x[chosen, :]
    ran_samp.y = self.y[chosen]
    ran_samp.nsamp = ran_samp.x.shape[0]
    ran_samp.nfeat = ran_samp.x.shape[1]
    ran_samp.index = np.arange(0, ran_samp.nsamp)

    # `initial` is an extra candidate value of the reduction: a minimum has
    # to start from the upper bound and a maximum from the lower bound
    if np.issubdtype(ran_samp.x.dtype, np.number):
        ran_samp.xmin = ran_samp.x.min(0, initial=self.max_allow_x)
        ran_samp.xmax = ran_samp.x.max(0, initial=-self.max_allow_x)
    if np.issubdtype(ran_samp.y.dtype, np.number):
        ran_samp.ymin = ran_samp.y.min(initial=self.max_allow_y)
        ran_samp.ymax = ran_samp.y.max(initial=-self.max_allow_y)
    return ran_samp
def subsample(self, index_locations):
    """
    Return the rows at the given index locations as a new sample object.

    :param index_locations: list, tuple, numpy array or integer (Python or
                            numpy integer) of index locations
    :returns: Sample object
    :raises TypeError: if index_locations has any other type
    """
    if not isinstance(index_locations,
                      (list, tuple, np.ndarray, int, np.integer)):
        raise TypeError(
            "subsample() method works for list, tuple, numpy array or integer data types only"
        )

    # Samples() warns when it is created without a data file; silence that
    # locally so the caller's warning filters are restored afterwards
    with warnings.catch_warnings():
        warnings.simplefilter('ignore')
        outsamp = Samples()

    outsamp.x_name = self.x_name
    outsamp.y_name = self.y_name

    # a single integer is wrapped so that x stays two-dimensional
    if isinstance(index_locations, (int, np.integer)):
        loc = np.array([index_locations])
    else:
        loc = np.array(index_locations)

    outsamp.x = self.x[loc, :]
    outsamp.y = self.y[loc]
    outsamp.nsamp = outsamp.x.shape[0]
    outsamp.index = np.arange(0, outsamp.nsamp)
    outsamp.nfeat = outsamp.x.shape[1]
    # NOTE(review): xmin/xmax/ymin/ymax are not computed here, unlike in
    # select() and random_selection() - confirm whether that is intended
    return outsamp
def merge(self, samp):
    """
    Append the samples of another sample set to this one, in place.

    Column and label names and their order should be the same in the two
    datasets.

    :param samp: Samples object whose x rows and y values are appended
    :return: None (this object is modified in place)
    """
    self.x = np.vstack((self.x, samp.x))
    self.y = np.hstack((self.y, samp.y))
    self.nsamp = self.x.shape[0]
    self.index = np.arange(0, self.nsamp)
    # NOTE(review): weights are not merged here - confirm that callers do
    # not rely on them after a merge

    # `initial` is an extra candidate value of the reduction: a minimum has
    # to start from the upper bound and a maximum from the lower bound,
    # otherwise the bound itself is always the result
    if np.issubdtype(self.x.dtype, np.number):
        self.xmin = self.x.min(0, initial=self.max_allow_x)
        self.xmax = self.x.max(0, initial=-self.max_allow_x)
    if np.issubdtype(self.y.dtype, np.number):
        self.ymin = self.y.min(initial=self.max_allow_y)
        self.ymax = self.y.max(initial=-self.max_allow_y)
def delete_column(self, column_id=None, column_name=None):
    """
    Remove one or more data columns from the samples object, in place.

    :param column_id: ID (index) of the column, or a list/tuple/array of
                      indices; used in preference to column_name when both
                      are given
    :param column_name: Column label or name; every column carrying this
                        name is removed
    :return: None (this object is modified in place)
    :raises AttributeError: if neither column_id nor column_name is given
    :raises ValueError: if column_name is not among the feature names
    :raises IndexError: if a column index is out of range
    """
    if column_name is None and column_id is None:
        raise AttributeError('No argument for delete operation')

    n_cols = self.x.shape[1]

    if column_id is None:
        names = [] if self.x_name is None else list(self.x_name)
        requested = [i for i, name in enumerate(names)
                     if name == column_name]
        if not requested:
            raise ValueError(
                'Column name not found: {}'.format(str(column_name)))
    else:
        # accepts a single index as well as any sequence of indices
        requested = [int(i) for i in np.ravel(column_id)]

    # normalise negative indices so names and data drop the same columns
    drop = set()
    for i in requested:
        if not -n_cols <= i < n_cols:
            raise IndexError(
                'Column index {} out of range for {} columns'.format(
                    str(i), str(n_cols)))
        drop.add(i % n_cols)
    keep = [i for i in range(n_cols) if i not in drop]

    self.x = np.delete(self.x, sorted(drop), 1)

    # list.remove() deletes by value and returns None, so the name list is
    # rebuilt by position instead
    if self.x_name is not None:
        self.x_name = [name for i, name in enumerate(self.x_name)
                       if i not in drop]

    # keep per-column ranges aligned with the remaining columns
    for attr in ('xmin', 'xmax'):
        current = getattr(self, attr, None)
        if isinstance(current, np.ndarray) and current.shape == (n_cols,):
            setattr(self, attr, current[keep])

    self.columns = np.arange(0, self.x.shape[1])
    self.nvar = self.x.shape[1]
    self.nfeat = self.x.shape[1]
def __init__(self,
             csv_file=None,
             label_colname=None,
             x=None,
             y=None,
             x_name=None,
             y_name=None,
             weights=None,
             weights_colname=None,
             use_band_dict=None,
             max_allow_x=1e13,
             max_allow_y=1e13,
             line_limit=None,
             remove_null=True,
             **kwargs):
    """
    Create a samples object from a csv file or from arrays.

    :param csv_file: csv file that contains the features
                     (training or validation samples)
    :param label_colname: column in csv file that contains the feature
                          label (output value)
    :param x: 2d array containing features (samples) without the label
    :param y: 1d array of feature labels (same order as x)
    :param x_name: 1d array of feature names (bands). Can be used to
                   select which columns to read from csv file.
    :param y_name: name of label
    :param weights: sample weights; when None they can be taken from the
                    csv column named by weights_colname
    :param weights_colname: csv column holding the weights; that column is
                            removed from the features
    :param use_band_dict: list of attribute (band) names
    :param max_allow_x: Maximum allowed values of x
    :param max_allow_y: Maximum allowed value of y
    :param line_limit: passed on to the csv reader
    :param remove_null: if True, csv rows containing a null-like value or
                        NaN are dropped
    :param kwargs: optional 'columns' (column ids) and 'ids' (sample ids)
    :raises ValueError: if label_colname or weights_colname is not a csv
                        column, or weights_colname is given without a
                        csv_file
    """
    null_tokens = (None, '', ' ', 'null', 'NULL', '<null>', '<NULL>')

    def _is_null(value):
        # null-like markers, plus NaN for real numbers
        if value in null_tokens:
            return True
        return bool(isinstance(value, (float, np.floating))
                    and np.isnan(value))

    def _drop_null_rows(rows):
        # keep only the rows in which every value is usable
        if not remove_null:
            return rows
        return [row for row in rows
                if not any(_is_null(value) for value in row.values())]

    def _pick_feature_names(available):
        # a caller-supplied x_name selects columns, kept in file order
        if self.x_name is not None and isinstance(self.x_name,
                                                  (list, tuple)):
            return [name for name in available if name in self.x_name]
        return available

    self.csv_file = csv_file
    self.label_colname = label_colname

    if x is None or isinstance(x, np.ndarray):
        self.x = x
    else:
        self.x = np.array(list(x))
    self.x_name = x_name

    if y is None or isinstance(y, np.ndarray):
        self.y = y
    else:
        self.y = np.array(list(y))
    self.y_name = y_name

    self.weights = weights
    self.weights_colname = weights_colname
    self.use_band_dict = use_band_dict

    self.index = None
    self.nfeat = None

    self.xmin = None
    self.xmax = None
    self.ymin = None
    self.ymax = None

    self.y_hist = None
    self.y_bin_edges = None
    self.x_hist = None
    self.x_bin_edges = None

    self.max_allow_x = max_allow_x
    self.max_allow_y = max_allow_y

    if (label_colname is not None) and (csv_file is not None):
        # csv file with a label column
        temp = Handler(filename=csv_file).read_from_csv(
            return_dicts=True, line_limit=line_limit)
        header = list(temp[0])

        if label_colname not in header:
            raise ValueError("Label name mismatch.\nAvailable names: " +
                             ', '.join(header))

        feat_names = [name for name in header if name != label_colname]
        self.x_name = _pick_feature_names(feat_names)
        clean_list = _drop_null_rows(temp)

        # read only the selected columns so x and x_name stay aligned
        self.x = np.array([[row[name] for name in self.x_name]
                           for row in clean_list])
        self.y = np.array([row[label_colname] for row in clean_list])
        self.y_name = label_colname

        # if band name dictionary is provided
        # NOTE(review): y_name is a single column name at this point, so
        # this iterates over its characters - confirm the intended mapping
        if use_band_dict is not None:
            self.y_name = [use_band_dict[b] for b in self.y_name]

    elif (label_colname is None) and (csv_file is not None):
        # csv file without a label column: features only
        temp = Handler(filename=csv_file).read_from_csv(
            return_dicts=True, line_limit=line_limit)
        clean_list = _drop_null_rows(temp)

        feat_names = list(clean_list[0].keys())
        self.x_name = _pick_feature_names(feat_names)
        self.x = np.array([[row[name] for name in self.x_name]
                           for row in clean_list])

    else:
        warnings.warn(
            "Samples class initiated without data file and/or label",
            category=RuntimeWarning,
            stacklevel=1)

    if self.x is not None and self.y is not None:
        if self.y_name is None:
            self.y_name = 'y'

        # fall back to generated names when none fit the data
        if (self.x_name is None) or \
                (not isinstance(self.x_name, (list, tuple))) or \
                (len(self.x_name) != self.x.shape[1]):
            self.x_name = ['x{}'.format(str(i + 1))
                           for i in range(self.x.shape[1])]

    if weights is None and weights_colname is not None:
        if csv_file is None:
            raise ValueError("No csv_file specified for weights")
        # the weight column has to match a feature name exactly
        if weights_colname not in self.x_name:
            raise ValueError("Weight column name mismatch")
        loc = self.x_name.index(weights_colname)

        # move the weight column out of the features, names included
        self.weights = self.x[:, loc]
        self.x = np.delete(self.x, loc, 1)
        self.x_name = [name for i, name in enumerate(self.x_name)
                       if i != loc]

    # optional keywords: columns containing data and IDs of samples
    columns = kwargs.get('columns')
    if columns is None or isinstance(columns, np.ndarray):
        self.columns = columns
    else:
        self.columns = np.array(list(columns))
    self.ids = kwargs.get('ids')

    if self.x is not None:
        if self.columns is None:
            self.columns = np.arange(0, self.x.shape[1])

        self.nsamp = self.x.shape[0]
        self.nvar = self.x.shape[1]
        self.nfeat = self.x.shape[1]

        # `initial` is an extra candidate value of the reduction: a minimum
        # has to start from the upper bound and a maximum from the lower
        # bound, otherwise the bound itself is always the result
        if np.issubdtype(self.x.dtype, np.number):
            self.xmin = self.x.min(0, initial=max_allow_x)
            self.xmax = self.x.max(0, initial=-max_allow_x)

        self.index = np.arange(0, self.x.shape[0])
    else:
        self.nsamp = 0
        self.nvar = 0

    if self.y is not None:
        if np.issubdtype(self.y.dtype, np.number):
            self.ymin = self.y.min(initial=max_allow_y)
            self.ymax = self.y.max(initial=-max_allow_y)

    if self.x is not None and self.y is not None:
        # header line followed by at most the first ten samples
        n_rows = min(10, self.nsamp, len(self.y))
        lines = [' '.join(
            str(name) for name in list(self.x_name) + [self.y_name])]
        lines.extend(
            ' '.join(str(value)
                     for value in self.x[i, :].tolist() + [self.y[i]])
            for i in range(n_rows))
        self.head = '\n'.join(lines)
    else:
        self.head = '<empty>'