def add_attribute(self, data, name, metadata=2):
    """Store *data* on the dataset as attribute *name* with type *metadata*.

    If the attribute already exists its values are overwritten. *metadata*
    should be an AttributeType value (PRIMARY=1, COMPUTED=2; the default 2
    means COMPUTED). Increments and returns the attribute's version number.
    """
    # Plain sequences are coerced into a numpy array; masked arrays pass through.
    if not (isinstance(data, ndarray) or is_masked_array(data)):
        data = array(data)
    qualified_name = self.create_and_check_qualified_variable_name(name)
    alias = qualified_name.get_alias()
    if alias in self.get_attribute_names():
        # Re-use the existing box: mark it resident and retag its type.
        self.attribute_boxes[alias].set_is_in_memory(True)
        self.attribute_boxes[alias].set_type(metadata)
    else:
        self.attribute_boxes[alias] = AttributeBox(self, data=[],
                                                   variable_name=qualified_name,
                                                   type=metadata)
    if metadata == AttributeType.PRIMARY:
        self._add_to_primary_attribute_names(alias)
    self.df[alias] = data
    self.__increment_version(alias)
    return self.get_version(alias)
def aggregate_dataset_over_ids(self, dataset, function='sum', attribute_name=None, constant=None):
    """Aggregate attribute (given by 'attribute_name') of the given 'dataset'
    over self by applying the given function.

    The dataset is expected to have an attribute of the same name as the
    unique identifier of self. If attribute_name is not given, the argument
    'constant' must be given, which is either a scalar or a numpy array. If it
    is a scalar, for each individual to be counted the constant value is taken
    into the function; if it is a numpy array of the same size as dataset, the
    value at the individual's index is counted into the function.

    Returns the pandas Series produced by the grouped aggregation.
    """
    workdf = dataset.df
    if attribute_name is None:
        # BUG FIX: original tested 'constant == None', which raises
        # "truth value ambiguous" when constant is an ndarray; use 'is None'.
        if constant is None:
            self._raise_error(StandardError, "Either 'attribute_name' or 'constant' must be given.")
        elif isinstance(constant, ndarray):
            # BUG FIX: original compared against the undefined name
            # 'dataset_id_values' (NameError); per the docstring the constant
            # array must match the dataset's size.
            if constant.size != dataset.size():
                self._raise_error(StandardError,
                                  "constant's size (%d) must be of the same as dataset's size (%d)"
                                  % (constant.size, dataset.size()))
            values = constant
        else:
            # Scalar constant: broadcast it to one value per dataset member.
            values = resize(array([constant]), dataset.size())
        attribute_name = '__constant__'
        workdf[attribute_name] = values
    else:
        if is_masked_array(dataset[attribute_name]):
            # Masked elements should not be considered in the computation;
            # they are NaN-filled before grouping. (Removed a dead
            # 'where(ma.getmask(...))' computation whose result was unused.)
            workdf[attribute_name] = ma.filled(workdf[attribute_name], NaN)
    grouped = workdf.groupby(self.get_id_name())[attribute_name]
    f = getattr(np, function)  # e.g. np.sum for function='sum'
    res = grouped.aggregate(f)
    return res
def prob2dsample(source_array, sample_size, prob_array=None, exclude_index=None, replace=False, return_index=False):
    """Generate non-repeating random 2d samples from source_array of sample_size,
    not including indices appearing in exclude_index. Samples column by column,
    which is more efficient when sample_size has more rows than columns.

    source_array  - the source array to sample from
    sample_size   - (rows, columns); entries are unique within each row
    prob_array    - the array used to weight the sample
    exclude_index - indices that must not appear in the result, used to
                    exclude the current choice from sampling
    replace       - sample with replacement when True
    return_index  - return indices into source_array instead of its elements

    Returns an array of shape sample_size.
    """
    rows, columns = sample_size
    source_array_size = source_array.size

    # Degenerate case: without replacement a row cannot hold more unique items
    # than the source provides, so every row gets all source elements.
    # FIX (consistent with the sibling version of this function): the branch
    # must not fire when replace=True, and must honor return_index.
    if source_array_size <= columns and not replace:
        logger.log_warning("There are less or equal indices (%s) in source_array than the sample_size (%s). Sample %s." % (source_array_size, columns, source_array_size))
        if return_index:
            return ones((rows, 1), dtype='i') * arange(source_array_size)[newaxis, :]
        return ones((rows, 1), dtype='i') * source_array[newaxis, :]

    if prob_array is None:
        prob_array = ones(source_array_size)
    if not (isinstance(prob_array, ndarray) or is_masked_array(prob_array)):
        raise TypeError("prob_array must be of type ndarray")

    p_array = prob_array
    p_array_sum = p_array.sum(dtype="float64")
    if not ma.allclose(p_array_sum, 1.0):
        if abs(1.0 - p_array_sum) > 0.01:
            # print this message only if it is a serious difference
            logger.log_warning("prob_array doesn't sum up to 1, and is normalized. \nSum: %s" % p_array_sum)
        p_array = p_array / p_array_sum
    # (Removed an unused 'cum_prob = ncumsum(p_array)' dead computation.)

    sampled_choiceset_index = zeros(sample_size, dtype="int32") - 1  # -1 marks unfilled slots

    if not replace:
        if exclude_index is not None:
            if not isinstance(exclude_index, ndarray):
                try:
                    exclude_index = asarray(exclude_index)
                except Exception:
                    raise TypeError("exclude_index must be of type ndarray")
            if exclude_index.shape[0] != rows:
                raise ValueError("exclude_index should have the same number of rows as sample_size[0]")
            if rank(exclude_index) == 1:
                exclude_index = exclude_index[:, newaxis]
        else:
            exclude_index = zeros(shape=(sample_size[0], 1), dtype="int32")
        for j in range(columns):
            slots_to_be_sampled = arange(rows)
            while True:
                # Propose one candidate per still-unfilled slot, then reject
                # any that duplicate an excluded or previously drawn index.
                proposed_index = probsample_replace(arange(source_array_size), slots_to_be_sampled.size, p_array)
                try:
                    exclude_array = exclude_index[slots_to_be_sampled,]
                except Exception:
                    exclude_array = None
                duplicate_indicator = find_duplicates_others(proposed_index, exclude_array)
                valid_index = slots_to_be_sampled[duplicate_indicator == 0]
                sampled_choiceset_index[valid_index, j] = proposed_index[duplicate_indicator == 0]
                if nonzerocounts(duplicate_indicator) == 0:
                    break
                slots_to_be_sampled = slots_to_be_sampled[duplicate_indicator > 0]
            # Accepted column j joins the exclusion set for later columns.
            exclude_index = concatenate((exclude_index, take(sampled_choiceset_index, (j,), axis=1)), axis=1)
    else:
        for j in range(columns):
            sampled_choiceset_index[:, j] = probsample_replace(arange(source_array_size), rows, p_array)

    if return_index:
        return sampled_choiceset_index
    return source_array[sampled_choiceset_index]
def prob2dsample(source_array, sample_size, prob_array=None, exclude_index=None, replace=False, return_index=False):
    """Generate non-repeating random 2d samples from source_array of sample_size,
    not including indices appearing in exclude_index. Samples column by column,
    which is more efficient when sample_size has more rows than columns.

    source_array  - the source array to sample from
    sample_size   - (rows, columns); entries are unique within each row
    prob_array    - the array used to weight the sample
    exclude_index - indices that must not appear in the result, used to
                    exclude the current choice from sampling
    replace       - sample with replacement when True
    return_index  - return indices into source_array instead of its elements

    Returns an array of shape sample_size.

    Fixes over the previous revision: dropped dead 'cum_prob' computation,
    narrowed bare 'except:' clauses (they swallowed SystemExit /
    KeyboardInterrupt), replaced py2-only 'raise X, msg' / '<>' with the
    equivalent forms valid in both Python 2 and 3.
    """
    rows, columns = sample_size
    source_array_size = source_array.size

    # Degenerate case: without replacement a row cannot hold more unique
    # items than the source provides, so every row gets all source elements.
    if source_array_size <= columns and not replace:
        logger.log_warning("There are less or equal indices (%s) in source_array than the sample_size (%s). Sample %s." % (source_array_size, columns, source_array_size))
        if return_index:
            return ones((rows, 1), dtype='i') * arange(source_array_size)[newaxis, :]
        else:
            return ones((rows, 1), dtype='i') * source_array[newaxis, :]

    if prob_array is None:
        prob_array = ones(source_array_size)
    if not (isinstance(prob_array, ndarray) or is_masked_array(prob_array)):
        raise TypeError("prob_array must be of type ndarray")

    p_array = prob_array
    p_array_sum = p_array.sum(dtype="float64")
    if not ma.allclose(p_array_sum, 1.0):
        if abs(1.0 - p_array_sum) > 0.01:
            # print this message only if it is a serious difference
            logger.log_warning("prob_array doesn't sum up to 1, and is normalized. \nSum: %s" % p_array_sum)
        p_array = p_array / p_array_sum

    sampled_choiceset_index = zeros(sample_size, dtype="int32") - 1  # -1 marks unfilled slots

    if not replace:
        if exclude_index is not None:
            if not isinstance(exclude_index, ndarray):
                try:
                    exclude_index = asarray(exclude_index)
                except Exception:
                    raise TypeError("exclude_index must be of type ndarray")
            if exclude_index.shape[0] != rows:
                raise ValueError("exclude_index should have the same number of rows as sample_size[0]")
            if rank(exclude_index) == 1:
                exclude_index = exclude_index[:, newaxis]
        else:
            exclude_index = zeros(shape=(sample_size[0], 1), dtype="int32")
        for j in range(columns):
            slots_to_be_sampled = arange(rows)
            while True:
                # Propose one candidate per still-unfilled slot, then reject
                # any that duplicate an excluded or previously drawn index.
                proposed_index = probsample_replace(arange(source_array_size), slots_to_be_sampled.size, p_array)
                try:
                    exclude_array = exclude_index[slots_to_be_sampled, ]
                except Exception:
                    exclude_array = None
                duplicate_indicator = find_duplicates_others(proposed_index, exclude_array)
                valid_index = slots_to_be_sampled[duplicate_indicator == 0]
                sampled_choiceset_index[valid_index, j] = proposed_index[duplicate_indicator == 0]
                if nonzerocounts(duplicate_indicator) == 0:
                    break
                slots_to_be_sampled = slots_to_be_sampled[duplicate_indicator > 0]
            # Accepted column j joins the exclusion set for later columns.
            exclude_index = concatenate((exclude_index, take(sampled_choiceset_index, (j, ), axis=1)), axis=1)
    else:
        for j in range(columns):
            sampled_choiceset_index[:, j] = probsample_replace(arange(source_array_size), rows, p_array)

    if return_index:
        return sampled_choiceset_index
    else:
        return source_array[sampled_choiceset_index]