def create_compound_test_data_set(response_variable, explanatory_variables, explanatory_ranges):
    """Create a test data set for CompoundLinearModel"""

    number_of_obs = 10

    data_set = create_linear_model_test_data_set(response_variable, explanatory_variables[0],
                                                 number_of_obs, explanatory_ranges[0])

    for i in range(1, len(explanatory_variables)):
        tmp_data_set = create_linear_model_test_data_set(
            response_variable, explanatory_variables[i], number_of_obs, explanatory_ranges[i])
        tmp_df = tmp_data_set.get_data()
        tmp_range_index_start = data_set.get_data().index[-1] + 1
        tmp_range_index_stop = tmp_range_index_start + number_of_obs
        tmp_df.set_index(pd.RangeIndex(tmp_range_index_start, tmp_range_index_stop), inplace=True)
        tmp_data_set = DataManager(tmp_df, tmp_data_set.get_origin())
        data_set = data_set.add_data_manager(tmp_data_set)

    return data_set
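# A minimal usage sketch (hypothetical variable names; the 'log10(...)' transform
# syntax is an assumption based on find_raw_variable and INVERSE_TRANSFORM_FUNCTIONS
# used by create_linear_model_test_data_set):
#
#     compound_data_set = create_compound_test_data_set(
#         'log10(y)',
#         [['log10(x1)'], ['log10(x2)']],
#         [(0.01, 10), (10, 100)])
#     compound_data_set.get_data().describe()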
def test_deepcopy(self):
    """Test the deepcopy functionality of instances of the DataManager class"""

    df = create_random_dataframe()
    dm1 = DataManager(df)
    dm2 = copy.deepcopy(dm1)

    self.assertTrue(dm1.equals(dm2))
    self.assertIsNot(dm1, dm2)
def __init__(self, constituent_df, surrogate_df, model_list,
             min_samples=30, max_extrapolation=0.1, match_time=30, p_thres=0.05):
    """Initialize a HierarchicalModel

    :param constituent_df: constituent observations
    :type constituent_df: pandas.DataFrame
    :param surrogate_df: surrogate observations
    :type surrogate_df: pandas.DataFrame
    :param model_list: list of candidate model specifications
    """

    # HierarchicalModel.pad_data(surrogate_df)
    self._surrogate_data = DataManager(surrogate_df)
    self._constituent_data = DataManager(constituent_df)

    self._model_list = model_list

    self.match_time = match_time
    self.max_extrapolation = max_extrapolation
    self.min_samples = min_samples
    self.p_thres = p_thres

    self._create_models()
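# Minimal construction sketch (hypothetical data; the exact contents expected in
# model_list by _create_models() are assumed, not confirmed by this snippet):
#
#     model = HierarchicalModel(constituent_df, surrogate_df,
#                               model_list=candidate_model_specs,
#                               min_samples=30, match_time=30)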
def calculate_acoustic_parameters(self, processing_parameters):
    """Calculate the acoustic parameters (sediment attenuation coefficient and mean
    sediment corrected backscatter) using the given processing parameters.

    :param processing_parameters: processing parameters used in the backscatter calculations
    :return: acoustic parameters
    """

    # calculate sediment attenuation coefficient and mean sediment corrected backscatter
    self._measured_backscatter_data = self._raw_backscatter_data.calc_measured_backscatter(
        processing_parameters)
    self._water_corrected_backscatter_data = \
        WaterCorrectedBackscatterData.calc_water_corrected_backscatter(self._measured_backscatter_data)
    self._sediment_attenuation_coefficient = \
        SedimentAttenuationCoefficient.calc_sediment_attenuation_coefficient(
            self._water_corrected_backscatter_data)
    self._sediment_corrected_backscatter_data = \
        SedimentCorrectedBackscatterData.calc_sediment_corrected_backscatter(
            self._water_corrected_backscatter_data, self._sediment_attenuation_coefficient)
    mean_sediment_corrected_backscatter = \
        self._sediment_corrected_backscatter_data.calc_mean_sediment_corrected_backscatter()

    mean_scb_data_origin = create_origin_from_data_frame(
        mean_sediment_corrected_backscatter.get_data(),
        self._sediment_corrected_backscatter_data.get_origin())
    acoustic_parameter_dm = DataManager(
        mean_sediment_corrected_backscatter.get_data(), mean_scb_data_origin)

    # add the sediment attenuation coefficient to the acoustic parameters
    self._acoustic_parameters = acoustic_parameter_dm.add_data(
        self._sediment_attenuation_coefficient.get_sac(),
        self._sediment_attenuation_coefficient.get_origin())

    return self._acoustic_parameters
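# Usage sketch (hypothetical processor object holding raw backscatter data, as
# suggested by self._raw_backscatter_data above):
#
#     acoustic_parameters = processor.calculate_acoustic_parameters(processing_parameters)
#     acoustic_parameters.get_data().head()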
def calc_measured_backscatter(self, processing_parameters):
    """Returns the measured backscatter based on requirements in processing_parameters.

    :param processing_parameters: processing parameters used to calculate the measured backscatter
    :return: MeasuredBackscatterData
    """

    # get the measured backscatter data
    measured_backscatter_df = self._calc_measured_backscatter(
        self._data_manager.get_data(), self._configuration_parameters, processing_parameters)

    # add the vertical beam and temperature data
    measured_backscatter_origin = create_origin_from_data_frame(
        measured_backscatter_df, self._data_manager.get_origin())
    measured_backscatter_data_manager = DataManager(
        measured_backscatter_df, measured_backscatter_origin)
    measured_backscatter_data_manager = measured_backscatter_data_manager.add_data(
        self.get_variable('Temp'), self.get_variable_origin('Temp'))
    measured_backscatter_data_manager = \
        measured_backscatter_data_manager.add_data(self.get_variable('Vbeam'),
                                                   self.get_variable_origin('Vbeam'))

    return MeasuredBackscatterData(measured_backscatter_data_manager,
                                   self._configuration_parameters,
                                   processing_parameters,
                                   self.get_cell_range())
def test_get_data_float_index_specify_step(self):
    """Test DataManager.get_data() with a float type data index and the step parameter specified"""

    data_start = 0
    data_stop = 10

    index_start = 0.
    index_stop = 10.

    columns = ['x', 'y']
    num_rows = 6

    df1 = create_linspace_dataframe(data_start, data_stop, index_start, index_stop, columns, num_rows)
    dm = DataManager(df1)

    num_rows_1 = num_rows
    step_1 = (index_stop - index_start) / (num_rows_1 - 1)
    results_1 = dm.get_data(index_step=step_1)

    # with the same step, the DataFrames should be equal
    pd.testing.assert_frame_equal(results_1, df1)

    # get a linearly spaced DataFrame with roughly twice the number of rows, interpolate at the
    # corresponding step, and compare the results
    num_rows_2 = 2 * num_rows_1 - 1
    df2 = create_linspace_dataframe(data_start, data_stop, index_start, index_stop, columns, num_rows_2)
    step_2 = (index_stop - index_start) / (num_rows_2 - 1)
    results_2 = dm.get_data(index_step=step_2)
    pd.testing.assert_frame_equal(results_2, df2)
def test_get_data(self):
    """A simple test case of the DataManager.get_data method"""

    # create a DataManager from a DataFrame with random data
    data_df = create_random_dataframe()
    data_manager = DataManager(data_df)

    # get data from the DataManager
    results_df = data_manager.get_data()

    # test if the data manager returns a copy of the DataFrame
    pd.testing.assert_frame_equal(data_manager.get_data(), data_df)
    self.assertIsNot(results_df, data_df)
def create_linear_model_test_data_set(response_variable, explanatory_variables,
                                      number_of_obs=50, explanatory_range=(0.01, 10)):
    """Create a test data set for a linear model.

    :param response_variable: name of the response variable (may include a transform)
    :param explanatory_variables: list of explanatory variable names (may include transforms)
    :param number_of_obs: number of observations to generate
    :param explanatory_range: (low, high) range for the raw explanatory data
    :return: DataManager containing the generated test data
    """

    # find the raw explanatory variables and create a random DataFrame with the number of raw explanatory variables
    raw_explanatory_variables = list(
        set([raw_var for _, raw_var in [find_raw_variable(var) for var in explanatory_variables]]))
    explanatory_data = np.random.uniform(explanatory_range[0], explanatory_range[1],
                                         size=(number_of_obs, len(raw_explanatory_variables)))
    explanatory_df = pd.DataFrame(data=explanatory_data, columns=raw_explanatory_variables)

    # get an exogenous DataFrame to calculate the response variable
    exog_df = get_exog_df(explanatory_df, explanatory_variables)

    # get the beta vector and error term
    number_of_parameters = len(explanatory_variables) + 1
    beta_vector = np.random.uniform(1, 10, size=(number_of_parameters, 1))
    error_term = np.random.normal(0, 0.1, size=(number_of_obs, 1))

    # calculate the response variable and create a DataFrame
    response_transform, raw_response_variable = find_raw_variable(response_variable)
    response_inverse_transform = INVERSE_TRANSFORM_FUNCTIONS[response_transform]
    response_data = response_inverse_transform(np.dot(exog_df, beta_vector) + error_term)
    response_df = pd.DataFrame(data=response_data, columns=[raw_response_variable])

    # create a DataFrame containing response and explanatory data
    test_data_df = pd.concat([response_df, explanatory_df], axis=1)
    test_data_origin = DataManager.create_data_origin(test_data_df, __file__)

    # return a DataManager with the regression data
    return DataManager(test_data_df, test_data_origin)
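# A minimal usage sketch (hypothetical variable names; the 'log10(...)' transform
# syntax is an assumption based on find_raw_variable and INVERSE_TRANSFORM_FUNCTIONS):
#
#     test_data = create_linear_model_test_data_set('log10(y)', ['log10(x)'],
#                                                   number_of_obs=100,
#                                                   explanatory_range=(0.1, 100))
#     test_data.get_data().head()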
def add_data(self, other, keep_curr_obs=None):
    """Add another instance of the same processed backscatter type to self.

    :param other: ProcessedADVMSedimentData
    :param keep_curr_obs: {None, True, False} Flag to indicate whether or not to keep current observations
    :return: merged data set of the same type as self
    """

    # raise an exception if the data sets are incompatible
    if not (self._configuration_parameters.is_compatible(other.get_configuration_parameters()) and
            self._processing_parameters.is_compatible(other.get_processing_parameters()) and
            isinstance(other, type(self))):
        raise BackscatterDataIncompatibleException("Backscatter data sets are incompatible")

    # other_data_manager = other.get_data_manager()
    other_data = other.get_data()
    other_origin = other.get_origin()
    other_data_manager = DataManager(other_data, other_origin)

    combined_data_manager = self._data_manager.add_data_manager(
        other_data_manager, keep_curr_obs=keep_curr_obs)

    return type(self)(combined_data_manager, self._configuration_parameters, self._processing_parameters)
@classmethod
def calc_sediment_corrected_backscatter(cls, water_corrected_backscatter, sediment_attenuation_coefficient):
    """Calculate the sediment corrected backscatter from the water corrected backscatter

    :param water_corrected_backscatter: WaterCorrectedBackscatterData
    :param sediment_attenuation_coefficient: SedimentAttenuationCoefficient
    :return: SedimentCorrectedBackscatterData
    """

    wcb_df = water_corrected_backscatter.get_backscatter_data()
    cell_range_df = water_corrected_backscatter.get_cell_range()
    sac_df = sediment_attenuation_coefficient.get_sac()
    configuration_parameters = water_corrected_backscatter.get_configuration_parameters()

    sediment_corrected_backscatter_df = cls._calc_sediment_corrected_backscatter(
        wcb_df, sac_df, configuration_parameters, cell_range_df)

    water_corrected_backscatter_origin = water_corrected_backscatter.get_origin()
    data_origin = create_origin_from_data_frame(
        sediment_corrected_backscatter_df, water_corrected_backscatter_origin)
    data_manager = DataManager(sediment_corrected_backscatter_df, data_origin)

    return cls(data_manager,
               water_corrected_backscatter.get_configuration_parameters(),
               water_corrected_backscatter.get_processing_parameters(),
               water_corrected_backscatter.get_cell_range())
def add_data(self, other, keep_curr_obs=None):
    """Adds other RawBackscatterData instance to self.

    Throws an exception if the other RawBackscatterData object is incompatible with self. An exception
    will be raised if keep_curr_obs=None and concurrent observations exist for variables.

    :param other: RawBackscatterData object to be added
    :type other: RawBackscatterData
    :param keep_curr_obs: {None, True, False} Flag to indicate whether or not to keep current observations.
    :return: Merged RawBackscatterData object
    """

    # test compatibility of other data set
    if not (self._configuration_parameters.is_compatible(other.get_configuration_parameters()) and
            isinstance(other, type(self))):
        raise BackscatterDataIncompatibleException("ADVM data sets are incompatible")

    other_data = other.get_data()
    other_origin = other.get_origin()
    other_data_manager = DataManager(other_data, other_origin)

    combined_data_manager = self._data_manager.add_data_manager(
        other_data_manager, keep_curr_obs=keep_curr_obs)

    return type(self)(combined_data_manager, self._configuration_parameters)
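# Usage sketch (hypothetical instances; assumes both data sets were collected with
# compatible configuration parameters):
#
#     combined_raw_data = raw_backscatter_1.add_data(raw_backscatter_2, keep_curr_obs=True)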
def test_get_data_datetime_index_specify_step(self):
    """Test DataManager.get_data() with a DatetimeIndex type data index and the step parameter specified"""

    data_start = 0.
    data_stop = 10.

    index_start = np.datetime64('2018-01-01')
    index_step = np.timedelta64(15 * 60 * 1000, 'ms')
    num_rows = 7
    index_stop = index_start + index_step * num_rows

    columns = ['x', 'y']

    df1 = create_linspace_dataframe(data_start, data_stop, index_start, index_stop, columns, num_rows)
    dm = DataManager(df1)
    pd.testing.assert_frame_equal(dm.get_data(), df1)

    num_rows_2 = 12
    df2 = create_linspace_dataframe(data_start, data_stop, index_start, index_stop, columns, num_rows_2)
    index_step = df2.index[1] - df2.index[0]
    pd.testing.assert_frame_equal(dm.get_data(index_step=index_step), df2)
@classmethod
def calc_water_corrected_backscatter(cls, measured_backscatter):
    """Calculate the water corrected backscatter from the measured backscatter

    :param measured_backscatter: MeasuredBackscatterData
    :return: WaterCorrectedBackscatterData
    """

    water_corrected_backscatter_df = cls._calc_water_corrected_backscatter(measured_backscatter)
    water_corrected_backscatter_origin = create_origin_from_data_frame(
        water_corrected_backscatter_df, measured_backscatter.get_origin())
    wcb_data_manager = DataManager(water_corrected_backscatter_df, water_corrected_backscatter_origin)
    wcb_data_manager = wcb_data_manager.add_data(
        measured_backscatter.get_variable('Temp'), measured_backscatter.get_variable_origin('Temp'))

    return cls(wcb_data_manager,
               measured_backscatter.get_configuration_parameters(),
               measured_backscatter.get_processing_parameters(),
               measured_backscatter.get_cell_range())
def test_init_read_tab_delimited_datetime_index_datetime(self):
    """Test the initialization of a DataManager instance using read_tab_delimited_data() from a file with
    a date/time index
    """

    test_data_file_path_results = os.path.join(current_path, 'data', 'model', 'TestDataframeTimestamp',
                                                'test_timestamp_results.txt')
    test_data_file_path_test = os.path.join(current_path, 'data', 'model', 'TestDataframeTimestamp',
                                            'test_timestamp_datetime.txt')

    df = pd.read_table(test_data_file_path_results, sep='\t')
    df = df.set_index(pd.DatetimeIndex(df['DateTime']))
    df = df.drop(df.columns[0], axis=1)

    data_manager = DataManager.read_tab_delimited_data(test_data_file_path_test)

    pd.testing.assert_frame_equal(data_manager.get_data(), df)
def create_origin_from_data_frame(acoustic_df, data_origin):
    """Create a new origin from information in data_origin describing the variables in acoustic_df"""

    data_sources = set(data_origin['origin'])

    new_data_origin = pd.DataFrame(columns=['variable', 'origin'])
    for source in data_sources:
        tmp_origin = DataManager.create_data_origin(acoustic_df, source)
        # DataFrame.append() was removed in pandas 2.0; concatenate instead
        new_data_origin = pd.concat([new_data_origin, tmp_origin])
    new_data_origin.reset_index(drop=True, inplace=True)

    return new_data_origin
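# Illustrative sketch of the origin structure assumed here: a two-column DataFrame
# mapping each variable in acoustic_df to its data source (example values are
# hypothetical):
#
#         variable                 origin
#     0    MeanSCB  /path/to/advm_data.h5
#     1       Temp  /path/to/advm_data.h5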
def test_to_hdf_path(self):
    """Test the DataManager.to_hdf() and DataManager.read_hdf() methods when saving with a file path"""

    key = '/dm'

    data = create_random_dataframe()
    data_origin = DataManager.create_data_origin(data, 'test')
    dm1 = DataManager(data, data_origin)

    dm1.to_hdf(self.temp_hdf_path, key)
    dm2 = DataManager.read_hdf(self.temp_hdf_path, key)

    self.assertTrue(dm1.equals(dm2))
def test_add_data_manager_simple(self):
    """Test a simple case of DataManager.add_data_manager() usage"""

    # create a data manager with random data
    df = create_random_dataframe(number_of_rows=50)
    dm = DataManager(df)

    # split the data up and add the results together
    dm1 = DataManager(df.iloc[:25])
    dm2 = DataManager(df.iloc[25:])
    dm_add_result = dm1.add_data_manager(dm2)

    # test that the original data set and the add result are equal
    self.assertTrue(dm.equals(dm_add_result))
def _init_test_model(self, test_case_parameters):
    """Initialize and return a test model"""

    # load the test case data and get the expected fitted results
    test_case_df = self._load_test_case_data()
    expected_fitted_results = test_case_df.filter(regex='Fitted *')

    # drop the fitted values and create a data set to pass to the LinearModel
    data_set_df = test_case_df.drop(expected_fitted_results.keys(), axis=1)
    data_set = DataManager(data_set_df)

    # initialize a model without specifying the response and explanatory variables
    model = test_case_parameters['test_class'](data_set, **test_case_parameters['init_kwargs'])

    return model
@classmethod
def from_advm_data(cls, advm_data):
    """Create an instance from advm_data by extracting the backscatter data and applying the
    slant angle correction to the cell range.
    """

    advm_df = advm_data.get_data()
    backscatter_df = advm_df.filter(regex=cls._bs_data_columns_regex)
    backscatter_origin = advm_data.get_origin()
    backscatter_data_manager = DataManager(backscatter_df, backscatter_origin)

    # apply slant angle correction
    configuration_parameters = advm_data.get_configuration_parameters()
    slant_angle = configuration_parameters['Slant Angle']
    cell_range_df = advm_data.get_cell_range() / np.cos(np.radians(slant_angle))

    return cls(backscatter_data_manager, configuration_parameters, cell_range_df)
def test_to_hdf_buf(self):
    """Test the DataManager.to_hdf() and DataManager.read_hdf() methods when saving with a pd.HDFStore instance"""

    key = '/dm/'

    data = create_random_dataframe()
    data_origin = DataManager.create_data_origin(data, 'test')
    dm1 = DataManager(data, data_origin)

    with pd.HDFStore(self.temp_hdf_path) as store:
        dm1.to_hdf(store, key)

    with pd.HDFStore(self.temp_hdf_path) as store:
        dm2 = DataManager.read_hdf(store, key)

    self.assertTrue(dm1.equals(dm2))
def test_init_read_tab_delimited_file_no_datetime(self):
    """Test the initialization of a DataManager instance using read_tab_delimited_data() from a file with no
    date/time information
    """

    # pick a data file from the model test data set
    test_data_file_path = os.path.join(current_path, 'data', 'model', 'TestMultipleOLSModelInit',
                                       'test_model_init.txt')

    # read the file into a DataManager
    data_manager = DataManager.read_tab_delimited_data(test_data_file_path)

    # read the file into a DataFrame
    df = pd.read_table(test_data_file_path, sep='\t')
    df.sort_index(axis=1, inplace=True)

    # make sure the DataManager's data and DataFrame are equal
    pd.testing.assert_frame_equal(data_manager.get_data(), df)
def calc_mean_sediment_corrected_backscatter(self):
    """Calculate the mean sediment corrected backscatter.

    :return: ProcessedData containing the mean sediment corrected backscatter
    """

    sediment_corrected_backscatter = self.get_data()
    mean_sediment_corrected_backscatter = pd.DataFrame(
        sediment_corrected_backscatter.mean(axis=1), columns=['MeanSCB'], dtype=np.float64)

    data_origin = create_origin_from_data_frame(
        mean_sediment_corrected_backscatter, self.get_origin())
    data_manager = DataManager(mean_sediment_corrected_backscatter, data_origin)

    return ProcessedData(data_manager,
                         self.get_configuration_parameters(),
                         self.get_processing_parameters())
def test_equals(self):
    """Test the DataManager.equals() and __eq__() methods"""

    df1 = create_random_dataframe()
    dm1 = DataManager(df1)
    dm2 = DataManager(df1)

    self.assertTrue(dm1.equals(dm2))
    self.assertTrue(dm1 == dm2)

    df3 = create_random_dataframe()
    dm3 = DataManager(df3)

    self.assertFalse(dm1.equals(dm3))
    self.assertFalse(dm1 == dm3)
    self.assertTrue(dm1 != dm3)
def test_init_without_origin(self):
    """Test the initialization method of DataManager with no origin DataFrame"""

    data_df = create_random_dataframe()

    variable_names = data_df.keys()
    origin_data = [[var, np.nan] for var in variable_names]
    nan_origin_df = pd.DataFrame(data=origin_data, columns=['variable', 'origin'])

    data_manager_without_origin = DataManager(data_df)

    # test that data is being stored correctly
    pd.testing.assert_frame_equal(data_manager_without_origin.get_data(), data_df)
    pd.testing.assert_frame_equal(data_manager_without_origin.get_origin(), nan_origin_df)

    # test that DataFrames aren't the same instance
    self.assertIsNot(data_manager_without_origin.get_data(), data_df)
    self.assertIsNot(data_manager_without_origin.get_origin(), nan_origin_df)
@classmethod
def calc_sediment_attenuation_coefficient(cls, water_corrected_backscatter):
    """Calculates the sediment attenuation coefficient

    :param water_corrected_backscatter: WaterCorrectedBackscatterData
    :return: SedimentAttenuationCoefficient
    """

    wcb_df = water_corrected_backscatter.get_backscatter_data()
    cell_range_df = water_corrected_backscatter.get_cell_range()

    sediment_attenuation_coefficient = cls._calc_sediment_attenuation_coefficient(wcb_df, cell_range_df)

    sac_origin = create_origin_from_data_frame(
        sediment_attenuation_coefficient, water_corrected_backscatter.get_origin())
    sac_data_manager = DataManager(sediment_attenuation_coefficient, sac_origin)

    return cls(sac_data_manager,
               water_corrected_backscatter.get_configuration_parameters(),
               water_corrected_backscatter.get_processing_parameters())