def test_custom_stat_report_for_customised_index(self):
    """Check the first index label of a custom stat report.

    Builds a 10x2 random frame, requests a report named "given_name" and
    asserts that the report's first index label is "given_name attr1".
    """
    column_names = ["attr1", "attr2"]
    frame = pd.DataFrame(data=np.random.rand(10, 2), columns=column_names)
    explorer = data_exploration.Exploration(data=frame)

    report = explorer.custom_stat_report(data=frame, name="given_name")
    first_label = report.index[0]

    self.assertEqual(first_label, "given_name attr1")
def test_descriptive_stats_report_for_output_data_type(self):
    """Check that descriptive_stats_report returns a pandas DataFrame.

    A 10x2 random frame is used both to build the Exploration object and
    as the ``given_data`` argument of the report.
    """
    mock_df = pd.DataFrame(data=np.random.rand(10, 2),
                           columns=["attr1", "attr2"])
    mock_object = data_exploration.Exploration(data=mock_df)

    report = mock_object.descriptive_stats_report(given_data=mock_df)

    # assertIsInstance is the idiomatic type check: it accepts DataFrame
    # subclasses and reports the offending object on failure instead of
    # comparing two type objects for equality.
    self.assertIsInstance(report, pd.DataFrame)
def test_distribution_comparison_for_3_pandas_Series(self):
    """Check that distribution_comparison returns None for three Series.

    Three columns of a 10x4 random frame are passed as the keyword
    arguments ``series1``, ``series2`` and ``series3``.
    """
    column_names = ["attr1", "attr2", "attr3", "attr4"]
    frame = pd.DataFrame(data=np.random.rand(10, 4), columns=column_names)
    explorer = data_exploration.Exploration(data=frame)

    outcome = explorer.distribution_comparison(series1=frame["attr1"],
                                               series2=frame["attr2"],
                                               series3=frame["attr3"])

    self.assertIsNone(outcome)
def test_split_train_test_for_full_and_partial_mode(self):
    """Check the train/test sizes produced in "full" and "partial" mode.

    With 100 input rows the test expects a 70/30 split in "full" running
    mode and a 5/2 split in "partial" running mode.
    """
    mock_df = pd.DataFrame(data=np.random.rand(100, 2),
                           columns=["attr1", "attr2"])
    mock_object_full = data_exploration.Exploration(data=mock_df,
                                                    running_mode="full")
    mock_object_partial = data_exploration.Exploration(
        data=mock_df, running_mode="partial")
    expected_full = [70, 30]
    expected_partial = [5, 2]

    # Split once per object so that the train and test lengths being
    # checked belong to the same split (and the work is not repeated).
    full_train, full_test = mock_object_full.split_train_test()
    partial_train, partial_test = mock_object_partial.split_train_test()

    # Comparing both lengths at once shows the whole split on failure.
    self.assertEqual([len(full_train), len(full_test)], expected_full)
    self.assertEqual([len(partial_train), len(partial_test)],
                     expected_partial)
def test_descriptive_stats_report_for_statistics_results(self):
    """Check the report cell at row 0, column 0 against the mean of attr1.

    The expected value is computed directly with ``Series.mean()`` on the
    same 10x2 random frame that is handed to descriptive_stats_report.
    """
    mock_df = pd.DataFrame(data=np.random.rand(10, 2),
                           columns=["attr1", "attr2"])
    mock_object = data_exploration.Exploration(data=mock_df)
    expected = mock_df["attr1"].mean()

    report = mock_object.descriptive_stats_report(given_data=mock_df)

    # Floating-point means may differ in the last bits depending on how
    # the report computes them, so compare with a tolerance rather than
    # exact equality.
    self.assertAlmostEqual(report.iloc[0, 0], expected)
def test_scaler_min_max(self):
    """Check scaler_min_max on the single-column input [10, 20, 30].

    The scaled first column is expected to have maximum 1 and minimum 0,
    and the value 0.5 is expected to appear somewhere in the result.
    """
    mock_df = pd.DataFrame(data=[10, 20, 30], columns=["attr1"])
    mock_object = data_exploration.Exploration(data=mock_df)
    expected = [1, 0.5, 0]

    # Scale once and reuse the result for all three checks.
    scaled = mock_object.scaler_min_max(data=mock_df)

    self.assertAlmostEqual(scaled[:, 0].max(), expected[0])
    # Tolerant membership test: an exact `in` check on floats can fail on
    # rounding noise even though the scaled value is effectively 0.5.
    self.assertTrue(np.isclose(scaled, expected[1]).any())
    self.assertAlmostEqual(scaled[:, 0].min(), expected[2])
def test_plot_series_but_ignore_date_for_2_datasets(self):
    """Check that plot_series_but_ignore_date returns None for two Series.

    The input frame has an hourly DatetimeIndex from 04-Jan-2021 to
    15-Jan-2021 and two columns drawn uniformly from [1, 3) and [3, 5).
    """
    # NOTE(review): recent pandas releases deprecate the "H" alias in
    # favour of "h" -- confirm against the pinned pandas version.
    timestamps = pd.date_range(start="04-Jan-2021",
                               end="15-Jan-2021",
                               freq="H")
    sample_count = len(timestamps)
    low_band = np.random.uniform(1, 3, sample_count)
    high_band = np.random.uniform(3, 5, sample_count)
    values = np.array([low_band, high_band]).T
    frame = pd.DataFrame(data=values,
                         columns=["attr1", "attr2"],
                         index=timestamps)
    explorer = data_exploration.Exploration(data=frame)

    outcome = explorer.plot_series_but_ignore_date(series1=frame["attr1"],
                                                   series2=frame["attr2"])

    self.assertIsNone(outcome)
def test_apply_sliding_window_for_type_outcome_length_and_links_data_target(
        self):
    """Check window length, data/target linkage and repeatability.

    Verifies that the first and last windows have ``slid_win_length``
    rows, that the last "Close" value of window ``i + 1`` equals target
    ``i`` (checked for i = 0 and i = 4), and that two calls on the same
    input produce an identical first window.
    """
    data_length = 200
    window_length = parameters.General_Params().slid_win_length
    mock_df = pd.DataFrame(data=np.random.rand(data_length, 3),
                           columns=["attr1", "Close", "attr2"])
    mock_object = data_exploration.Exploration(data=mock_df)

    # NOTE(review): both calls receive the same ndarray. If the "df_"
    # variant was meant to pass the DataFrame itself, confirm against
    # apply_sliding_window before changing the argument.
    df_out_data, df_out_targets = mock_object.apply_sliding_window(
        scaled_data=np.array(mock_df))
    array_out_data, _ = mock_object.apply_sliding_window(
        scaled_data=np.array(mock_df))

    self.assertEqual(len(df_out_data[0]), window_length)
    self.assertEqual(len(df_out_data[-1]), window_length)
    self.assertEqual(df_out_data[1]["Close"].iloc[-1], df_out_targets[0])
    self.assertEqual(df_out_data[5]["Close"].iloc[-1], df_out_targets[4])

    # The windows are indexed like DataFrames above (["Close"].iloc[-1]).
    # Builtin all() over a DataFrame comparison iterates the column
    # labels, which are always truthy, so the old check could never
    # fail. Compare the underlying values instead.
    self.assertTrue(
        np.array_equal(np.asarray(df_out_data[0]),
                       np.asarray(array_out_data[0])))
from tensorflow.keras.losses import MeanAbsoluteError, MeanSquaredError from core import data_preparation, data_exploration, learning_rate_strategy, model_developement df = pd.read_csv(os.path.join(os.getcwd(), "data", "GSK per min.csv")) """ Data Preparation """ prepare = data_preparation.Preparation(data=df) prepare.set_timestamps_as_index() prepare.sort_by_timestamp() prepare.drop_unused() prepare.transform_to_float() prepare.time_series_fillna() prepare.plot_daily_prices(show=True) prepare.plot_prices_and_projection(show=True) """ Data Exploration """ explore = data_exploration.Exploration(data=prepare.data) train_set, test_set = explore.split_train_test() explore.distribution_comparison(train_set=train_set["Close"], test_set=test_set["Close"], show=True) explore.plot_series_but_ignore_date(train_set=train_set["Close"], test_set=test_set["Close"], show=True) train_report = explore.custom_stat_report(data=train_set, name="Train") test_report = explore.custom_stat_report(data=train_set, name="Test") explore.box_plots(data=train_set, show=True) scaled_train_data = explore.scaler_min_max(data=train_set) scaled_test_data = explore.scaler_min_max(data=test_set) sliding_window_train_data, sliding_window_train_target = explore.turn_dfs_into_arrays( given_data=scaled_train_data) sliding_window_test_data, sliding_window_test_target = explore.turn_dfs_into_arrays(
def test_box_plots_for_dataframe_with_4_attributes(self):
    """Check that box_plots returns None for a four-column frame.

    A 10x4 random frame is used both to build the Exploration object and
    as the ``data`` argument of box_plots.
    """
    column_names = ["attr1", "attr2", "attr3", "attr4"]
    frame = pd.DataFrame(data=np.random.rand(10, 4), columns=column_names)
    explorer = data_exploration.Exploration(data=frame)

    outcome = explorer.box_plots(data=frame)

    self.assertIsNone(outcome)
def test_color_list_for_output_colors(self):
    """Check two entries of the list returned by color_list.

    Position 2 is expected to be "red" and position 5 to be "aqua".
    """
    frame = pd.DataFrame(data=np.random.rand(10), columns=["attr1"])
    palette = data_exploration.Exploration(data=frame).color_list()

    # (position, expected colour) pairs, checked in the original order.
    for position, colour in ((2, "red"), (5, "aqua")):
        self.assertEqual(palette[position], colour)