def test_new():
    GIVEN("a data container")
    data_container = DataContainer()
    WHEN("we call new")
    new_container = data_container.new()
    THEN("a data container is returned")
    assert type(new_container) is type(data_container)
    THEN("the new data container is a different instance")
    assert new_container is not data_container
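# A minimal sketch of the behaviour test_new pins down, assuming DataContainer
# wraps a pandas DataFrame (the class and attribute names below are
# illustrative, not the real implementation): new() builds a fresh instance of
# the same class instead of mutating self.
import pandas as pd


class _SketchContainer:
    def __init__(self, data=None):
        # Store the tabular data; an empty container is allowed.
        self._frame = pd.DataFrame(data if data is not None else {})

    def new(self, data=None):
        # type(self)(...) keeps subclasses working and guarantees a different
        # instance, which is exactly what the two assertions above check.
        return type(self)(data)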
def test_sum_columns():
    GIVEN("a simple set of data and a container")
    data = __get_test_dict()
    data_container = DataContainer(data)
    WHEN("we sum columns A and C to give D")
    data_container.sum_columns(output="D", columns=["A", "C"])
    THEN("the resulting data is correct")
    col_d = data_container.get_column(name="D")
    for row in range(len(data.get("C"))):
        assert col_d[row] == not_a_number_to_number(
            data.get("A")[row]) + not_a_number_to_number(data.get("C")[row])
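# __get_test_dict and not_a_number_to_number are shared test helpers that are
# not defined in this section. Plausible sketches, inferred only from how the
# tests use them (column A holds floats, B holds strings, C contains a NaN;
# the exact fixture values are assumptions):
import math


def __get_test_dict():
    return {
        "A": [1.0, 2.0, 3.0],
        "B": ["a", "b", "c"],
        "C": [5.0, float("nan"), 7.0],
    }


def not_a_number_to_number(value):
    # Map NaN to 0 so sums over columns with missing values stay numeric.
    return 0 if isinstance(value, float) and math.isnan(value) else value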
def test_get_ids_for_model_data():
    GIVEN("a data handler and the directory and file name of a test file")
    directory = "./data/29184567"
    file_name = "1.156230797.txt"
    file = HistoricalExternalAPIFileHander(directory=directory, file=file_name)
    record = file.get_file_as_list()[0]
    market_start_time = file.get_market_start_time()
    adapter = ExternalAPIMarketRecordAdapter(
        market_start_time=market_start_time)
    mediator = MockMediator()
    handler = DataHandler(
        mediator=mediator,
        adapter=adapter,
        container=DataContainer(),
    )
    handler.process_data(record)
    WHEN("we set the probabilities of two items and get the ids required for the next model run")
    ids = handler.get_unique_ids()
    for runner_id in ids[0:2]:
        handler._set_probability(runner_id=runner_id, probability=0.1)
        ids.pop(0)
    THEN("the list omits the items which have fixed probabilities")
    model_ids = handler._get_ids_for_model_data()
    assert model_ids == ids
    assert len(model_ids) < len(handler.get_unique_ids())
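# The test above pins down the contract of _get_ids_for_model_data: every
# unique id except those whose probabilities have been fixed. A standalone
# sketch of that filtering (the function name and arguments are illustrative,
# not the handler's real internals):
def ids_for_model_data(unique_ids, fixed_probability_ids):
    return [runner_id for runner_id in unique_ids
            if runner_id not in fixed_probability_ids]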
def test_get_last_column_entry():
    GIVEN("a simple set of data and a container")
    data = __get_test_dict()
    data_container = DataContainer(data)
    WHEN("we get the last entry for A")
    last_a = data_container.get_last_column_entry("A")
    THEN("the correct value is returned")
    assert last_a == data.get("A")[-1]
    assert isinstance(last_a, float)
    WHEN("we get the last entry for B")
    last_b = data_container.get_last_column_entry("B")
    THEN("the correct value is returned")
    assert last_b == data.get("B")[-1]
    assert isinstance(last_b, str)
    WHEN("we get the last entry for C")
    last_c = data_container.get_last_column_entry("C")
    THEN("the correct value is returned")
    assert is_not_a_number(last_c)
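# is_not_a_number comes from the same helper module; a minimal sketch of the
# check the last assertion relies on:
import math


def is_not_a_number(value):
    return isinstance(value, float) and math.isnan(value)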
def test_add_rows():
    GIVEN("two data containers containing simple data")
    data = __get_test_dict()
    data_container = DataContainer(data)
    data_to_add = {"B": [12], "A": [6], "D": ["extra column"]}
    data_container_to_add = DataContainer(data_to_add)
    WHEN("we add the second container to the first")
    data_container.add_rows(data_container_to_add)
    THEN("we have the correct number of rows and columns")
    unique_keys = set().union(data.keys(), data_to_add.keys())
    assert data_container.get_row_count() == max(
        len(data.get(key) or []) + len(data_to_add.get(key) or [])
        for key in unique_keys)
    assert data_container.get_column_count() == len(unique_keys)
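# A sketch of the add_rows behaviour the assertions describe, assuming a
# pandas DataFrame backing store: rows are appended, the column set becomes
# the union of both containers, and missing cells are padded with NaN.
import pandas as pd

first = pd.DataFrame({"A": [1.0, 2.0], "B": ["a", "b"]})
second = pd.DataFrame({"B": [12], "A": [6], "D": ["extra column"]})
combined = pd.concat([first, second], ignore_index=True)
assert len(combined) == len(first) + len(second)
assert combined.shape[1] == len(set(first.columns) | set(second.columns))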
def __init__(
    self,
    market_id,
    external_api,
    market_start_time,
    data_adapter=None,
    bank=5000,
    data=None,
    models=None,
    orders=None,
):
    self.__market_id = market_id
    self.external_api: Colleague = external_api
    adapter = data_adapter or ExternalAPIMarketRecordAdapter(
        market_start_time=market_start_time)
    self.data: Colleague = data or DataHandler(
        mediator=self,
        adapter=adapter,
        container=DataContainer(),
    )
    self.models: Colleague = models or ModelHandler(
        mediator=self, wls_model=WeightedLinearRegression())
    self.orders: Colleague = orders or OrdersHandler(mediator=self, bank=bank)
    self.__recipients = {
        "external data fetched": self.data.process_data,
        "data added to container": self.models.run_models,
        "models have results": self.orders.get_new_orders,
        "new orders": self.external_api.post_order,
        "orders posted": self.__delegate_posted_orders,
        "market closed": self.__exit,
        "no data provided multiple times": self.__exit,
        "finished processing": self.__finished,
    }
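# The __recipients table above is the core of the mediator pattern: colleagues
# notify the mediator with an event name and the mediator routes the payload
# to the registered handler. A minimal sketch of the dispatch that table
# implies (the notify signature is assumed from the mock_notify assertions in
# this section, which check "event" and "data" keyword arguments):
class _SketchMediator:
    def __init__(self, recipients):
        self.__recipients = recipients

    def notify(self, event, data=None):
        # Forward the payload to whichever colleague registered for the event.
        recipient = self.__recipients.get(event)
        if recipient is not None:
            recipient(data)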
def test_confirm_market_closed():
    GIVEN("a data handler and the directory and file name of a test file")
    adapter = ExternalAPIMarketRecordAdapter(
        market_start_time="2019-01-01T00:00:00.000Z")
    mediator = MockMediator()
    handler = DataHandler(mediator=mediator,
                          adapter=adapter,
                          container=DataContainer())
    WHEN("we check if the market is closed")
    closed = handler._confirm_market_closed()
    THEN("it is not")
    assert not closed
    GIVEN("the handler's container has the required column" +
          " but it does not indicate that the market is closed")
    closed_record = handler._container.new(
        data={("closed_indicator", ""): [0]})
    handler._container.add_rows(container=closed_record)
    WHEN("we check if the market is closed")
    closed = handler._confirm_market_closed()
    THEN("it is not")
    assert not closed
    GIVEN("the handler's container has the required column indicating" +
          " that the market is closed")
    closed_record = handler._container.new(
        data={("closed_indicator", ""): [1]})
    handler._container.add_rows(container=closed_record)
    WHEN("we check if the market is closed")
    closed = handler._confirm_market_closed()
    THEN("it is")
    assert closed
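# A sketch of the check test_confirm_market_closed pins down, built on the
# container API shown in this section (the flow is an assumption, not the
# handler's real code): the market is closed only when the indicator column
# exists and its last entry is truthy.
def confirm_market_closed(container):
    column = ("closed_indicator", "")
    if not container.has_column(column):
        return False
    return bool(container.get_last_column_entry(column))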
def test_has_column():
    GIVEN("a simple set of data and a container")
    data = __get_test_dict()
    data_container = DataContainer(data)
    WHEN("we check if the container has the column A")
    result = data_container.has_column("A")
    THEN("it does")
    assert result
    WHEN("we check if the container has the column wwwwweeeeeeeeeeeee")
    result = data_container.has_column("wwwwweeeeeeeeeeeee")
    THEN("it does not")
    assert not result
    GIVEN("an empty container")
    data_container = DataContainer()
    WHEN("we check if the container has the column ('closed_indicator', '')")
    result = data_container.has_column(("closed_indicator", ""))
    THEN("it does not")
    assert not result
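# has_column must accept plain and tuple (multi-level) column keys and must
# not fail on an empty container; a pandas-backed sketch of that check:
import pandas as pd


def has_column(frame: pd.DataFrame, name) -> bool:
    # Membership works for both flat and MultiIndex columns, including the
    # empty-container case, where frame.columns is an empty Index.
    return name in frame.columns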
def test_removed_runner():
    GIVEN("the directory and file name of a test file which contains a removed runner")
    directory = "./data/29201704"
    file_name = "1.156695742.txt"
    file = HistoricalExternalAPIFileHander(directory=directory, file=file_name)
    file_data = file.get_file_as_list()
    market_start_time = file.get_market_start_time()
    adapter = ExternalAPIMarketRecordAdapter(
        market_start_time=market_start_time)
    number_runners = __get_number_runners(data=file_data)
    mediator = MockMediator()
    WHEN("we feed the data into a handler one record at a time")
    handler = DataHandler(
        mediator=mediator,
        adapter=adapter,
        container=DataContainer(),
    )
    for i, record in enumerate(file_data):
        handler.process_data(record)
        THEN("the incoming record was processed")
        number_records_processed = i + 1
        THEN("the data container has the correct number of records")
        assert handler._container.get_row_count() == number_records_processed
    WHEN("we have finished")
    THEN("the data container has the correct number of columns")
    assert handler._container.get_column_count() == __get_number_columns(
        number_runners)
    THEN("the data container has the same number of records as the raw data")
    assert handler._container.get_row_count() == len(file_data)
    THEN("the correct number of runners are contained in the object")
    assert len(handler.get_unique_ids()) == number_runners
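# get_unique_ids is not defined in this section; one plausible wiring, given
# the column-group API exercised in test_column_group_name below, is to read
# the distinct values of the id level of the column index (an assumption):
def get_unique_ids(container):
    return container.get_column_group_values(name="id")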
def test_fixed_probability(mock_notify):
    GIVEN("a data handler and the directory and file name of a test file")
    directory = "./data/29451865"
    file_name = "1.162069495.txt"
    file = HistoricalExternalAPIFileHander(directory=directory, file=file_name)
    file_data = file.get_file_as_list()
    market_start_time = file.get_market_start_time()
    number_runners = __get_number_runners(data=file_data)
    unfixed_items = number_runners
    fixed_items = 0
    adapter = ExternalAPIMarketRecordAdapter(
        market_start_time=market_start_time)
    pricer = PriceHandler()
    metadata = MetadataHandler()
    mediator = MockMediator()
    correct_probability = 1
    number_records_processed = 0
    WHEN("we feed the data into a handler one record at a time")
    handler = DataHandler(
        mediator=mediator,
        adapter=adapter,
        container=DataContainer(),
    )
    for i, record in enumerate(file_data):
        number_records_processed = i + 1
        if number_records_processed % 10 == 0:
            WHEN("we fix the probability of the next unfixed item")
            id_to_fix = handler._get_ids_for_model_data()[0]
            fixed_probability = round(
                handler._container.get_last_column_entry(
                    name=("compositional_sp_probability", id_to_fix)),
                4,
            )
            handler._set_probability(runner_id=id_to_fix,
                                     probability=fixed_probability)
            correct_probability -= fixed_probability
            unfixed_items -= 1
            fixed_items += 1
        fixed_probability_ids = handler._get_fixed_probability_ids()
        THEN("the list of fixed probability ids is the correct length")
        assert len(fixed_probability_ids) == fixed_items
        handler.process_data(record)
        THEN("the handler's data has the correct number of records")
        assert handler._container.get_row_count() == number_records_processed
        THEN("the mediator's notify method was called with the correct parameters")
        model_data = handler._get_model_data()
        args, kwargs = mock_notify.call_args
        assert args == ()
        assert kwargs.get("data") == model_data
        assert kwargs.get("event") == "data added to container"
        THEN("there is a record in the model data for each of the unfixed items")
        assert len(model_data) == unfixed_items
        test_record = {
            each.get("id"): each
            for each in adapter.convert(record).get("items")
        }
        total_sp_probability = 0
        total_ex_probability = 0
        for data in model_data:
            THEN("each of the items in the model data has a non-zero id")
            runner_id = data.get("id")
            assert isinstance(runner_id, int)
            assert runner_id > 0
            THEN("the item's probability has not been fixed")
            assert runner_id not in fixed_probability_ids
            test_item = test_record.get(runner_id)
            THEN("the data has the correct combined_back_size")
            combined_back_size = data.get(
                "combined_back_size" + metadata.get_point_in_time_suffix())
            assert combined_back_size == (test_item.get("sp_back_size") +
                                          test_item.get("ex_back_size"))
            THEN("the data contains the compositional sp probability which is between 0 and 1")
            compositional_sp_probability = data.get(
                "compositional_sp_probability" +
                metadata.get_point_in_time_suffix())
            total_sp_probability += compositional_sp_probability
            assert 1 > compositional_sp_probability > 0
            THEN("the data contains the compositional ex probability which is between 0 and 1")
            compositional_ex_average_probability = data.get(
                "compositional_ex_average_probability" +
                metadata.get_point_in_time_suffix())
            total_ex_probability += compositional_ex_average_probability
            assert 1 > compositional_ex_average_probability > 0
            THEN("the data contains the correct offered price")
            offered_price = data.get("ex_offered_back_price" +
                                     metadata.get_point_in_time_suffix())
            assert offered_price > 0
            assert offered_price == test_item.get("ex_offered_back_price")
            THEN("the data contains the correct returns price")
            returns_price = data.get("ex_offered_back_price_mc" +
                                     metadata.get_point_in_time_suffix())
            assert returns_price > 0
            assert returns_price == pricer.remove_commission(
                test_item.get("ex_offered_back_price"))
            THEN("the sp back price time series data returned is of the correct length")
            compositional_sp_back_price_ts = (
                data.get("compositional_sp_back_price" +
                         metadata.get_time_series_suffix()) or [])
            assert len(compositional_sp_back_price_ts) == number_records_processed
            THEN("the last record of the time series data matches the probability")
            assert almost_equal(compositional_sp_back_price_ts[-1],
                                1 / compositional_sp_probability)
            THEN("the extract time time series data returned is of the correct length")
            extract_time_ts = (data.get("extract_time" +
                                        metadata.get_time_series_suffix())
                               or [])
            assert len(extract_time_ts) == number_records_processed
            for j, extract_time in enumerate(extract_time_ts):
                if j > 0:
                    THEN("the times in the series are ascending")
                    assert extract_time > extract_time_ts[j - 1]
            THEN("the combined back size time series data returned is of the correct length")
            combined_back_size_ts = (
                data.get("combined_back_size" +
                         metadata.get_time_series_suffix()) or [])
            assert len(combined_back_size_ts) == number_records_processed
            THEN("the last entry in the time series is the same as the point in time combined_back_size")
            assert combined_back_size_ts[-1] == combined_back_size
            for j, combined_back_size in enumerate(combined_back_size_ts):
                if j > 0:
                    THEN("the sizes in the series are ascending")
                    assert combined_back_size >= combined_back_size_ts[j - 1]
        THEN("the total ex and sp probabilities from the model_data sum to 1")
        assert almost_equal(total_sp_probability, correct_probability)
        assert almost_equal(total_ex_probability, correct_probability)
    WHEN("we have finished")
    THEN("the data container has the correct number of columns")
    assert handler._container.get_column_count() == __get_number_columns(
        number_runners)
    THEN("the data container has the same number of records as the raw data")
    assert handler._container.get_row_count() == len(file_data)
    THEN("the correct number of runners are contained in the object")
    assert len(handler.get_unique_ids()) == number_runners
    THEN("the correct number of fixed probabilities are contained in the object")
    assert len(handler._get_fixed_probability_ids()) == round_down(
        number_records_processed / 10)
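# almost_equal and round_down come from a shared helper module that is not
# part of this section; minimal sketches consistent with how they are used:
import math


def almost_equal(left, right):
    # Tolerant float comparison for the probability and price checks above.
    return math.isclose(left, right, rel_tol=1e-9)


def round_down(value):
    # One probability is fixed every 10 records, so the expected count is the
    # integer part of number_records_processed / 10.
    return math.floor(value)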
def test_column_group_name():
    GIVEN("some data that contains ids in the keys and" +
          " a container whose id column group is named id")
    data = {
        ("col1", 123): [1, 2, 3, 4],
        ("col1", 456): [1, 2, 3, 4],
        ("col2", 123): [1, 2, 3, 4],
        ("col2", 456): [1, 2, 3, 4],
    }
    data_container = DataContainer(data)
    data_container.set_column_group_name(name="id", level=1)
    WHEN("we get the column names from the id group")
    ids = data_container.get_column_group_values(name="id")
    THEN("the correct ids are returned")
    assert lists_are_equal(ids, [123, 456])
    GIVEN("some data that contains ids in the keys and" +
          " a container whose column groups are named variable and id")
    data = {
        ("col1", 123): [1, 2, 3, 4],
        ("col1", 456): [1, 2, 3, 4],
        ("col2", 123): [1, 2, 3, 4],
        ("col2", 456): [1, 2, 3, 4],
    }
    data_container = DataContainer(data)
    data_container.set_column_group_name(names=["variable", "id"])
    WHEN("we get the column names from the id group")
    ids = data_container.get_column_group_values(name="id")
    THEN("the correct ids are returned")
    assert lists_are_equal(ids, [123, 456])
    WHEN("we get the column names from the variable group")
    variables = data_container.get_column_group_values(name="variable")
    THEN("the correct variables are returned")
    assert lists_are_equal(variables, ["col1", "col2"])
    GIVEN("some simple data and a container with the column group name set to vars")
    data = {"col1": [1, 2, 3, 4], "col2": [1, 2, 3, 4]}
    data_container = DataContainer(data)
    data_container.set_column_group_name(name="vars")
    WHEN("we get the columns from the vars group")
    columns = data_container.get_column_group_values(name="vars")
    THEN("the correct columns are returned")
    assert lists_are_equal(columns, ["col1", "col2"])
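# A pandas-backed sketch of the column-group API exercised above: group names
# map onto the names of the column index levels, and a group's values are
# that level's distinct entries (class and attribute names are assumptions):
import pandas as pd


class _GroupSketch:
    def __init__(self, data):
        self._frame = pd.DataFrame(data)

    def set_column_group_name(self, name=None, names=None, level=None):
        if names is not None:
            self._frame.columns = self._frame.columns.set_names(names)
        else:
            self._frame.columns = self._frame.columns.set_names(name,
                                                                level=level)

    def get_column_group_values(self, name):
        return list(self._frame.columns.get_level_values(name).unique())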
def test_set_index():
    GIVEN("a simple set of data and a container")
    data = __get_test_dict()
    data_container = DataContainer(data)
    WHEN("we set the index to be A")
    data_container.set_index(columns=["A"])
    THEN("the container has one less column and the index matches column A")
    assert data_container.get_column_count() == len(data.keys()) - 1
    assert lists_are_equal(data_container.get_index(), data.get("A"))
    GIVEN("a simple set of data and a container")
    data = __get_test_dict()
    data_container = DataContainer(data)
    WHEN("we set the index to be A and B")
    data_container.set_index(columns=["A", "B"])
    THEN("the container has two fewer columns and the index pairs A with B")
    assert data_container.get_column_count() == len(data.keys()) - 2
    assert lists_are_equal(
        data_container.get_index(),
        [(data.get("A")[row], data.get("B")[row])
         for row in range(len(data.get("B")))],
    )
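# A pandas-backed sketch of set_index/get_index as the assertions describe
# them: indexed columns leave the column set, and a multi-column index reads
# back as a list of tuples (function names are illustrative):
import pandas as pd


def set_index_sketch(frame: pd.DataFrame, columns):
    # set_index drops the indexed columns from the frame by default, which
    # matches the column-count assertions above.
    return frame.set_index(columns)


def get_index_sketch(frame: pd.DataFrame):
    # A single-column index yields values; a two-column index yields tuples.
    return list(frame.index)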