def evaluate(self, learners: Sequence[Learner[_C,_A]], transaction_log: Optional[str] = None, seed: Optional[int] = None) -> Result:
    """Collect observations of Learners playing the benchmark's simulations to calculate Results.

    Args:
        learners: The Learners to evaluate against the benchmark's simulations.
        transaction_log: An optional file path used to restore previous progress and record new transactions.
        seed: An optional random seed given to each BenchmarkLearner.

    Returns:
        See the base class for more information.
    """

    benchmark_learners   = [ BenchmarkLearner(learner, seed) for learner in learners ] #type: ignore
    restored             = Result.from_transaction_log(transaction_log)
    task_source          = TaskSource(self._simulation_pipes, benchmark_learners, restored)
    task_to_transactions = TaskToTransactions(self._ignore_raise)
    transaction_sink     = TransactionSink(transaction_log, restored)

    n_given_learners    = len(benchmark_learners)
    n_given_simulations = len(self._simulation_pipes)

    if len(restored.benchmark) != 0:
        assert n_given_learners    == restored.benchmark['n_learners'   ], "The currently evaluating benchmark doesn't match the given transaction log"
        assert n_given_simulations == restored.benchmark['n_simulations'], "The currently evaluating benchmark doesn't match the given transaction log"

    preamble_transactions = []
    preamble_transactions.append(Transaction.version(TransactionPromote.CurrentVersion))
    preamble_transactions.append(Transaction.benchmark(n_given_learners, n_given_simulations))
    preamble_transactions.extend(Transaction.learners(benchmark_learners))

    mp = self._processes        if self._processes        else ExecutionContext.Config.processes
    mt = self._maxtasksperchild if self._maxtasksperchild else ExecutionContext.Config.maxtasksperchild

    # write the preamble serially first, then fan the evaluation tasks out across processes
    Pipe.join(MemorySource(preamble_transactions), []                    , transaction_sink).run(1, None)
    Pipe.join(task_source                        , [task_to_transactions], transaction_sink).run(mp, mt)

    return transaction_sink.result
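# A minimal usage sketch for evaluate, assuming a `benchmark` instance and a `learners`
# sequence have already been constructed (their construction is not shown here). The
# transaction_log path and seed below are hypothetical example values:
#
#   result = benchmark.evaluate(learners, transaction_log="./experiment.log", seed=10)
#
# Passing the same transaction_log on a later run restores finished work through
# Result.from_transaction_log, so only tasks missing from the log are re-executed.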
def test_multiprocess_singletask(self):
    source = MemorySource(list(range(4)))
    sink   = MemorySink()

    Pipe.join(source, [Pipe_Tests.ProcessNameFilter()], sink).run(2, 1)

    self.assertEqual(len(set(sink.items)), 4)
def test_single_process_multitask(self):
    source = MemorySource(list(range(10)))
    sink   = MemorySink()

    Pipe.join(source, [Pipe_Tests.ProcessNameFilter()], sink).run()

    self.assertEqual(sink.items, ['MainProcess'] * 10)
@staticmethod
def from_transaction_log(filename: Optional[str]) -> 'Result':
    """Create a Result from a transaction file."""

    if filename is None or not Path(filename).exists():
        return Result()

    # rewrite the log in place, promoting any old-version transactions to the current version
    Pipe.join(DiskSource(filename), [JsonDecode(), TransactionPromote(), JsonEncode()], DiskSink(filename, 'w')).run()

    return Result.from_transactions(Pipe.join(DiskSource(filename), [JsonDecode()]).read())
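# A sketch of the restore path, assuming "./experiment.log" is a hypothetical transaction
# log written by a previous evaluate run:
#
#   restored = Result.from_transaction_log("./experiment.log")  # promoted in place, then decoded
#   empty    = Result.from_transaction_log(None)                # no log yields an empty Result
#
# Note that the first Pipe.join above mutates the file itself: the promoted transactions
# are re-encoded and written back to disk before the file is read a second time.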
def test_logging(self):
    actual_logs = []

    ExecutionContext.Logger = UniversalLogger(lambda msg, end: actual_logs.append((msg, end)))

    source = MemorySource(list(range(4)))
    sink   = MemorySink()

    Pipe.join(source, [Pipe_Tests.ProcessNameFilter()], sink).run(2, 1)

    self.assertEqual(len(actual_logs), 4)
    self.assertEqual(sink.items, [ l[0][20:] for l in actual_logs ])
def __init__(self,
    source             : Source[Simulation],
    filters            : Sequence[Filter[Simulation, Union[Simulation, BatchedSimulation]]],
    source_description : str = "",
    filter_descriptions: Sequence[str] = []) -> None:

    if isinstance(source, BenchmarkSimulation):
        # flatten a nested BenchmarkSimulation by splicing its source and filters into this pipe
        self._source             = source._source #type: ignore
        self._filter             = Pipe.FiltersFilter(list(source._filter._filters) + list(filters)) #type: ignore
        self.source_description  = source.source_description or source_description #type: ignore
        self.filter_descriptions = list(source.filter_descriptions) + list(filter_descriptions) #type: ignore
    else:
        self._source             = source
        self._filter             = Pipe.FiltersFilter(filters)
        self.source_description  = source_description
        self.filter_descriptions = list(filter_descriptions)
def _process_task(self, task) -> Iterable[Any]:

    # each task is a tuple of parallel sequences: (simulation_ids, learner_ids, learners, simulation_pipes)
    simulation_ids   = task[0]
    learner_ids      = task[1]
    learners         = task[2]
    simulation_pipes = task[3]

    simulation_source  = simulation_pipes[0]._source #we only need one source since we group by sources when making tasks
    simulation_filters = [ simulation_pipe._filter for simulation_pipe in simulation_pipes ]

    collapsed_pipe = Pipe.join(simulation_source, [ForeachFilter(simulation_filters)])

    written_simulations = []

    try:
        for simulation_id, learner_id, learner, pipe, simulation in zip(simulation_ids, learner_ids, learners, simulation_pipes, collapsed_pipe.read()):

            batches      = simulation.interaction_batches
            interactions = list(chain.from_iterable(batches))

            # only write each simulation's description transaction once per task
            if simulation_id not in written_simulations:
                written_simulations.append(simulation_id)
                yield Transaction.simulation(simulation_id,
                    source            = pipe.source_description,
                    filters           = pipe.filter_descriptions,
                    interaction_count = len(interactions),
                    batch_count       = len(batches),
                    context_size      = int(median(self._context_sizes(interactions))),
                    action_count      = int(median(self._action_counts(interactions))))

            # deep copy so each (simulation, learner) pair evaluates against a fresh learner
            learner = deepcopy(learner)
            learner.init()

            if len(batches) > 0:
                Ns, Rs = zip(*[ self._process_batch(batch, simulation.reward, learner) for batch in batches ])
                yield Transaction.batch(simulation_id, learner_id, N=list(Ns), reward=list(Rs))

    except KeyboardInterrupt:
        raise

    except Exception as e:
        ExecutionContext.Logger.log_exception(e, "unhandled exception:")
        if not self._ignore_raise:
            raise e
def __init__(self, transaction_log: Optional[str], restored: Result) -> None:

    # write JSON to disk when a log file is given, otherwise buffer transactions in memory
    self._sink = Pipe.join([JsonEncode()], DiskSink(transaction_log)) if transaction_log else MemorySink()

    # drop transactions already present in the restored Result so resumed runs don't duplicate work
    self._sink = Pipe.join([TransactionIsNew(restored)], self._sink)
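# A sketch of the resulting sink composition, under the assumption that Pipe.join with
# only filters and a sink returns a sink whose writes pass through those filters in order:
#
#   TransactionIsNew(restored) -> JsonEncode() -> DiskSink(transaction_log)   # when logging to disk
#   TransactionIsNew(restored) -> MemorySink()                                # when no log is given
#
# Either way, previously recorded transactions are filtered out before reaching the
# underlying sink, which is what makes restored runs idempotent.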
def read(self) -> Tuple[Sequence[Sequence[Any]], Sequence[Any]]:

    #placing some of these at the top would cause circular references
    from coba.data.pipes    import Pipe
    from coba.data.encoders import Encoder, NumericEncoder, OneHotEncoder, StringEncoder
    from coba.data.filters  import CsvReader, LabeledCsvCleaner

    data_id        = self._data_id
    md5_checksum   = self._md5_checksum
    openml_api_key = ExecutionContext.Config.openml_api_key

    data_description_url = f'https://www.openml.org/api/v1/json/data/{data_id}'
    type_description_url = f'https://www.openml.org/api/v1/json/data/features/{data_id}'

    if openml_api_key is not None:
        data_description_url += f'?api_key={openml_api_key}'
        type_description_url += f'?api_key={openml_api_key}'

    descr = json.loads(''.join(HttpSource(data_description_url, '.json', None, 'descr').read()))["data_set_description"]

    if descr['status'] == 'deactivated':
        raise Exception(f"Openml {data_id} has been deactivated. This is often due to flags on the data.")

    types = json.loads(''.join(HttpSource(type_description_url, '.json', None, 'types').read()))["data_features"]["feature"]

    headers : List[str]     = []
    encoders: List[Encoder] = []
    ignored : List[bool]    = []
    target  : str           = ""

    for tipe in types:

        headers.append(tipe['name'])
        ignored.append(tipe['is_ignore'] == 'true' or tipe['is_row_identifier'] == 'true')

        if tipe['is_target'] == 'true':
            target = tipe['name']

        if tipe['data_type'] == 'numeric':
            encoders.append(NumericEncoder())
        elif tipe['data_type'] == 'nominal' and tipe['is_target'] == 'false':
            encoders.append(OneHotEncoder(singular_if_binary=True))
        elif tipe['data_type'] == 'nominal' and tipe['is_target'] == 'true':
            encoders.append(OneHotEncoder())
        else:
            encoders.append(StringEncoder())

    # a numeric target means the dataset isn't natively classification, so look up a nominal target instead
    if isinstance(encoders[headers.index(target)], NumericEncoder):
        target = self._get_classification_target(data_id, openml_api_key)

        ignored[headers.index(target)]  = False
        encoders[headers.index(target)] = OneHotEncoder()

    csv_url = f"http://www.openml.org/data/v1/get_csv/{descr['file_id']}"

    source  = HttpSource(csv_url, ".csv", md5_checksum, f"openml {data_id}")
    reader  = CsvReader()
    cleaner = LabeledCsvCleaner(target, headers, encoders, ignored, True)

    feature_rows, label_rows = Pipe.join(source, [reader, cleaner]).read()

    return list(feature_rows), list(label_rows)
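# A hypothetical usage sketch, assuming this read method lives on an OpenML source class;
# the class name and constructor arguments below are assumptions, not shown in this file:
#
#   source                   = OpenmlSource(data_id=150, md5_checksum=None)
#   feature_rows, label_rows = source.read()
#
# feature_rows holds one encoded feature vector per CSV row, and label_rows holds the
# one-hot encoded values of the target column selected (or looked up) above.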
def test_exception_singleprocess(self):
    source = MemorySource(list(range(4)))
    sink   = MemorySink()

    with self.assertRaises(Exception):
        Pipe.join(source, [Pipe_Tests.ExceptionFilter()], sink).run()