def __init__(self, *args, **kwargs) -> None:
    """Instantiate a SupervisedSimulation."""

    if 'source' in kwargs or (args and hasattr(args[0], 'read')):
        source     = args[0] if len(args) > 0 else kwargs['source']
        label_col  = args[1] if len(args) > 1 else kwargs.get("label_col", None)
        label_type = args[2] if len(args) > 2 else kwargs.get("label_type", "C")
        take       = args[3] if len(args) > 3 else kwargs.get("take", None)

        if take is not None:
            source = Pipes.join(source, Reservoir(take))

        if label_col is not None:
            source = Pipes.join(source, Structure((None, label_col)))

        params = source.params
    else:
        X          = args[0]
        Y          = args[1]
        label_type = args[2] if len(args) > 2 else kwargs.get("label_type", "C")

        source = ListSource(list(zip(X, Y)))
        params = {"source": "[X,Y]"}

    self._label_type = label_type
    self._source     = source
    self._params     = {**params, "label_type": self._label_type, "type": "SupervisedSimulation"}
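# A brief usage sketch for the two constructor forms above. The data values are
# made up, and the import paths (coba.environments, coba.pipes) are assumptions:

from coba.environments import SupervisedSimulation
from coba.pipes import CsvSource

X = [[1, 2], [3, 4], [5, 6]]  # three dense feature rows
Y = ["A", "B", "A"]           # matching class labels

# the [X,Y] form: features and labels are zipped into an in-memory ListSource
sim = SupervisedSimulation(X, Y, label_type="C")  # "C" requests classification labels

# the source form: any Source with read(), an optional label column, and a reservoir take
sim = SupervisedSimulation(CsvSource("./data.csv", has_header=True), "class", take=100)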
def evaluate(self, result_file: str = None) -> Result:
    """Evaluate the experiment and return the results.

    Args:
        result_file: The file for writing and restoring results.
    """
    cb, mp, mt = self.chunk_by, self.processes, self.maxchunksperchild

    if mp > 1 or mt != 0:
        CobaContext.logger = DecoratedLogger([ExceptLog()], CobaContext.logger, [NameLog(), StampLog()])
    else:
        CobaContext.logger = DecoratedLogger([ExceptLog()], CobaContext.logger, [StampLog()])

    restored = Result.from_file(result_file) if result_file and Path(result_file).exists() else Result()

    n_given_learners     = len(self._learners)
    n_given_environments = len(self._environments)

    if len(restored.experiment) != 0:
        assert n_given_learners     == restored.experiment['n_learners'],     "The current experiment doesn't match the given transaction log."
        assert n_given_environments == restored.experiment['n_environments'], "The current experiment doesn't match the given transaction log."

    workitems  = CreateWorkItems(self._environments, self._learners, self._learner_task, self._environment_task, self._evaluation_task)
    unfinished = RemoveFinished(restored)
    chunk      = ChunkByTask() if cb == 'task' else ChunkBySource()
    sink       = TransactionIO(result_file)

    single_process = ProcessWorkItems()
    multi_process  = Pipes.join(chunk, CobaMultiprocessor(ProcessWorkItems(), mp, mt))
    process        = multi_process if mp > 1 or mt != 0 else single_process

    try:
        if not restored:
            sink.write(["T0", n_given_learners, n_given_environments])
        Pipes.join(workitems, unfinished, process, Foreach(sink)).run()

    except KeyboardInterrupt:  # pragma: no cover
        CobaContext.logger.log("Experiment execution was manually aborted via Ctrl-C")

    except Exception as ex:  # pragma: no cover
        CobaContext.logger.log(ex)

    if isinstance(CobaContext.logger, DecoratedLogger):
        CobaContext.logger = CobaContext.logger.undecorate()

    return sink.read()
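# A hedged sketch of running an experiment with a transaction log. The
# Experiment construction and the learner import are assumptions about the
# public coba API; the log path is hypothetical:

from coba.experiments import Experiment
from coba.learners import RandomLearner
from coba.environments import SupervisedSimulation

sim = SupervisedSimulation([[1, 2], [3, 4]] * 10, ["A", "B"] * 10)

# with a result_file, an interrupted run restores its finished work on the next call
result = Experiment([sim], [RandomLearner()]).evaluate("./experiment.log")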
def __init__(self, pre_decorators: Sequence[Filter], logger: Logger, post_decorators: Sequence[Filter]):
    """Instantiate DecoratedLogger.

    Args:
        pre_decorators: A sequence of decorators to be applied before the base logger.
        logger: The base logger we are decorating.
        post_decorators: A sequence of decorators to be applied after the base logger.
    """
    self._pre_decorator   = Pipes.join(*pre_decorators) if pre_decorators else Identity()
    self._post_decorators = post_decorators
    self._logger          = logger
    self._original_sink   = self._logger.sink
    self._logger.sink     = Pipes.join(*post_decorators, self._original_sink)
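# A minimal sketch of the decoration pattern, mirroring exactly how evaluate()
# above wraps the context logger (imports omitted; every name here appears in
# evaluate() above): ExceptLog runs before the base logger, while NameLog and
# StampLog decorate its sink.

CobaContext.logger = DecoratedLogger([ExceptLog()], CobaContext.logger, [NameLog(), StampLog()])

# ... logging now names and timestamps each message ...

CobaContext.logger = CobaContext.logger.undecorate()  # restore the original sink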
def _construct(item: Any) -> Sequence[Any]:
    result = None

    if isinstance(item, str) and item in variables:
        result = variables[item]

    if isinstance(item, str) and item not in variables:
        result = CobaRegistry.construct(item)

    if isinstance(item, dict):
        result = CobaRegistry.construct(item)

    if isinstance(item, list):
        pieces = list(map(_construct, item))

        if hasattr(pieces[0][0], 'read'):
            result = [Pipes.join(s, *f) for s in pieces[0] for f in product(*pieces[1:])]
        else:
            result = sum(pieces, [])

    if result is None:
        raise CobaException(f"We were unable to construct {item} in the given environment definition file.")

    return result if isinstance(result, collections.abc.Sequence) else [result]
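# A small standalone demonstration of the list branch above (plain strings stand
# in for constructed coba objects): when the first element resolves to sources,
# each source is joined with the cartesian product of the remaining filter lists.

from itertools import product

sources = ["A.csv", "B.csv"]
filters = [["Shuffle(1)", "Shuffle(2)"]]

pipelines = [(s, *f) for s in sources for f in product(*filters)]
# -> [('A.csv', 'Shuffle(1)'), ('A.csv', 'Shuffle(2)'),
#     ('B.csv', 'Shuffle(1)'), ('B.csv', 'Shuffle(2)')]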
def read(self) -> 'Result':
    n_lrns   = None
    n_sims   = None
    lrn_rows = {}
    sim_rows = {}
    int_rows = {}

    if isinstance(self._source, ListSource):
        decoded_source = self._source
    else:
        decoded_source = Pipes.join(self._source, Foreach(JsonDecode()))

    for trx in decoded_source.read():

        if not trx: continue

        if trx[0] == "benchmark":
            n_lrns = trx[1]["n_learners"]
            n_sims = trx[1]["n_simulations"]

        if trx[0] == "S":
            sim_rows[trx[1]] = trx[2]

        if trx[0] == "L":
            lrn_rows[trx[1]] = trx[2]

        if trx[0] == "I":
            int_rows[tuple(trx[1])] = trx[2]

    return Result(n_lrns, n_sims, sim_rows, lrn_rows, int_rows)
def read(self) -> 'Result':
    n_lrns   = None
    n_sims   = None
    lrn_rows = {}
    env_rows = {}
    int_rows = {}

    if isinstance(self._source, ListSource):
        decoded_source = self._source
    else:
        decoded_source = Pipes.join(self._source, Foreach(JsonDecode()))

    for trx in decoded_source.read():

        if not trx: continue

        if trx[0] == "experiment":
            n_lrns = trx[1]["n_learners"]
            n_sims = trx[1]["n_environments"]

        if trx[0] == "E":
            env_rows[trx[1]] = trx[2]

        if trx[0] == "L":
            lrn_rows[trx[1]] = trx[2]

        if trx[0] == "I":
            int_rows[tuple(trx[1])] = trx[2]

    return Result(n_lrns, n_sims, env_rows, lrn_rows, int_rows)
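# A sketch of the transaction rows this reader consumes, exercising the
# in-memory ListSource branch above (the payload dicts are hypothetical; only
# the row keys and shapes are taken from the parsing branches):

trx_log = ListSource([
    ["experiment", {"n_learners": 1, "n_environments": 1}],  # experiment counts
    ["E", 0, {"source": "[X,Y]"}],                           # environment row, keyed by id
    ["L", 0, {"family": "Random"}],                          # learner row, keyed by id
    ["I", [0, 0], {"reward": [0, 1]}],                       # interaction row, keyed by (env_id, lrn_id)
])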
def filter(self, items: Iterable[Any]) -> Iterable[Any]:
    try:
        with Manager() as manager:
            stdlog = QueueIO(Queue())
            stderr = CobaMultiprocessor.PipeStderr()

            log_thread = Thread(target=Pipes.join(stdlog, Foreach(CobaContext.logger.sink)).run)
            log_thread.daemon = True
            log_thread.start()

            logger = CobaContext.logger
            cacher = ConcurrentCacher(CobaContext.cacher, manager.dict(), Lock(), Condition())
            store  = {"srcsema": Semaphore(2)}

            filter = CobaMultiprocessor.ProcessFilter(self._filter, logger, cacher, store, stdlog)

            for item in Multiprocessor(filter, self._processes, self._maxtasksperchild, stderr).filter(items):
                yield item

            # attempt to shut down the logging thread gracefully by sending the poison pill
            stdlog.write(None)

    except RuntimeError as e:  # pragma: no cover
        # this happens when importing __main__ causes this code to run again
        coba_exit(str(e))
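# A minimal sketch of wiring the multiprocessor into a pipeline, exactly as
# evaluate() above does (processes=2 and maxtasksperchild=0 are arbitrary values
# here). Chunking by source presumably keeps work items that share a source in
# the same chunk, so each source only needs to be read once per child:

multi_process = Pipes.join(ChunkBySource(), CobaMultiprocessor(ProcessWorkItems(), 2, 0))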
def test_with_pipes(self):
    self.assertEqual(
        {'type': 'DummyEnvironment', "shuffle": 1},
        SafeEnvironment(Pipes.join(DummyEnvironment(), Shuffle(1))).params
    )
def __init__(self, source: Union[str, Source[Iterable[str]]]) -> None:
    """Instantiate a LibsvmSource.

    Args:
        source: The data source. Accepts either a string representing the source
            location or another Source.
    """
    source = UrlSource(source) if isinstance(source, str) else source
    reader = LibsvmReader()

    self._source = Pipes.join(source, reader)
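# A short usage sketch; the URL is hypothetical, and read() is assumed to
# delegate to the joined source/reader pipeline stored in self._source:

rows = list(LibsvmSource("http://example.com/train.libsvm").read())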
def filter(self, filter: Union[EnvironmentFilter, Sequence[EnvironmentFilter]]) -> 'Environments':
    """Apply filters to each environment currently in Environments."""

    filters = filter if isinstance(filter, collections.abc.Sequence) else [filter]

    self._environments = [Pipes.join(e, f) for e in self._environments for f in filters]

    return self
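# A brief sketch of the cross-product behavior above; the Environments
# construction shown is an assumption about the public API, and Shuffle is the
# filter used in the tests below:

envs = Environments(SupervisedSimulation([[1, 2], [3, 4]] * 10, ["A", "B"] * 10))
envs = envs.filter([Shuffle(1), Shuffle(2)])  # 1 environment x 2 filters -> 2 environments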
def read(self) -> Iterable[Tuple[Any, Any]]:
    """Read and parse the openml source."""
    try:
        dataset_description = self._get_dataset_description(self._data_id)

        if dataset_description['status'] == 'deactivated':
            raise CobaException(f"Openml {self._data_id} has been deactivated. This is often due to flags on the data.")

        feature_descriptions = self._get_feature_descriptions(self._data_id)
        task_descriptions    = self._get_task_descriptions(self._data_id)

        is_ignore = lambda r: (
            r['is_ignore'] == 'true' or
            r['is_row_identifier'] == 'true' or
            r['data_type'] not in ['numeric', 'nominal']
        )

        ignore = [self._name_cleaning(f['name']) for f in feature_descriptions if is_ignore(f)]
        target = self._name_cleaning(self._get_target_for_problem_type(task_descriptions))

        if target in ignore:
            ignore.pop(ignore.index(target))

        def row_has_missing_values(row):
            row_values = row._values.values() if isinstance(row, SparseWithMeta) else row._values
            return "?" in row_values or "" in row_values

        source    = ListSource(self._get_dataset_lines(dataset_description["file_id"], None))
        reader    = ArffReader(cat_as_str=self._cat_as_str)
        drop      = Drop(drop_cols=ignore, drop_row=row_has_missing_values)
        structure = Structure([None, target])

        return Pipes.join(source, reader, drop, structure).read()

    except KeyboardInterrupt:
        # we don't want to clear the cache in the case of a KeyboardInterrupt
        raise

    except CobaException:
        # we don't want to clear the cache if it is an error we know about
        # (the original raise should clear if needed)
        raise

    except Exception:
        # if something unexpected went wrong, clear the cache just in case it was corrupted somehow
        self._clear_cache()
        raise
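# A usage sketch, assuming the constructor takes the openml data id referenced
# above as self._data_id (150 is an arbitrary id):

features_and_labels = list(OpenmlSource(150).read())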
def test_two_environment_tasks(self):
    filter = CountFilter()
    src1 = CountReadSimulation()
    sim1 = Pipes.join(src1, filter)
    sim2 = Pipes.join(src1, filter)
    task1 = ObserveTask()
    task2 = ObserveTask()

    items = [
        WorkItem(0, None, sim1, None, task1),
        WorkItem(1, None, sim2, None, task2)
    ]

    transactions = list(ProcessWorkItems().filter(items))

    self.assertEqual(len(task1.observed[1]), 1)
    self.assertEqual(len(task2.observed[1]), 1)

    self.assertEqual(task1.observed[1][0].context, (0, 0))
    self.assertEqual(task2.observed[1][0].context, (0, 1))

    self.assertEqual(['T2', 0, {}], transactions[0])
    self.assertEqual(['T2', 1, {}], transactions[1])
def __init__(self, source: Union[str, Source[Iterable[str]]], has_header: bool = False, **dialect) -> None:
    """Instantiate a CsvSource.

    Args:
        source: The data source. Accepts either a string representing the source
            location or another Source.
        has_header: Indicates if the CSV file has a header row.
    """
    source = UrlSource(source) if isinstance(source, str) else source
    reader = CsvReader(has_header, **dialect)

    self._source = Pipes.join(source, reader)
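# A short usage sketch; the path is hypothetical, and extra dialect kwargs
# (e.g. delimiter) are assumed to pass straight through to CsvReader:

rows = list(CsvSource("./data.csv", has_header=True, delimiter=",").read())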
def test_two_eval_tasks_one_source_two_env(self):
    filter = CountFilter()
    src1 = CountReadSimulation()
    sim1 = Pipes.join(src1, filter)
    sim2 = Pipes.join(src1, filter)
    lrn1 = ModuloLearner("1")
    lrn2 = ModuloLearner("2")
    task1 = ObserveTask()
    task2 = ObserveTask()

    items = [
        WorkItem(0, 0, sim1, lrn1, task1),
        WorkItem(1, 1, sim2, lrn2, task2)
    ]

    transactions = list(ProcessWorkItems().filter(items))

    self.assertEqual(len(task1.observed[1]), 1)
    self.assertEqual(len(task2.observed[1]), 1)

    self.assertEqual(task1.observed[1][0].context, (0, 0))
    self.assertEqual(task2.observed[1][0].context, (0, 1))

    self.assertEqual(['T3', (0, 0), []], transactions[0])
    self.assertEqual(['T3', (1, 1), []], transactions[1])
def __init__(self,
             source: Union[str, Source[Iterable[str]]],
             cat_as_str: bool = False,
             skip_encoding: bool = False,
             lazy_encoding: bool = True,
             header_indexing: bool = True) -> None:
    """Instantiate an ArffSource.

    Args:
        source: The data source. Accepts either a string representing the source
            location or another Source.
        cat_as_str: Indicates that categorical features should be encoded as strings
            rather than one-hot encoded.
        skip_encoding: Indicates that features should not be encoded (this means all
            features will be strings).
        lazy_encoding: Indicates that features should be encoded lazily (this can
            save time if rows will be dropped).
        header_indexing: Indicates that header data should be preserved so rows can
            be indexed by header name.
    """
    source = UrlSource(source) if isinstance(source, str) else source
    reader = ArffReader(cat_as_str, skip_encoding, lazy_encoding, header_indexing)

    self._source = Pipes.join(source, reader)
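# A usage sketch; the path is hypothetical. With cat_as_str=True categorical
# values stay as strings instead of being one-hot encoded:

rows = list(ArffSource("./data.arff", cat_as_str=True).read())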
def test_environment_pipe_statistics_dense(self):
    env  = Pipes.join(SupervisedSimulation([[1, 2], [3, 4]] * 10, ["A", "B"] * 10), Shuffle(1))
    task = SimpleEnvironmentTask()

    self.assertEqual({**env.params}, task.process(env, env.read()))
def sink(self, sink: Sink[str]):
    self._original_sink = sink
    self._logger.sink   = Pipes.join(*self._post_decorators, sink)
def test_str(self):
    self.assertEqual('DummyEnvironment(shuffle=1)', str(SafeEnvironment(Pipes.join(DummyEnvironment(), Shuffle(1)))))
    self.assertEqual('DummyEnvironment', str(SafeEnvironment(DummyEnvironment())))