Exemplo n.º 1
0
    def __init__(self, *args, **kwargs) -> None:
        """Create a SupervisedSimulation from a source or from X and Y.

        Two calling conventions are recognized:
            * ``(source, label_col, label_type, take)``: chosen when a
              ``source`` keyword is given or the first positional argument
              has a ``read`` attribute. Everything may be passed either
              positionally or by keyword.
            * ``(X, Y, label_type)``: chosen otherwise. ``X`` and ``Y`` must
              be positional; ``label_type`` may be passed either way.

        ``label_type`` falls back to ``"C"`` under both conventions.
        """

        def _lookup(position: int, keyword: str, default=None):
            # A positional value takes precedence over a keyword value.
            if len(args) > position:
                return args[position]
            return kwargs.get(keyword, default)

        source_given = 'source' in kwargs or (args and hasattr(args[0], 'read'))

        if source_given:
            source = args[0] if args else kwargs['source']
            label_col = _lookup(1, "label_col")
            label_type = _lookup(2, "label_type", "C")
            take = _lookup(3, "take")

            # The reservoir is joined before the structure filter.
            if take is not None:
                source = Pipes.join(source, Reservoir(take))

            if label_col is not None:
                source = Pipes.join(source, Structure((None, label_col)))

            # Params are read from the (possibly extended) pipe.
            params = source.params

        else:
            X, Y = args[0], args[1]
            label_type = _lookup(2, "label_type", "C")
            source = ListSource(list(zip(X, Y)))
            params = {"source": "[X,Y]"}

        self._label_type = label_type
        self._source = source
        self._params = {
            **params,
            "label_type": self._label_type,
            "type": "SupervisedSimulation",
        }
Exemplo n.º 2
0
    def evaluate(self, result_file: str = None) -> Result:
        """Evaluate the experiment and return the results.

        Args:
            result_file: The file for writing and restoring results. When it is
                given and already exists, results are first restored from it.

        Returns:
            Whatever is read back from the transaction sink once processing stops.

        Raises:
            AssertionError: If the restored results record a different number of
                learners or environments than the current experiment has.
        """
        cb, mp, mt = self.chunk_by, self.processes, self.maxchunksperchild
        use_multiprocessing = mp > 1 or mt != 0

        # NameLog is only added to the post decorators when multiprocessing.
        if use_multiprocessing:
            CobaContext.logger = DecoratedLogger(
                [ExceptLog()], CobaContext.logger,
                [NameLog(), StampLog()])
        else:
            CobaContext.logger = DecoratedLogger([ExceptLog()],
                                                 CobaContext.logger,
                                                 [StampLog()])

        # From here on the global logger is decorated, so everything runs inside
        # try/finally. Otherwise a failed assertion (or any other error raised
        # before processing starts) would leave the global logger decorated.
        try:
            restored = Result.from_file(result_file) if result_file and Path(
                result_file).exists() else Result()

            n_given_learners = len(self._learners)
            n_given_environments = len(self._environments)

            if len(restored.experiment) != 0:
                assert n_given_learners == restored.experiment[
                    'n_learners'], "The current experiment doesn't match the given transaction log."
                assert n_given_environments == restored.experiment[
                    'n_environments'], "The current experiment doesn't match the given transaction log."

            workitems = CreateWorkItems(self._environments, self._learners,
                                        self._learner_task, self._environment_task,
                                        self._evaluation_task)
            unfinished = RemoveFinished(restored)
            chunk = ChunkByTask() if cb == 'task' else ChunkBySource()
            sink = TransactionIO(result_file)

            single_process = ProcessWorkItems()
            multi_process = Pipes.join(
                chunk, CobaMultiprocessor(ProcessWorkItems(), mp, mt))
            process = multi_process if use_multiprocessing else single_process

            try:
                # The "T0" header is only written when nothing was restored.
                if not restored:
                    sink.write(["T0", n_given_learners, n_given_environments])
                Pipes.join(workitems, unfinished, process, Foreach(sink)).run()

            except KeyboardInterrupt:  # pragma: no cover
                CobaContext.logger.log(
                    "Experiment execution was manually aborted via Ctrl-C")

            except Exception as ex:  # pragma: no cover
                # Failures are logged rather than raised so that whatever was
                # written to the sink can still be returned below.
                CobaContext.logger.log(ex)

        finally:
            if isinstance(CobaContext.logger, DecoratedLogger):
                CobaContext.logger = CobaContext.logger.undecorate()

        return sink.read()
Exemplo n.º 3
0
    def __init__(self, pre_decorators: Sequence[Filter], logger: Logger, post_decorators: Sequence[Filter]):
        """Wrap a logger with filters applied before and after it.

        Args:
            pre_decorators: Filters joined into a single pipe and kept as the pre-decorator.
                An Identity filter is kept instead when the sequence is empty.
            logger: The base logger being decorated. Its sink is replaced by this constructor.
            post_decorators: Filters joined in front of the base logger's original sink.
        """

        if pre_decorators:
            self._pre_decorator = Pipes.join(*pre_decorators)
        else:
            self._pre_decorator = Identity()

        self._post_decorators = post_decorators
        self._logger = logger

        # Keep a reference to the undecorated sink before it is replaced.
        self._original_sink = logger.sink
        logger.sink = Pipes.join(*post_decorators, self._original_sink)
Exemplo n.º 4
0
        def _construct(item: Any) -> Sequence[Any]:
            """Build the objects described by one item of the definition.

            Strings are looked up in ``variables`` first and otherwise handed to
            the registry, dicts are always handed to the registry, and lists are
            constructed element by element and then combined. The result is
            always returned as a sequence.
            """
            built = None

            if isinstance(item, str):
                if item in variables:
                    built = variables[item]
                else:
                    built = CobaRegistry.construct(item)

            elif isinstance(item, dict):
                built = CobaRegistry.construct(item)

            elif isinstance(item, list):
                parts = [_construct(sub_item) for sub_item in item]
                head, tail = parts[0], parts[1:]

                if hasattr(head[0], 'read'):
                    # The first element is readable: join each of them with
                    # every combination of the remaining elements.
                    built = []
                    for readable in head:
                        for combination in product(*tail):
                            built.append(Pipes.join(readable, *combination))
                else:
                    # Otherwise flatten the constructed pieces into one list.
                    built = sum(parts, [])

            if built is None:
                raise CobaException(
                    f"We were unable to construct {item} in the given environment definition file."
                )

            if isinstance(built, collections.abc.Sequence):
                return built

            return [built]
Exemplo n.º 5
0
    def read(self) -> 'Result':
        """Replay every transaction in the source and gather them into a Result.

        A ListSource is read as is; any other source has each of its items
        passed through JsonDecode first. Empty transactions are skipped.
        """
        n_lrns = n_sims = None
        lrn_rows, sim_rows, int_rows = {}, {}, {}

        source = self._source
        if not isinstance(source, ListSource):
            source = Pipes.join(source, Foreach(JsonDecode()))

        for trx in source.read():

            if not trx:
                continue

            tag = trx[0]

            if tag == "benchmark":
                header = trx[1]
                n_lrns = header["n_learners"]
                n_sims = header["n_simulations"]

            elif tag == "S":
                sim_rows[trx[1]] = trx[2]

            elif tag == "L":
                lrn_rows[trx[1]] = trx[2]

            elif tag == "I":
                # Interaction rows are keyed by a tuple built from trx[1].
                int_rows[tuple(trx[1])] = trx[2]

        return Result(n_lrns, n_sims, sim_rows, lrn_rows, int_rows)
Exemplo n.º 6
0
    def read(self) -> 'Result':
        """Rebuild a Result from the transactions held by the source.

        Items of a ListSource are used directly, while items of any other
        source are JSON decoded one by one. Falsy transactions are ignored.
        """

        learner_count = None
        environment_count = None
        learner_rows = {}
        environment_rows = {}
        interaction_rows = {}

        if isinstance(self._source, ListSource):
            transactions = self._source
        else:
            transactions = Pipes.join(self._source, Foreach(JsonDecode()))

        for transaction in transactions.read():

            if not transaction:
                continue

            kind = transaction[0]

            if kind == "experiment":
                counts = transaction[1]
                learner_count = counts["n_learners"]
                environment_count = counts["n_environments"]
            elif kind == "E":
                environment_rows[transaction[1]] = transaction[2]
            elif kind == "L":
                learner_rows[transaction[1]] = transaction[2]
            elif kind == "I":
                # The key is converted to a tuple before being used as a dict key.
                interaction_rows[tuple(transaction[1])] = transaction[2]

        return Result(learner_count, environment_count, environment_rows,
                      learner_rows, interaction_rows)
Exemplo n.º 7
0
    def filter(self, items: Iterable[Any]) -> Iterable[Any]:
        """Run the wrapped filter over items via a Multiprocessor, yielding each result.

        While the items are processed, a daemon thread forwards everything written
        to a shared log queue into the current CobaContext logger sink.

        Args:
            items: The items handed to the Multiprocessor's filter.

        Returns:
            A generator over the items produced by the Multiprocessor.
        """

        try:

            # The manager supplies the dict shared with the ConcurrentCacher below.
            with Manager() as manager:

                stdlog = QueueIO(Queue())
                stderr = CobaMultiprocessor.PipeStderr()

                # Forward queued log entries to the current logger's sink on a
                # daemon thread, which does not keep the interpreter alive at exit.
                log_thread = Thread(target=Pipes.join(
                    stdlog, Foreach(CobaContext.logger.sink)).run)
                log_thread.daemon = True
                log_thread.start()

                logger = CobaContext.logger
                cacher = ConcurrentCacher(CobaContext.cacher, manager.dict(),
                                          Lock(), Condition())
                # NOTE(review): "srcsema" presumably limits how many sources are
                # read concurrently -- confirm where the store is consumed.
                store = {"srcsema": Semaphore(2)}

                # Bundle the wrapped filter with the logger, cacher, store and log queue.
                filter = CobaMultiprocessor.ProcessFilter(
                    self._filter, logger, cacher, store, stdlog)

                for item in Multiprocessor(filter, self._processes,
                                           self._maxtasksperchild,
                                           stderr).filter(items):
                    yield item

                # Only reached once every item has been yielded and consumed.
                stdlog.write(
                    None
                )  # attempt to shut down the logging thread gracefully by sending the poison pill

        except RuntimeError as e:  #pragma: no cover
            # This happens when importing main causes this code to run again
            coba_exit(str(e))
Exemplo n.º 8
0
 def test_with_pipes(self):
     # The params of a wrapped pipe should hold both the environment's
     # type and the shuffle value of the joined filter.
     expected_params = {'type': 'DummyEnvironment', "shuffle": 1}
     piped_environment = Pipes.join(DummyEnvironment(), Shuffle(1))

     self.assertEqual(expected_params,
                      SafeEnvironment(piped_environment).params)
Exemplo n.º 9
0
    def __init__(self, source: Union[str, Source[Iterable[str]]]) -> None:
        """Create a LibsvmSource.

        Args:
            source: Where the data comes from. A string is wrapped in a UrlSource,
                anything else is used as the source directly.
        """
        if isinstance(source, str):
            source = UrlSource(source)

        # Whatever the source yields is handed to a LibsvmReader.
        self._source = Pipes.join(source, LibsvmReader())
Exemplo n.º 10
0
 def filter(
     self, filter: Union[EnvironmentFilter, Sequence[EnvironmentFilter]]
 ) -> 'Environments':
     """Join every current environment with every given filter.

     A single filter is treated as a one element sequence. With n
     environments and m filters the result holds n*m environments,
     grouped by environment. The instance is updated in place and returned.
     """
     if isinstance(filter, collections.abc.Sequence):
         filters = filter
     else:
         filters = [filter]

     joined = []
     for environment in self._environments:
         for single_filter in filters:
             joined.append(Pipes.join(environment, single_filter))

     self._environments = joined
     return self
Exemplo n.º 11
0
    def read(self) -> Iterable[Tuple[Any, Any]]:
        """Read the openml dataset and return the result of the parsing pipeline.

        Features flagged as ignored, flagged as row identifiers, or having a
        data type other than numeric or nominal are dropped (unless the feature
        is the target), as are rows containing "?" or "" values.

        Raises:
            CobaException: If the dataset's status is 'deactivated'.
        """
        try:
            dataset_description = self._get_dataset_description(self._data_id)

            if dataset_description['status'] == 'deactivated':
                raise CobaException(
                    f"Openml {self._data_id} has been deactivated. This is often due to flags on the data."
                )

            feature_descriptions = self._get_feature_descriptions(
                self._data_id)
            task_descriptions = self._get_task_descriptions(self._data_id)

            def should_ignore(feature) -> bool:
                if feature['is_ignore'] == 'true':
                    return True
                if feature['is_row_identifier'] == 'true':
                    return True
                return feature['data_type'] not in ['numeric', 'nominal']

            ignored = []
            for feature in feature_descriptions:
                if should_ignore(feature):
                    ignored.append(self._name_cleaning(feature['name']))

            target = self._name_cleaning(
                self._get_target_for_problem_type(task_descriptions))

            # The target must never be dropped, even if it looks ignorable.
            if target in ignored:
                ignored.remove(target)

            def row_has_missing_values(row):
                values = row._values
                if isinstance(row, SparseWithMeta):
                    values = values.values()
                return "?" in values or "" in values

            lines = self._get_dataset_lines(dataset_description["file_id"],
                                            None)

            pipeline = Pipes.join(
                ListSource(lines),
                ArffReader(cat_as_str=self._cat_as_str),
                Drop(drop_cols=ignored, drop_row=row_has_missing_values),
                Structure([None, target]),
            )

            return pipeline.read()

        except (KeyboardInterrupt, CobaException):
            # The cache is left alone for a manual interrupt and for errors we
            # know about (the original raise should clear it if needed).
            raise

        except Exception:
            # Something unexpected went wrong, so clear the cache just in case
            # it was corrupted somehow.
            self._clear_cache()
            raise
Exemplo n.º 12
0
    def test_two_environment_tasks(self):

        # Both simulations are built from the same source and filter instances.
        counter = CountFilter()
        shared_source = CountReadSimulation()
        simulations = [
            Pipes.join(shared_source, counter),
            Pipes.join(shared_source, counter),
        ]

        first_task, second_task = ObserveTask(), ObserveTask()

        work = [
            WorkItem(0, None, simulations[0], None, first_task),
            WorkItem(1, None, simulations[1], None, second_task),
        ]

        transactions = list(ProcessWorkItems().filter(work))

        # Each task observed exactly one interaction...
        for task in (first_task, second_task):
            self.assertEqual(len(task.observed[1]), 1)

        # ...and the two observed contexts differ in their second value.
        self.assertEqual(first_task.observed[1][0].context, (0,0))
        self.assertEqual(second_task.observed[1][0].context, (0,1))

        self.assertEqual(['T2', 0, {}], transactions[0])
        self.assertEqual(['T2', 1, {}], transactions[1])
Exemplo n.º 13
0
    def __init__(self,
                 source: Union[str, Source[Iterable[str]]],
                 has_header: bool = False,
                 **dialect) -> None:
        """Create a CsvSource.

        Args:
            source: Where the data comes from. A string is wrapped in a UrlSource,
                anything else is used as the source directly.
            has_header: Whether the CSV data has a header row.
            **dialect: Additional keyword arguments handed on to the CsvReader.
        """
        if isinstance(source, str):
            source = UrlSource(source)

        csv_reader = CsvReader(has_header, **dialect)
        self._source = Pipes.join(source, csv_reader)
Exemplo n.º 14
0
    def test_two_eval_tasks_one_source_two_env(self):

        # Two environments are built from one shared source and filter.
        counter = CountFilter()
        shared_source = CountReadSimulation()
        first_env = Pipes.join(shared_source, counter)
        second_env = Pipes.join(shared_source, counter)

        first_learner = ModuloLearner("1")
        second_learner = ModuloLearner("2")

        first_task, second_task = ObserveTask(), ObserveTask()

        work = [
            WorkItem(0, 0, first_env, first_learner, first_task),
            WorkItem(1, 1, second_env, second_learner, second_task),
        ]

        transactions = list(ProcessWorkItems().filter(work))

        # Each task observed exactly one interaction...
        for task in (first_task, second_task):
            self.assertEqual(len(task.observed[1]), 1)

        # ...and the two observed contexts differ in their second value.
        self.assertEqual(first_task.observed[1][0].context, (0,0))
        self.assertEqual(second_task.observed[1][0].context, (0,1))

        self.assertEqual(['T3', (0,0), []], transactions[0])
        self.assertEqual(['T3', (1,1), []], transactions[1])
Exemplo n.º 15
0
    def __init__(self,
                 source: Union[str, Source[Iterable[str]]],
                 cat_as_str: bool = False,
                 skip_encoding: bool = False,
                 lazy_encoding: bool = True,
                 header_indexing: bool = True) -> None:
        """Create an ArffSource.

        Args:
            source: Where the data comes from. A string is wrapped in a UrlSource,
                anything else is used as the source directly.
            cat_as_str: Encode categorical features as strings instead of one hot encoding them.
            skip_encoding: Do not encode features at all (every feature will then be a string).
            lazy_encoding: Encode features lazily (this can save time if rows will be dropped).
            header_indexing: Preserve header data so that rows can be indexed by header name.
        """
        if isinstance(source, str):
            source = UrlSource(source)

        # The reader options are forwarded positionally, in signature order.
        reader_options = (cat_as_str, skip_encoding, lazy_encoding,
                          header_indexing)
        self._source = Pipes.join(source, ArffReader(*reader_options))
Exemplo n.º 16
0
    def test_environment_pipe_statistics_dense(self):

        features = [[1,2],[3,4]]*10
        labels   = ["A","B"]*10

        env  = Pipes.join(SupervisedSimulation(features, labels), Shuffle(1))
        task = SimpleEnvironmentTask()

        # The task is expected to report exactly the params of the pipe.
        expected = {**env.params}
        actual   = task.process(env, env.read())

        self.assertEqual(expected, actual)
Exemplo n.º 17
0
 def sink(self, sink: Sink[str]):
     """Store the given sink and place the post decorators in front of it."""
     self._original_sink = sink

     # The base logger writes to the post decorators followed by the new sink.
     decorated_sink = Pipes.join(*self._post_decorators, sink)
     self._logger.sink = decorated_sink
Exemplo n.º 18
0
 def test_str(self):
     # A wrapped pipe shows the filter's params after the environment name.
     piped = SafeEnvironment(Pipes.join(DummyEnvironment(), Shuffle(1)))
     self.assertEqual('DummyEnvironment(shuffle=1)', str(piped))

     # A wrapped bare environment shows only the environment name.
     plain = SafeEnvironment(DummyEnvironment())
     self.assertEqual('DummyEnvironment', str(plain))