def test_join_flattens_filters(self):

    filter1 = Pipe.join([IdentityFilter(), IdentityFilter()])
    filter2 = Pipe.join([filter1, IdentityFilter()])
    filter3 = Pipe.join([filter2, filter2])

    self.assertEqual(6, len(filter3._filters))
def test_run(self):
    source = MemorySource(list(range(10)))
    sink   = MemorySink()

    Pipe.join(source, [Pipe_Tests.ProcessNameFilter()], sink).run()

    self.assertEqual(sink.items, ['MainProcess'] * 10)
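# A minimal, hedged sketch of the Source -> Filter -> Sink pattern that test_run exercises. It assumes
# Pipe, Filter, MemorySource and MemorySink are importable exactly as the tests above already use them;
# DoubleFilter and example_pipe_run are illustrative names and not part of the library.
def example_pipe_run():

    class DoubleFilter(Filter):
        """Illustrative filter that doubles every item flowing through the pipe."""
        def filter(self, items):
            return [2 * item for item in items]

    source = MemorySource([1, 2, 3])
    sink   = MemorySink()

    # join(source, filters, sink) builds the pipeline; run() pulls items from the source,
    # passes them through each filter in order, and writes the output to the sink.
    Pipe.join(source, [DoubleFilter()], sink).run()

    assert sink.items == [2, 4, 6]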
def from_file(filename: str) -> 'Result':
    """Create a Result from a transaction file."""

    #Why is this here??? This is really confusing in practice
    #if filename is None or not Path(filename).exists(): return Result()

    json_encode = Cartesian(JsonEncode())
    json_decode = Cartesian(JsonDecode())

    Pipe.join(DiskSource(filename), [json_decode, ResultPromote(), json_encode], DiskSink(filename, 'w')).run()

    return Result.from_transactions(Pipe.join(DiskSource(filename), [json_decode]).read())
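# A hedged usage sketch for from_file: the call first rewrites the transaction file in place (running
# every transaction through ResultPromote so older log formats are upgraded) and then re-reads the
# upgraded file into a Result. The path below is illustrative; the file must already exist and be writable.
def example_load_result():
    return Result.from_file('./my_benchmark.log')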
def test_repr3(self):
    filters = [Pipe_Tests.ReprFilter(), Pipe_Tests.ReprFilter()]
    sink    = Pipe_Tests.ReprSink()

    expected_repr = "ReprFilter,ReprFilter,ReprSink"
    self.assertEqual(expected_repr, str(Pipe.join(filters, sink)))
def test_repr2(self):
    source  = Pipe_Tests.ReprSource([0, 1, 2])
    filters = [Pipe_Tests.ReprFilter(), Pipe_Tests.ReprFilter()]

    expected_repr = "ReprSource,ReprFilter,ReprFilter"
    self.assertEqual(expected_repr, str(Pipe.join(source, filters)))
def test_repr4(self):
    filter = Pipe.FiltersFilter([Pipe_Tests.ReprFilter(), Pipe_Tests.ReprFilter()])

    expected_repr = "ReprFilter,ReprFilter"
    self.assertEqual(expected_repr, str(filter))
def test_repr1(self):
    source  = Pipe_Tests.ReprSource([0, 1, 2])
    filters = [Pipe_Tests.ReprFilter(), Pipe_Tests.ReprFilter()]
    sink    = Pipe_Tests.ReprSink()

    expected_repr = "ReprSource,ReprFilter,ReprFilter,ReprSink"
    self.assertEqual(expected_repr, str(Pipe.join(source, filters, sink)))
def __init__(self,
    reader      : Filter[Iterable[str], Any],
    source      : Union[str, Source[Iterable[str]]],
    label_column: Union[str, int],
    with_header : bool = True) -> None:

    self._reader = reader

    if isinstance(source, str) and source.startswith('http'):
        self._source = Pipe.join(HttpSource(source), [ResponseToLines()])
    elif isinstance(source, str):
        self._source = DiskSource(source)
    else:
        self._source = source

    self._label_column = label_column
    self._with_header  = with_header
    self._interactions = cast(Optional[Sequence[Interaction]], None)
def _construct(item: Any) -> Sequence[Any]:
    result = None

    if isinstance(item, str) and item in variables:
        result = variables[item]

    if isinstance(item, str) and item not in variables:
        result = CobaRegistry.construct(item)

    if isinstance(item, dict):
        result = CobaRegistry.construct(item)

    if isinstance(item, list):
        if any([isinstance(i, list) for i in item]):
            raise Exception("Recursive structures are not supported in benchmark simulation configs.")
        pieces = list(map(_construct, item))
        result = [Pipe.join(s, f) for s in pieces[0] for f in product(*pieces[1:])]

    if result is None:
        raise Exception(f"We were unable to construct {item} in the given benchmark file.")

    #collections.abc is required on Python 3.10+, where collections.Sequence was removed
    return result if isinstance(result, collections.abc.Sequence) else [result]
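# A hedged sketch of the list-expansion rule implemented above: the first entry of a config list
# constructs the source(s) and every remaining entry constructs a set of filter options; each source is
# then joined with every combination of those options. The `pieces` argument and the counts in the
# comments are illustrative; only the cross-product logic mirrors _construct (product comes from
# itertools, as the code above already assumes).
def example_expand(pieces):
    # pieces[0]  -> constructed sources, e.g. two simulations
    # pieces[1:] -> constructed filter options, e.g. two shuffles and one take
    # two sources x (two shuffles x one take) -> four joined simulation sources
    return [Pipe.join(s, f) for s in pieces[0] for f in product(*pieces[1:])]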
def __init__(self,
    simulations: Sequence[Simulation],
    shuffle    : Sequence[Optional[int]] = [None],
    take       : Optional[int] = None) -> None:
    """Instantiate a Benchmark.

    Args:
        simulations: The collection of simulations to benchmark against.
        shuffle: A collection of seeds to use for simulation shuffling. A seed of `None` means no shuffle will be applied.
        take: The number of interactions to take from each simulation for evaluation.
    """
    ...

    sources: List[Simulation] = simulations
    filters: List[Sequence[Filter[Iterable[Interaction], Iterable[Interaction]]]] = []

    if shuffle != [None]:
        filters.append([Shuffle(seed) for seed in shuffle])

    if take is not None:
        filters.append([Take(take)])

    if len(filters) > 0:
        simulation_sources = [cast(Source[Simulation], Pipe.join(s, f)) for s, f in product(sources, product(*filters))]
    else:
        simulation_sources = list(sources)

    self._simulations: Sequence[Source[Simulation]] = simulation_sources

    self._processes           : Optional[int] = None
    self._maxtasksperchild    : Optional[int] = None
    self._maxtasksperchild_set: bool          = False
    self._chunk_by            : Optional[str] = None
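# A hedged construction sketch for the Benchmark __init__ above: with two shuffle seeds and a take
# limit, every simulation is wrapped as Pipe.join(simulation, (Shuffle(seed), Take(1000))), giving
# len(my_simulations) * 2 evaluation sources. `my_simulations` is a hypothetical, already constructed
# Sequence[Simulation].
def example_make_benchmark(my_simulations):
    # two seeds x one take option -> two pipelined sources per simulation
    return Benchmark(my_simulations, shuffle=[10, 20], take=1000)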
def filter(self, items: Iterable[Any]) -> Iterable[Any]:

    if len(self._filters) == 0:
        return items

    try:
        with Manager() as manager:

            stdout_queue = manager.Queue() #type: ignore
            stdlog_queue = manager.Queue() #type: ignore

            stdout_writer, stdout_reader = QueueSink(stdout_queue), QueueSource(stdout_queue)
            stdlog_writer, stdlog_reader = QueueSink(stdlog_queue), QueueSource(stdlog_queue)

            class MyPool(multiprocessing.pool.Pool):

                _missing_error_definition_error_is_new = True

                def _join_exited_workers(self):

                    for worker in self._pool:
                        if worker.exitcode == 1000 and MyPool._missing_error_definition_error_is_new:
                            #This is a hack... It only works so long as we process one job at a time,
                            #which is true in our case. It is necessary because multiprocessing can get
                            #stuck waiting for failed workers and that is frustrating for users.

                            MyPool._missing_error_definition_error_is_new = False

                            message = (
                                "Coba attempted to evaluate your benchmark in multiple processes but the pickle module was unable to "
                                "find all the definitions needed to pass the tasks to the processes. The two most common causes of "
                                "this error are: 1) a learner or simulation is defined in a Jupyter Notebook cell or 2) a necessary "
                                "class definition exists inside the `__name__=='__main__'` code block in the main execution script. In "
                                "either case there are two simple solutions: 1) evaluate your benchmark in a single process with no "
                                "limit on child tasks or 2) define all your classes in a separate python file that is imported when "
                                "evaluating.")

                            CobaConfig.Logger.log(message)

                        if worker.exitcode is not None and worker.exitcode != 0:
                            #A worker exited in an uncontrolled manner and was unable to clean its job
                            #up. We therefore mark one of the jobs as "finished" but failed to prevent an
                            #infinite wait on a failed job to finish that is actually no longer running.
                            list(self._cache.values())[0]._set(None, (False, None))

                    return super()._join_exited_workers()

            with MyPool(self._processes, maxtasksperchild=self._maxtasksperchild) as pool:

                # handle not picklable (this is handled by done_or_failed)
                # handle empty list (this is done by checking result.ready())
                # handle exceptions in process (unhandled exceptions can cause children to hang so we pass them to stderr)
                # handle ctrl-c without hanging
                #   > don't call result.get when KeyboardInterrupt has been hit
                #   > handle EOFError and BrokenPipeError with the queue since ctrl-c kills the manager
                # handle AttributeErrors. These occur when... (this is handled by shadowing several pool methods)
                #   > a class that is defined in a Jupyter Notebook cell is pickled
                #   > a class that is defined inside the __name__=='__main__' block is pickled
                # handle Benchmark.evaluate not being called inside of __name__=='__main__' (this is handled by a big try/catch)

                def done_or_failed(results_or_exception=None):
                    #This method is called one time at the completion of map_async.
                    #In the case that one of our jobs threw an exception the argument
                    #will contain an exception otherwise it will be the returned results
                    #of all the jobs. This method is executed on a thread in the Main context.

                    if isinstance(results_or_exception, Exception):
                        from coba.config import CobaConfig

                        if "Can't pickle" in str(results_or_exception) or "Pickling" in str(results_or_exception):

                            message = (
                                str(results_or_exception) + ". Coba attempted to process your Benchmark on multiple processes and "
                                "the named class was not able to be pickled. This problem can be fixed in one of two ways: 1) "
                                "evaluate the benchmark in question on a single process with no limit on the tasks per child or 2) "
                                "modify the named class to be picklable. The easiest way to make the given class picklable is to "
                                "add `def __reduce__(self): return (<the class in question>, (<tuple of constructor arguments>))` to "
                                "the class. For more information see https://docs.python.org/3/library/pickle.html#object.__reduce__.")

                            CobaConfig.Logger.log(message)
                        else:
                            CobaConfig.Logger.log_exception(results_or_exception)

                    stdout_writer.write([None])
                    stdlog_writer.write([None])

                log_thread = Thread(target=Pipe.join(stdlog_reader, [], CobaConfig.Logger.sink).run)
                log_thread.daemon = True
                log_thread.start()

                processor = MultiprocessFilter.Processor(self._filters, stdout_writer, stdlog_writer, self._processes)
                result    = pool.map_async(processor.process, items, callback=done_or_failed, error_callback=done_or_failed, chunksize=1)

                # When items is empty finished_callback will not be called and we'll get stuck waiting for the poison pill.
                # When items is empty ready() will be true immediately and this check will place the poison pill into the queues.
                if result.ready():
                    done_or_failed()

                try:
                    for item in stdout_reader.read():
                        yield item
                    pool.close()
                except (KeyboardInterrupt, Exception):
                    try:
                        pool.terminate()
                    except:
                        pass
                    raise
                finally:
                    pool.join()
                    log_thread.join()

    except RuntimeError as e:
        #This happens when importing main causes this code to run again
        raise CobaFatal(str(e))
def __init__(self, filters: Sequence[Filter], stdout: Sink, stdlog: Sink, n_proc: int) -> None:
    self._filter = Pipe.join(filters)
    self._stdout = stdout
    self._stdlog = stdlog
    self._n_proc = n_proc
def evaluate(self, learners: Sequence[Learner], result_file: str = None, seed: int = 1) -> Result:
    """Collect observations of a Learner playing the benchmark's simulations to calculate Results.

    Args:
        learners: The collection of learners that we'd like to evaluate.
        result_file: The file we'd like to use for writing/restoring results for the requested evaluation.
        seed: The random seed we'd like to use when choosing which action to take from the learner's predictions.

    Returns:
        See the base class for more information.
    """
    restored = Result.from_file(result_file) if result_file and Path(result_file).exists() else Result()

    n_given_learners    = len(learners)
    n_given_simulations = len(self._simulations)

    if len(restored.benchmark) != 0:
        assert n_given_learners    == restored.benchmark['n_learners'   ], "The currently evaluating benchmark doesn't match the given transaction log"
        assert n_given_simulations == restored.benchmark['n_simulations'], "The currently evaluating benchmark doesn't match the given transaction log"

    preamble = []
    preamble.append(Transaction.version())
    preamble.append(Transaction.benchmark(n_given_learners, n_given_simulations))
    preamble.extend(Transaction.learners(learners))
    preamble.extend(Transaction.simulations(self._simulations))

    cb = self._chunk_by         if self._chunk_by             else CobaConfig.Benchmark['chunk_by']
    mp = self._processes        if self._processes            else CobaConfig.Benchmark['processes']
    mt = self._maxtasksperchild if self._maxtasksperchild_set else CobaConfig.Benchmark['maxtasksperchild']

    tasks            = Tasks(self._simulations, learners, seed)
    unfinished       = Unfinished(restored)
    chunked          = ChunkByTask() if cb == 'task' else ChunkByNone() if cb == 'none' else ChunkBySource()
    process          = Transactions()
    transaction_sink = TransactionSink(result_file, restored)

    if mp > 1 or mt is not None:
        process = MultiprocessFilter([process], mp, mt) #type: ignore

    try:
        Pipe.join(MemorySource(preamble), [], transaction_sink).run()
        Pipe.join(tasks, [unfinished, chunked, process], transaction_sink).run()

    except KeyboardInterrupt:
        CobaConfig.Logger.log("Benchmark evaluation was manually aborted via Ctrl-C")

    except CobaFatal:
        raise

    except Exception as ex:
        CobaConfig.Logger.log_exception(ex)

    return transaction_sink.result
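# A hedged usage sketch for evaluate: the result file is optional and, when it already exists, the
# evaluation restarts from the transactions recorded in it. `my_learners` is a hypothetical
# Sequence[Learner]; concrete learner constructors (e.g. RandomLearner) vary by coba version and are
# assumptions here, and the result path below is purely illustrative.
def example_evaluate(benchmark, my_learners):
    # benchmark: a Benchmark built as in __init__ above
    return benchmark.evaluate(my_learners, result_file='./my_benchmark.log', seed=1)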
def test_exception(self):
    source = MemorySource(list(range(4)))
    sink   = MemorySink()

    with self.assertRaises(Exception):
        Pipe.join(source, [Pipe_Tests.ExceptionFilter()], sink).run()