def make_dataflow_train(env):
    rng = random.gen_rng()

    def _outputs2action(outputs):
        # Epsilon-greedy exploration: take the greedy Q action with probability
        # 1 - epsilon, otherwise sample a random action uniformly.
        epsilon = env.runtime['exp_epsilon']
        return outputs['q_argmax'] if rng.rand() > epsilon else rng.choice(
            get_player_nr_actions())

    collector = rl.train.SynchronizedExperienceCollector(
        env, make_player, _outputs2action,
        nr_workers=get_env('dqn.collector.nr_workers'),
        nr_predictors=get_env('dqn.collector.nr_workers'),
        predictor_output_names=get_env('dqn.collector.predictor_output_names'),
        mode=get_env('dqn.collector.mode'))

    # Wrap the collector in an n-step Q-learning dataflow with an experience
    # replay buffer; rewards are clipped to [-1, 1] before being stored.
    return rl.train.QLearningDataFlow(
        collector, target=get_env('dqn.collector.target'),
        maxsize=get_env('dqn.expreplay.maxsize'),
        batch_size=get_env('trainer.batch_size'),
        epoch_size=get_env('trainer.epoch_size'),
        gamma=get_env('dqn.gamma'),
        nr_td_steps=get_env('dqn.nr_td_steps'),
        reward_cb=lambda r: np.clip(r, -1, 1))
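# Illustrative sketch (not from the original code): the epsilon-greedy rule used
# by _outputs2action above, written against plain numpy. `epsilon_greedy` and
# `q_values` are hypothetical names; the real code reads Q-values from the
# predictor outputs and the action count from get_player_nr_actions().
import numpy as np


def epsilon_greedy(q_values, epsilon, rng):
    """Return argmax(Q) with probability 1 - epsilon, else a uniform random action."""
    if rng.rand() > epsilon:
        return int(np.argmax(q_values))
    return int(rng.choice(len(q_values)))


# With epsilon = 0.1, roughly 90% of calls return the greedy action (index 1 here).
_rng = np.random.RandomState(0)
print(epsilon_greedy(np.array([0.1, 0.5, 0.2]), epsilon=0.1, rng=_rng))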
def __init__(self, owner_env, scheduler, desc, nr_ensembles, devices, nr_epochs, epoch_size):
    self._owner_env = owner_env
    self._scheduler = scheduler
    self._schedule_logger = EveryNSecondLogger(logger, 2)
    self._desc = desc
    self._nr_ensembles = nr_ensembles
    self._devices = devices
    self._nr_epochs = nr_epochs
    self._epoch_size = epoch_size

    self._envs = []
    self._funcs = []
    self._funcs_lock = threading.Lock()

    self._dataflows = []
    self._data_pool = []
    self._data_pool_last = 0  # Number of data points used for training last time step.
    self._data_pool_lock = threading.Lock()
    self._data_pool_cond = threading.Condition(lock=self._data_pool_lock)

    self._training_sets = []  # List of list of data.
    self._validation_set = []  # List of data.

    self._waiting_for_data = threading.Event()
    self._rng = random.gen_rng()
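# Illustrative sketch (assumption, not part of the class above): the standard
# producer/consumer pattern that _data_pool_lock and _data_pool_cond typically
# guard. `pool`, `add_data`, and `wait_for_data` are hypothetical names.
import threading

pool = []
pool_lock = threading.Lock()
pool_cond = threading.Condition(lock=pool_lock)


def add_data(item):
    # Producer side: append under the lock, then wake up any waiting consumer.
    with pool_cond:
        pool.append(item)
        pool_cond.notify_all()


def wait_for_data(min_size):
    # Consumer side: block until the pool holds at least `min_size` items.
    with pool_cond:
        pool_cond.wait_for(lambda: len(pool) >= min_size)
        return list(pool)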
def proc():
    rng = tar.gen_rng()
    with fake_with_rng(rng):
        time.sleep(0.5)
        state = tar.get_rng().get_state()
        time.sleep(0.5)
        q.put(state)
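# Illustrative sketch (assumption): how a worker like proc() is usually driven,
# handing its RNG state back to the parent through a queue. A plain numpy
# RandomState stands in for tar.gen_rng() / fake_with_rng() here.
import queue
import threading

import numpy as np

result_q = queue.Queue()


def worker():
    rng = np.random.RandomState(1234)
    result_q.put(rng.get_state())  # the state tuple can be compared across threads


t = threading.Thread(target=worker)
t.start()
t.join()
state = result_q.get()  # the worker's RNG state, now available in the parent thread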
def __init__(self, action_meanings=None):
    self.__rng = random.gen_rng()
    self._action_meanings = action_meanings
def __init__(self, env):
    self._env = env
    self.__rng = random.gen_rng()
    self.__initialized = False