def slave():
    global env, test_env
    env = make_env(args=config_args, dream_env=config_args.dream_env)
    # doom env doesn't support mpi testing so don't bother loading
    if 'DoomTakeCover-v0' != config_args.env_name:
        test_env = make_env(args=config_args, dream_env=False, render_mode=False)
    packet = np.empty(SOLUTION_PACKET_SIZE, dtype=np.int32)
    while 1:
        comm.Recv(packet, source=0)
        assert (len(packet) == SOLUTION_PACKET_SIZE)
        solutions = decode_solution_packet(packet)
        results = []
        for solution in solutions:
            worker_id, jobidx, seed, train_mode, max_len, weights = solution
            assert (train_mode == 1 or train_mode == 0 or train_mode == -1), str(train_mode)
            worker_id = int(worker_id)
            possible_error = "work_id = " + str(worker_id) + " rank = " + str(rank)
            assert worker_id == rank, possible_error
            jobidx = int(jobidx)
            seed = int(seed)
            fitness, timesteps = worker(weights, seed, train_mode, max_len)
            results.append([worker_id, jobidx, fitness, timesteps])
        result_packet = encode_result_packet(results)
        assert len(result_packet) == RESULT_PACKET_SIZE
        comm.Send(result_packet, dest=0)
def slave():
    global env
    if env_name == 'CarRacing-v0':
        env = make_env(args=config_args, dream_env=False, with_obs=True)  # training in dreams not supported yet
    else:
        env = make_env(args=config_args, dream_env=True, render_mode=False)
    packet = np.empty(SOLUTION_PACKET_SIZE, dtype=np.int32)
    while 1:
        comm.Recv(packet, source=0)
        assert (len(packet) == SOLUTION_PACKET_SIZE)
        solutions = decode_solution_packet(packet)
        results = []
        for solution in solutions:
            worker_id, jobidx, seed, train_mode, max_len, weights = solution
            assert (train_mode == 1 or train_mode == 0), str(train_mode)
            worker_id = int(worker_id)
            possible_error = "work_id = " + str(worker_id) + " rank = " + str(rank)
            assert worker_id == rank, possible_error
            jobidx = int(jobidx)
            seed = int(seed)
            fitness, timesteps = worker(weights, seed, train_mode, max_len)
            results.append([worker_id, jobidx, fitness, timesteps])
        result_packet = encode_result_packet(results)
        assert len(result_packet) == RESULT_PACKET_SIZE
        comm.Send(result_packet, dest=0)
def main():
    args = get_argparser().parse_args()
    init_logging('logs')
    env = make_env(args.env, args.seed, num_envs=args.num_envs,
                   num_processes=args.num_processes)
    agent = ActorCritic(env.observation_space, env.action_space, args)
    train(agent, env, args, max_reward=args.max_reward)
    test_env = make_env(args.env, args.seed, num_envs=1, num_processes=1)
    make_fun(agent, test_env, render=True)
def initialize_settings(sigma_init=0.1, sigma_decay=0.9999):
    global population, filebase, game, controller, env, num_params, es, PRECISION, SOLUTION_PACKET_SIZE, RESULT_PACKET_SIZE
    population = num_worker * num_worker_trial
    filebase = 'log/' + gamename + '.' + optimizer + '.' + str(num_episode) + '.' + str(population)
    controller = make_model()
    env = make_env()
    num_params = controller.param_count
    print("size of model", num_params)

    if optimizer == 'ses':
        ses = PEPG(num_params,
                   sigma_init=sigma_init,
                   sigma_decay=sigma_decay,
                   sigma_alpha=0.2,
                   sigma_limit=0.02,
                   elite_ratio=0.1,
                   weight_decay=0.005,
                   popsize=population)
        es = ses
    elif optimizer == 'ga':
        ga = SimpleGA(num_params,
                      sigma_init=sigma_init,
                      sigma_decay=sigma_decay,
                      sigma_limit=0.02,
                      elite_ratio=0.1,
                      weight_decay=0.005,
                      popsize=population)
        es = ga
    elif optimizer == 'cma':
        cma = CMAES(num_params, sigma_init=sigma_init, popsize=population)
        es = cma
    elif optimizer == 'pepg':
        pepg = PEPG(num_params,
                    sigma_init=sigma_init,
                    sigma_decay=sigma_decay,
                    sigma_alpha=0.20,
                    sigma_limit=0.02,
                    learning_rate=0.01,
                    learning_rate_decay=1.0,
                    learning_rate_limit=0.01,
                    weight_decay=0.005,
                    popsize=population)
        es = pepg
    else:
        oes = OpenES(num_params,
                     sigma_init=sigma_init,
                     sigma_decay=sigma_decay,
                     sigma_limit=0.02,
                     learning_rate=0.01,
                     learning_rate_decay=1.0,
                     learning_rate_limit=0.01,
                     antithetic=antithetic,
                     weight_decay=0.005,
                     popsize=population)
        es = oes

    PRECISION = 10000
    SOLUTION_PACKET_SIZE = (5 + num_params) * num_worker_trial
    RESULT_PACKET_SIZE = 4 * num_worker_trial
def make_env(self, seed=-1, render_mode=False, load_model=True, lock=None):
    self.render_mode = render_mode
    self.env = make_env(self.env_name, seed=seed, render_mode=render_mode,
                        load_model=load_model, lock=lock)
def make_env(self, seed=-1, render_mode=False):
    self.render_mode = render_mode
    self.env = make_env(self.env_name, self.encoder, self.max_features,
                        seed=seed, render_mode=render_mode)
def __init__(self, **kwarg):
    """
    Args:
        kwarg: configurations for the environment.
    """
    config = get_default_config()
    name = kwarg['name']
    for key, value in kwarg.items():
        if hasattr(config, key):
            setattr(config, key, value)

    # create an environment
    self.env = make_env(name, config)

    # convert observation space
    obs_space = self.env.observation_space
    obs_size = sum([np.prod(v) for v in obs_space.values()])
    low = -1 * np.ones(obs_size)
    high = np.ones(obs_size)
    self.observation_space = gym.spaces.Box(low=low, high=high)

    # convert action space
    dof = self.env.dof
    low = -1 * np.ones(dof)
    high = np.ones(dof)
    self.action_space = gym.spaces.Box(low=low, high=high)
def run(args):
    # env = make_env(args.env_id)
    env = gym.make(args.env_id)
    env_test = make_env(args.env_id)
    buffer_exp = SerializedBuffer(
        path=args.buffer,
        device=torch.device("cuda" if args.cuda else "cpu"))

    algo = AIRL(buffer_exp=buffer_exp,
                state_shape=env.observation_space.shape,
                action_shape=env.action_space.shape,
                device=torch.device("cuda" if args.cuda else "cpu"),
                seed=args.seed,
                rollout_length=args.rollout_length)

    time = datetime.now().strftime("%Y%m%d-%H%M")
    log_dir = os.path.join('logs', args.env_id, args.algo, f'seed{args.seed}-{time}')

    trainer = Trainer(env=env,
                      env_test=env_test,
                      algo=algo,
                      log_dir=log_dir,
                      num_steps=args.num_steps,
                      eval_interval=args.eval_interval,
                      seed=args.seed)
    trainer.train()
def make_env(self, env_name, seed=-1, render_mode=False, model=None):
    self.render_mode = render_mode
    self.env_name = env_name
    self.env = make_env(env_name, seed=seed, render_mode=render_mode, model=model)
def __init__(self, env_id, lr, nstep, batch_size, n_epochs, gamma, gae_lam,
             clip_range, ent_coef, vf_coef, max_grad_norm):
    self.env_id = env_id
    self.env = make_env(env_id, n_envs=4)
    self.num_envs = self.env.num_envs if isinstance(self.env, VecEnv) else 1
    self.state_dim = self.env.observation_space.shape[0]
    self.action_converter = ActionConverter(self.env.action_space)
    self.lr = lr
    self.nstep = nstep
    self.batch_size = batch_size
    self.n_epochs = n_epochs
    self.gamma = gamma
    self.gae_lam = gae_lam
    self.clip_range = clip_range
    self.ent_coef = ent_coef
    self.vf_coef = vf_coef
    self.max_grad_norm = max_grad_norm
    self.ep_info_buffer = deque(maxlen=50)
    self._n_updates = 0
    self.num_timesteps = 0
    self.num_episodes = 0
    self.obs_rms = RunningMeanStd()
def make_env(self, seed=-1, render_mode=False, full_episode=False, worker_id=0):
    self.render_mode = render_mode
    self.env = make_env(self.env_name, seed=seed, render_mode=render_mode,
                        full_episode=full_episode, worker_id=worker_id)
def main(args):
    env_name = args.env_name
    total_episodes = args.total_episodes
    start_batch = args.start_batch
    time_steps = args.time_steps

    obs_data = []
    action_data = []

    env = make_env(env_name)
    s = 0
    batch = start_batch

    while s < total_episodes:
        for i_episode in range(200):
            print('-----')
            observation = env.reset()
            env.render()
            done = False
            action = env.action_space.sample()
            t = 0
            obs_sequence = []
            action_sequence = []

            while t < time_steps:
                t = t + 1
                action = config.generate_data_action(t, action)
                observation = config.adjust_obs(observation)
                obs_sequence.append(observation)
                action_sequence.append(action)
                observation, reward, done, info = env.step(action)

            obs_data.append(obs_sequence)
            action_data.append(action_sequence)

            print("Batch {} Episode {} finished after {} timesteps".format(batch, i_episode, t + 1))
            print("Current dataset contains {} observations".format(sum(map(len, obs_data))))

            s = s + 1

        print("Saving dataset for batch {}".format(batch))
        np.save('./data/obs_data_' + str(batch), obs_data)
        np.save('./data/action_data_' + str(batch), action_data)

        batch = batch + 1
        obs_data = []
        action_data = []
def test_render():
    env = make_env(10000, visualize=True)
    obs = env.reset()
    # while True:
    #     env.render()
    zero_action = np.array([[0.0, 0.0, 0.0], [0.0, 0.0, 0.0]])
    # start_time = time.time()
    new_obs, rew, done, _ = env.step(zero_action)

    action = np.array([[0.0, 0.0, 4.0], [0.0, 0.0, 0.0]])
    while True:
        new_obs, rew, done, _ = env.step(action)
        env.render()
        if done:
            break
def init_gym(env_name):
    """
    Initialize gym environment, return dimension of observation
    and action spaces.

    Args:
        env_name: str environment name (e.g. "Humanoid-v1")

    Returns: 3-tuple
        gym environment (object)
        number of observation dimensions (int)
        number of action dimensions (int)
    """
    # env = gym.make(env_name)
    env = make_env(env_name)
    obs_dim = env.observation_space.shape[0]
    act_dim = env.action_space.shape[0]

    return env, obs_dim, act_dim
def __init__(self, type="CarRacing", history_pick=4, seed=None, detect_edges=False, detect_grass=False, flip=False): self.name = type + str(time.time()) random.seed(30) self.env = make_env('CarRacing-v0', random.randint(1,10000000), render_mode = False, full_episode = True) self.image_dimension = [64,64] self.history_pick = history_pick self.state_space_size = history_pick * np.prod(self.image_dimension) self.action_space_size = 5 self.state_shape = [None, self.history_pick] + list(self.image_dimension) self.history = [] self.action_dict = {0: [-1, 0, 0], 1: [1, 0, 0], 2: [0, 1, 0], 3: [0, 0, 0.8], 4: [0, 0, 0]} self.seed = seed self.detect_edges = detect_edges self.detect_grass = detect_grass self.flip = flip self.flip_episode = False self.vae = ConvVAE(batch_size=1, gpu_mode=False, is_training=False, reuse=True) self.rnn = MDNRNN(hps_sample, gpu_mode=False, reuse=True) self.vae.load_json('vae/vae.json') self.rnn.load_json('rnn/rnn.json')
def test():
    env = make_env()
    obs = env.reset()

    # Test zero action
    zero_action = np.array([[0.0, 0.0, 0.0], [0.0, 0.0, 0.0]])
    print(f"Before zero action = {obs}")
    new_obs, rew, done, _ = env.step(zero_action)
    print(f"After zero action = {new_obs}, is equal = {obs == new_obs}")
    obs = new_obs

    # Test dynamics
    dt = 0.002
    action = np.array([[1.0, 1.0, 1.0], [0.0, 0.0, 0.0]])
    new_pos = deepcopy(obs)
    new_pos[0] = obs[0] + (1000 * action[0, 0] * np.cos(obs[2]) + 1000 * action[0, 1] * np.sin(obs[2])) * dt
    new_pos[1] = obs[1] + (1000 * action[0, 1] * np.cos(obs[2]) - 1000 * action[0, 0] * np.sin(obs[2])) * dt
    new_pos[2] = obs[2] + action[0, 2] * dt
    new_obs, rew, done, _ = env.step(action)
    print(f"Is done = {done}, Is dynamics correct = {new_pos == new_obs}")
def main(_):
    display = Display(visible=0, size=(1400, 900))
    # display.start()

    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True

    with tf.Session() as sess:
        global_step = tf.Variable(0, name='global_step', trainable=False)

        env = make_env(ENV_NAME, 876, render_mode=False, full_episode=True)
        np.random.seed(RANDOM_SEED)
        tf.set_random_seed(RANDOM_SEED)
        env.seed(RANDOM_SEED)

        state_dim = [32]
        action_dim = env.action_space.shape[0]
        action_bound = env.action_space.high
        print('state_dim: ', state_dim)
        print('action_dim: ', action_dim)
        print('action_bound: ', action_bound)
        # Ensure action bound is symmetric
        # assert (env.action_space.high == -env.action_space.low)

        actor = ActorNetwork(sess, state_dim, action_dim, action_bound, TAU)
        critic = CriticNetwork(sess, state_dim, action_dim, TAU,
                               actor.get_num_trainable_vars())

        if GYM_MONITOR_EN:
            if not RENDER_ENV:
                env.monitor.start(MONITOR_DIR, video_callable=False, force=True)
            else:
                env.monitor.start(MONITOR_DIR, force=True)

        train(sess, env, actor, critic, global_step)

        if GYM_MONITOR_EN:
            env.monitor.close()
def slave():
    env = make_env()
    packet = np.empty(SOLUTION_PACKET_SIZE, dtype=np.int32)
    while 1:
        comm.Recv(packet, source=0)
        assert (len(packet) == SOLUTION_PACKET_SIZE)
        solutions = decode_solution_packet(packet)
        results = []
        for solution in solutions:
            worker_id, jobidx, seed, train_mode, max_len, weights = solution
            assert (train_mode == 1 or train_mode == 0), str(train_mode)
            worker_id = int(worker_id)
            possible_error = "work_id = " + str(worker_id) + " rank = " + str(rank)
            assert worker_id == rank, possible_error
            jobidx = int(jobidx)
            seed = int(seed)
            fitness, timesteps = worker(weights, seed, train_mode, max_len)
            results.append([worker_id, jobidx, fitness, timesteps])
        result_packet = encode_result_packet(results)
        assert len(result_packet) == RESULT_PACKET_SIZE
        comm.Send(result_packet, dest=0)
def main(args):
    exp_path = mkdir_exp(f'{args.env_id}_PPO')
    export_args(args, os.path.join(exp_path, 'config.json'))

    np.random.seed(args.seed)
    pt.random.manual_seed(args.seed)

    print("== Creating a training environment...")
    env = make_env(args.env_id, NormalizeObservation, num_envs=args.num_envs)

    print("== Creating an evaluation environment...")
    eval_env = make_env(args.env_id, NormalizeObservation, num_envs=1)
    obs_dim = eval_env.observation_space.shape[0]
    act_dim = eval_env.action_space.shape[0]

    print("== Creating an agent....")
    device = pt.device('cuda' if pt.cuda.is_available() else 'cpu')
    agent = ContinuousPolicyAgent(obs_dim, act_dim, args.hid_dim).to(device)

    print("== Creating a data storage...")
    data = TensorBook(args.env_id, args.rollout_steps)

    print("== Creating a PPO optimizer...")
    optimizer = ProximalPolicyOptimization(
        agent,
        device,
        num_epochs=args.num_epochs,
        batch_size=args.batch_size,
        lr_max=args.lr_max,
        lr_min=args.lr_min,
        eps=args.eps,
        gamma=args.gamma,
        lam=args.lam,
        alpha=args.alpha,
        value_coef=args.value_coef,
        entropy_coef=args.entropy_coef,
        max_grad_norm=args.max_grad_norm,
        target_kldiv=args.target_kldiv
    )

    print("== Creating a TensorBoard summary writer...")
    writer = SummaryWriter(log_dir=exp_path)

    print("IT'S DANGEROUS TO GO ALONE! TAKE THIS.")
    obs = env.reset().to(device)
    best_perf = -np.inf
    num_updates = args.num_steps // args.rollout_steps // args.num_envs
    for i in tqdm(range(num_updates)):
        obs = agent.rollout(obs, env, data)
        info = optimizer.update(data)
        lr = optimizer.update_lr(i, num_updates)

        # Compute mean total reward during the rollout.
        reward = data.reward.sum(dim=0).mean(dim=0).item()

        # Evaluate the agent.
        perf = play(eval_env, agent, device, repeat=args.num_eval)
        if perf > best_perf:
            model_path = os.path.join(exp_path, f'{agent.__class__.__name__}.pt')
            pt.save(agent.state_dict(), model_path)
            best_perf = perf

        # Log training progress.
        step = i * args.rollout_steps * args.num_envs
        writer.add_scalar('Train/lr', lr, step)
        writer.add_scalar('Train/epochs', info['num_epochs'], step)
        writer.add_scalar('Train/loss/policy', info['policy_loss'], step)
        writer.add_scalar('Train/loss/value', info['value_loss'], step)
        writer.add_scalar('Train/loss/entropy', info['entropy'], step)
        writer.add_scalar('Train/loss/total', info['total_loss'], step)
        writer.add_scalar('Train/reward/mean', reward, step)
        writer.add_scalar('Eval/reward/mean', perf, step)
        writer.add_scalar('Eval/reward/best', best_perf, step)

    env.close()
    eval_env.close()
    writer.close()
def master():
    global test_env
    if env_name == 'CarRacing-v0':
        test_env = make_env(args=config_args, dream_env=False, with_obs=True)
    else:
        test_env = make_env(args=config_args, dream_env=False, render_mode=False)

    start_time = int(time.time())
    sprint("training", env_name)
    sprint("population", es.popsize)
    sprint("num_worker", num_worker)
    sprint("num_worker_trial", num_worker_trial)
    sys.stdout.flush()

    seeder = Seeder(seed_start)

    filename = filebase + '.json'
    filename_log = filebase + '.log.json'
    filename_hist = filebase + '.hist.json'
    filename_eval_hist = filebase + '.eval_hist.json'
    filename_hist_best = filebase + '.hist_best.json'
    filename_best = filebase + '.best.json'

    t = 0

    history = []
    history_best = []  # stores evaluation averages every 25 steps or so
    eval_log = []
    eval_hist = []
    best_reward_eval = 0
    best_model_params_eval = None

    max_len = -1  # max time steps (-1 means ignore)

    while True:
        solutions = es.ask()

        if antithetic:
            seeds = seeder.next_batch(int(es.popsize / 2))
            seeds = seeds + seeds
        else:
            seeds = seeder.next_batch(es.popsize)

        packet_list = encode_solution_packets(seeds, solutions, max_len=max_len)
        send_packets_to_slaves(packet_list)
        reward_list_total = receive_packets_from_slaves()

        reward_list = reward_list_total[:, 0]  # get rewards
        mean_time_step = int(np.mean(reward_list_total[:, 1]) * 100) / 100.  # get average time step
        max_time_step = int(np.max(reward_list_total[:, 1]) * 100) / 100.  # get max time step
        avg_reward = int(np.mean(reward_list) * 100) / 100.  # get average reward
        std_reward = int(np.std(reward_list) * 100) / 100.  # get reward std dev

        es.tell(reward_list)

        es_solution = es.result()
        model_params = es_solution[0]  # best historical solution
        reward = es_solution[1]  # best reward
        curr_reward = es_solution[2]  # best of the current batch
        controller.set_model_params(np.array(model_params).round(4))

        r_max = int(np.max(reward_list) * 100) / 100.
        r_min = int(np.min(reward_list) * 100) / 100.

        curr_time = int(time.time()) - start_time

        h = (t, curr_time, avg_reward, r_min, r_max, std_reward,
             int(es.rms_stdev() * 100000) / 100000., mean_time_step + 1.,
             int(max_time_step) + 1)

        if cap_time_mode:
            max_len = 2 * int(mean_time_step + 1.0)
        else:
            max_len = -1

        history.append(h)

        with open(filename, 'wt') as out:
            res = json.dump([np.array(es.current_param()).round(4).tolist()], out,
                            sort_keys=True, indent=2, separators=(',', ': '))

        with open(filename_hist, 'wt') as out:
            res = json.dump(history, out, sort_keys=False, indent=0, separators=(',', ':'))

        sprint(env_name, h)

        if (t == 1):
            best_reward_eval = avg_reward
        if (t % eval_steps == 0):  # evaluate on actual task at hand
            prev_best_reward_eval = best_reward_eval
            model_params_quantized = np.array(es.current_param()).round(4)
            reward_eval_list = evaluate_batch(model_params_quantized, max_len=-1, test_seed=t)
            reward_eval = np.mean(reward_eval_list)
            r_eval_std = np.std(reward_eval_list)
            r_eval_min = np.min(reward_eval_list)
            r_eval_max = np.max(reward_eval_list)
            model_params_quantized = model_params_quantized.tolist()
            improvement = reward_eval - best_reward_eval
            eval_log.append([t, reward_eval, model_params_quantized])
            e_h = (t, reward_eval, r_eval_std, r_eval_min, r_eval_max)
            eval_hist.append(e_h)

            with open(filename_eval_hist, 'wt') as out:
                res = json.dump(eval_hist, out, sort_keys=False, indent=0, separators=(',', ':'))

            with open(filename_log, 'wt') as out:
                res = json.dump(eval_log, out)

            if (len(eval_log) == 1 or reward_eval > best_reward_eval):
                best_reward_eval = reward_eval
                best_model_params_eval = model_params_quantized
            else:
                if retrain_mode:
                    sprint("reset to previous best params, where best_reward_eval =", best_reward_eval)
                    es.set_mu(best_model_params_eval)

            with open(filename_best, 'wt') as out:
                res = json.dump([best_model_params_eval, best_reward_eval], out,
                                sort_keys=True, indent=0, separators=(',', ': '))

            # dump history of best
            curr_time = int(time.time()) - start_time
            best_record = [t, curr_time, "improvement", improvement, "curr", reward_eval,
                           "prev", prev_best_reward_eval, "best", best_reward_eval]
            history_best.append(best_record)
            with open(filename_hist_best, 'wt') as out:
                res = json.dump(history_best, out, sort_keys=False, indent=0, separators=(',', ':'))

            sprint("Eval", t, curr_time, "improvement", improvement, "curr", reward_eval,
                   "prev", prev_best_reward_eval, "best", best_reward_eval)

        # increment generation
        t += 1
global_env = make_env(
    [
        ("[]", (TYPE_NIL,)),
        ("true", (TYPE_BOOL, True)),
        ("false", (TYPE_BOOL, False)),
        ("cons", (TYPE_BUILTIN_FUNCTION, tagged_cons)),
        ("head", (TYPE_BUILTIN_FUNCTION, tagged_head)),
        ("tail", (TYPE_BUILTIN_FUNCTION, tagged_tail)),
        ("+", num_op(lambda x, y: x + y)),
        ("*", num_op(lambda x, y: x * y)),
        ("-", num_op(lambda x, y: x - y)),
        ("/", num_op(lambda x, y: x // y)),
        ("or", bool_op(lambda x, y: x or y)),
        ("and", bool_op(lambda x, y: x and y)),
        ("xor", bool_op(lambda x, y: x ^ y)),
        ("not", (TYPE_BUILTIN_FUNCTION, neg)),
        ("<", comp_op(lambda x, y: x < y)),
        (">", comp_op(lambda x, y: x > y)),
        ("<=", comp_op(lambda x, y: x <= y)),
        (">=", comp_op(lambda x, y: x >= y)),
        ("=", (TYPE_BUILTIN_FUNCTION, equal)),
        ("!=", (TYPE_BUILTIN_FUNCTION, unequal)),
        ("int_of_string", (TYPE_BUILTIN_FUNCTION, int_of_string)),
        ("string_of_int", (TYPE_BUILTIN_FUNCTION, string_of_int)),
        ("int_of_char", (TYPE_BUILTIN_FUNCTION, int_of_char)),
        ("char_of_int", (TYPE_BUILTIN_FUNCTION, char_of_int)),
        ("error", (TYPE_BUILTIN_FUNCTION, error)),
        ("concat", (TYPE_BUILTIN_FUNCTION, concat)),
    ]
)
def make_env(self, seed=-1, render_mode=False):
    self.render_mode = render_mode
    self.env = make_env(self.env_name, seed=seed, render_mode=render_mode)
def main(args): """ Inputs type of agent, observation types and simulates the environment. """ print("The observation tutorial will show you the various observation configurations available.") background_name = background_names[1] # load demo file for playback demo = args.load_demo = input('Input path to demo file, such as demos/Sawyer_7.pkl: ') if demo == '': demo = args.load_demo = 'demos/Sawyer_7.pkl' agent_name, furniture_id = demo.split('/')[-1].split('.')[0].split('_') agent_name = agent_name[0].upper() + agent_name[1:] furniture_id = int(furniture_id) furniture_name = furniture_names[furniture_id] # choose robot observation print() print("Include robot observation?\n") try: s = input("Put 1 for True or 0 for False: ") k = int(s) == 1 except: print("Input is not valid. Use 0 by default.") k = False args.robot_ob = k # choose furniture observation print() print("Include furniture observation?\n") try: s = input("Put 1 for True or 0 for False: ") k = int(s) == 1 except: print("Input is not valid. Use 0 by default.") k = False args.object_ob = k # choose segmentation print() print("Use segmentation?\n") try: s = input("Put 1 for True or 0 for False: ") k = int(s) == 1 except: print("Input is not valid. Use 0 by default.") k = False use_seg = k # choose depth print() print("Use depth map?\n") try: s = input("Put 1 for True or 0 for False: ") k = int(s) == 1 except: print("Input is not valid. Use 0 by default.") k = False use_depth = k # set parameters for the environment (env, furniture_id, background) env_name = 'Furniture{}Env'.format(agent_name) args.env = env_name args.furniture_id = furniture_id args.background = background_name print() print("Creating environment (robot: {}, furniture: {}, background: {})".format( env_name, furniture_name, background_name)) # make environment with rgb, depth map, and segmentation args.depth_ob = True args.segmentation_ob = True # make environment following arguments env = make_env(env_name, args) ob = env.reset(args.furniture_id, args.background) # tell user about environment observation space print('-' * 80) print('Observation configuration:') print(f"Robot ob: {args.robot_ob}, Furniture ob: {args.object_ob}") print(f"Depth Map: {use_depth}, Segmentation Map: {use_seg}") print() print("Observation Space:\n") print("The observation space is a dictionary. For furniture (object) observations, it is "+ "a multiple of 7 because each part has 3 dims for position and 4 dims for quaternion. 
"+ "The robot_ob is dependent on the agent, and contains position, velocity, or angles of "+ "the current robot.\n") print(env.observation_space) print() input("Type anything to record an episode's visual observations") # run the trajectory, save the video rgb_frames = [] depth_frames = [] seg_frames = [] # load demo from pickle file with open(env._load_demo, 'rb') as f: demo = pickle.load(f) all_qpos = demo['qpos'] # playback first 100 frames for qpos in all_qpos: # set furniture part positions for i, body in enumerate(env._object_names): pos = qpos[body][:3] quat = qpos[body][3:] env._set_qpos(body, pos, quat) env._stop_object(body, gravity=0) # set robot positions if env._agent_type == 'Sawyer': env.sim.data.qpos[env._ref_joint_pos_indexes] = qpos['sawyer_qpos'] env.sim.data.qpos[env._ref_gripper_joint_pos_indexes] = qpos['l_gripper'] elif env._agent_type == 'Baxter': env.sim.data.qpos[env._ref_joint_pos_indexes] = qpos['baxter_qpos'] env.sim.data.qpos[env._ref_gripper_right_joint_pos_indexes] = qpos['r_gripper'] env.sim.data.qpos[env._ref_gripper_left_joint_pos_indexes] = qpos['l_gripper'] elif env._agent_type == 'Cursor': env._set_pos('cursor0', qpos['cursor0']) env._set_pos('cursor1', qpos['cursor1']) env.sim.forward() env._update_unity() img, depth = env.render('rgbd_array') seg = I.color_segmentation(env.render('segmentation')) rgb_frames.append(img) depth_frames.append(depth) seg_frames.append(seg) env.close() # concatenate available observation frames together and render video wide_frames = [] L = max(len(rgb_frames), len(rgb_frames), len(seg_frames)) for l in range(L): rgb = rgb_frames[l] f = [rgb * 255] if use_depth: depth = depth_frames[l] f.append(depth * 255) if use_seg: seg = seg_frames[l] f.append(seg) wide = np.concatenate(f, axis=1) wide_frames.append(wide) vr = VideoRecorder() vr._frames = wide_frames vr.save_video('observations.mp4')
def main(args): print("main") env_name = args.env_name total_episodes = args.total_episodes start_batch = args.start_batch time_steps = args.time_steps render = args.render batch_size = args.batch_size run_all_envs = args.run_all_envs store_folder = args.store_folder if not os.path.exists(store_folder): os.makedirs(store_folder) if run_all_envs: envs_to_generate = config.train_envs else: envs_to_generate = [env_name] print("envs:", envs_to_generate) for current_env_name in envs_to_generate: print("Generating data for env {}".format(current_env_name)) env = make_env(current_env_name) s = 0 batch = start_batch batch_size = min(batch_size, total_episodes) total_frames = 0 while s < total_episodes: obs_data = [] action_data = [] for i_episode in range(batch_size): print('-----') observation = env._reset() #observation = config.adjust_obs(observation) # plt.imshow(observation) # plt.show() env.render() done = False action = np.random.rand() *2.0 -1.0 t = 0 obs_sequence = [] action_sequence = [] repeat = np.random.randint(1, 11) while t < time_steps: # and not done: t = t + 1 if t % repeat == 0: action = np.random.rand() * 2.0 - 1.0 repeat = np.random.randint(1, 11) obs_sequence.append(observation) action_sequence.append(action) observation, reward, done, info = env._step(action) if render: env.render() if done: #If we were killed break total_frames += t print("dead at", t, "total recorded frames for this worker", total_frames) obs_data.append(obs_sequence) action_data.append(action_sequence) print("Batch {} Episode {} finished after {} timesteps".format(batch, i_episode, t + 1)) print("Current dataset contains {} observations".format(sum(map(len, obs_data)))) s = s + 1 print("Saving dataset for batch {}".format(batch)) np.save(store_folder+'/obs_data_' + current_env_name + '_' + str(batch), obs_data) print("Saving actions for batch {}".format(batch)) np.save(store_folder+'/action_data_' + current_env_name + '_' + str(batch), action_data) batch = batch + 1 env.close()
def make_env(self, env_name, seed=-1, render_mode=False):
    self.render_mode = render_mode
    self.env_name = env_name
    self.env = make_env(env_name, seed=seed, render_mode=render_mode)
def _make_env(self, render_mode=False):
    self.render_mode = render_mode
    self.env = make_env(self.env_name)
    self.num_actions = self.env.action_space.n
def ddpg(env_config, ac_type, ac_kwargs, rb_type, rb_kwargs, gamma, lr, polyak,
         batch_size, epochs, start_steps, steps_per_epoch, inc_ep, max_ep_len,
         test_max_ep_len, number_of_tests_per_epoch, act_noise, logger_kwargs, seed):
    logger = EpochLogger(**logger_kwargs)
    configs = locals().copy()
    configs.pop("logger")
    logger.save_config(configs)

    tf.set_random_seed(seed)
    np.random.seed(seed)

    env, test_env = make_env(env_config), make_env(env_config)
    obs_dim = env.observation_space.shape[0]
    act_dim = env.action_space.shape[0]

    # Action limit for clamping: critically, assumes all dimensions share the same bound!
    act_high = env.action_space.high

    # Inputs to computation graph
    x_ph, a_ph, x2_ph, r_ph, d_ph = core.placeholders(obs_dim, act_dim, obs_dim, None, None)

    actor_critic = core.get_ddpg_actor_critic(ac_type)

    # Main outputs from computation graph
    with tf.variable_scope('main'):
        pi, q, q_pi = actor_critic(x_ph, a_ph, **ac_kwargs)

    # Target networks
    with tf.variable_scope('target'):
        pi_targ, _, q_pi_targ = actor_critic(x2_ph, a_ph, **ac_kwargs)

    # Experience buffer
    RB = get_replay_buffer(rb_type)
    replay_buffer = RB(obs_dim, act_dim, **rb_kwargs)

    # Count variables
    var_counts = tuple(core.count_vars(scope) for scope in ['main/pi', 'main/q', 'main'])
    print('\nNumber of parameters: \t pi: %d, \t q: %d, \t total: %d\n' % var_counts)

    # Bellman backup for Q function
    backup = tf.stop_gradient(r_ph + gamma * (1 - d_ph) * q_pi_targ)

    # DDPG losses
    pi_loss = -tf.reduce_mean(q_pi)
    q_loss = tf.reduce_mean((q - backup)**2)

    # Separate train ops for pi, q
    pi_optimizer = tf.train.AdamOptimizer(learning_rate=lr)
    q_optimizer = tf.train.AdamOptimizer(learning_rate=lr)
    train_pi_op = pi_optimizer.minimize(pi_loss, var_list=get_vars('main/pi'))
    train_q_op = q_optimizer.minimize(q_loss, var_list=get_vars('main/q'))

    # Polyak averaging for target variables
    target_update = tf.group([
        tf.assign(v_targ, polyak * v_targ + (1 - polyak) * v_main)
        for v_main, v_targ in zip(get_vars('main'), get_vars('target'))
    ])

    # Initializing targets to match main variables
    target_init = tf.group([
        tf.assign(v_targ, v_main)
        for v_main, v_targ in zip(get_vars('main'), get_vars('target'))
    ])

    sess = tf.Session()
    sess.run(tf.global_variables_initializer())
    sess.run(target_init)

    def get_action(o, noise_scale):
        pi_a = sess.run(pi, feed_dict={x_ph: o.reshape(1, -1)})[0]
        pi_a += noise_scale * np.random.randn(act_dim)
        pi_a = np.clip(pi_a, 0, 1)
        real_a = pi_a * act_high
        return pi_a, real_a

    def test_agent(n=10):
        test_actions = []
        for j in range(n):
            test_actions_ep = []
            o, r, d, ep_ret, ep_len = test_env.reset(), 0, False, 0, 0
            while not (d or (ep_len == test_max_ep_len)):
                # Take deterministic actions at test time (noise_scale=0)
                _, real_a = get_action(o, 0)
                test_actions_ep.append(real_a)
                o, r, d, _ = test_env.step(real_a)
                ep_ret += r
                ep_len += 1
            logger.store(TestEpRet=ep_ret, TestEpLen=ep_len)
            test_actions.append(test_actions_ep)
        return test_actions

    start_time = time.time()
    o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0
    total_steps = steps_per_epoch * epochs

    actions = []
    epoch_actions = []
    rewards = []
    rets = []
    test_rets = []
    max_ret = None

    # Main loop: collect experience in env and update/log each epoch
    for t in range(total_steps):
        """
        Until start_steps have elapsed, randomly sample actions
        from a uniform distribution for better exploration. Afterwards,
        use the learned policy (with some noise, via act_noise).
        """
        if t > start_steps:
            pi_a, real_a = get_action(o, act_noise)
        else:
            pi_a, real_a = env.action_space.sample()

        # Step the env
        o2, r, d, _ = env.step(real_a)
        ep_ret += r
        ep_len += 1
        epoch_actions.append(pi_a)

        # Ignore the "done" signal if it comes from hitting the time
        # horizon (that is, when it's an artificial terminal signal
        # that isn't based on the agent's state)
        d = False if ep_len == max_ep_len else d

        # Store experience to replay buffer
        replay_buffer.store(o, pi_a, r, o2, d)

        # Super critical, easy to overlook step: make sure to update
        # most recent observation!
        o = o2

        if d or (ep_len == max_ep_len):
            """
            Perform all DDPG updates at the end of the trajectory,
            in accordance with tuning done by TD3 paper authors.
            """
            for _ in range(ep_len):
                batch = replay_buffer.sample_batch(batch_size)
                feed_dict = {
                    x_ph: batch['obs1'],
                    x2_ph: batch['obs2'],
                    a_ph: batch['acts'],
                    r_ph: batch['rews'],
                    d_ph: batch['done']
                }

                # Q-learning update
                outs = sess.run([q_loss, q, train_q_op], feed_dict)
                logger.store(LossQ=outs[0], QVals=outs[1])

                # Policy update
                outs = sess.run([pi_loss, train_pi_op, target_update], feed_dict)
                logger.store(LossPi=outs[0])

            logger.store(EpRet=ep_ret, EpLen=ep_len)
            actions.append(np.mean(epoch_actions))
            epoch_actions = []
            rewards.append(ep_ret)
            o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0

        # End of epoch wrap-up
        if (t + 1) % steps_per_epoch == 0:
            epoch = (t + 1) // steps_per_epoch

            # Test the performance of the deterministic version of the agent.
            test_actions = test_agent(number_of_tests_per_epoch)

            # Log info about epoch
            logger.log_tabular('Epoch', epoch)
            ret = logger.log_tabular('EpRet', average_only=True)
            test_ret = logger.log_tabular('TestEpRet', average_only=True)[0]
            logger.log_tabular('EpLen', average_only=True)
            logger.log_tabular('TestEpLen', average_only=True)
            logger.log_tabular('QVals', average_only=True)
            logger.log_tabular('LossPi', average_only=True)
            logger.log_tabular('LossQ', average_only=True)
            logger.log_tabular('Time', time.time() - start_time)
            logger.dump_tabular()

            rets.append(ret)
            test_rets.append(test_ret)

            if max_ret is None or test_ret > max_ret:
                max_ret = test_ret
                best_test_actions = test_actions

            max_ep_len += inc_ep
            util.plot_actions(test_actions, act_high, logger.output_dir + '/actions%s.png' % epoch)

    logger.save_state({
        "actions": actions,
        "rewards": rewards,
        "best_test_actions": best_test_actions,
        "rets": rets,
        "test_rets": test_rets,
        "max_ret": max_ret
    }, None)

    util.plot_actions(best_test_actions, act_high, logger.output_dir + '/best_test_actions.png')
    logger.log("max ret: %f" % max_ret)
exp_name = "control" seed = 1007 env_config = '1x1_mix_ms_una.json' iterations = 1 max_ep_len = 600 wp = 0 wi = 0.1 wd = 0.1 initial_bound = 1.1 final_bound = 1.01 bound_decay = 0.98 env = make_env(util.ENV_CONFIG_DIR + env_config) obs = [] actions = [] action_sign = np.array([-1, -1]) for i in range(iterations): current_bound = initial_bound o = env.reset() real_action = env.action_space.default() * 0.5 for t in range(max_ep_len): o, r, d, _ = env.step(real_action) obs.append(o) actions.append(real_action) vp = o vi = np.mean(obs[-5:])
        model.make_env(render_mode=render_mode)
        model.load_model(filename)
    else:
        model = make_model(load_model=False)
        print('model size', model.param_count)
        model.make_env(render_mode=render_mode)
        model.init_random_model_params(stdev=np.random.rand() * 0.01)

    N_episode = 100
    if render_mode:
        N_episode = 1
    reward_list = []
    for i in range(N_episode):
        reward, steps_taken = simulate(model,
                                       train_mode=False,
                                       render_mode=render_mode,
                                       num_episode=1)
        if render_mode:
            print("terminal reward", reward, "average steps taken", np.mean(steps_taken) + 1)
        else:
            print(reward[0])
        reward_list.append(reward[0])
    if not render_mode:
        print("seed", the_seed,
              "average_reward", np.mean(reward_list),
              "stdev", np.std(reward_list))


if __name__ == "__main__":
    import env
    e = env.make_env()
    c = Controller()
    import pdb; pdb.set_trace()
    r, t = simulate(c, e, render_mode=False)
    # main()
def learn(sess, n_tasks, z_size, data_dir, num_steps, max_seq_len,
          batch_size_per_task=16, rnn_size=256,
          grad_clip=1.0, v_lr=0.0001, vr_lr=0.0001,
          min_v_lr=0.00001, v_decay=0.999, kl_tolerance=0.5,
          lr=0.001, min_lr=0.00001, decay=0.999,
          view="transposed",
          model_dir="tf_rnn", layer_norm=False,
          rnn_mmd=False, no_cor=False,
          w_mmd=1.0,
          alpha=1.0, beta=0.1,
          recurrent_dp=1.0,
          input_dp=1.0,
          output_dp=1.0):
    batch_size = batch_size_per_task * n_tasks

    wrapper = WrapperFactory.get_wrapper(view)
    if wrapper is None:
        raise Exception("Such view is not available")

    print("Batch size for each task is", batch_size_per_task)
    print("The total batch size is", batch_size)

    check_dir(model_dir)
    lf = open(model_dir + '/log_%s' % datetime.now().isoformat(), "w")

    # define env
    na = make_env(config.env_name).action_space.n
    input_size = z_size + na
    output_size = z_size
    print("the environment", config.env_name, "has %i actions" % na)

    seq_len = max_seq_len

    fns = os.listdir(data_dir)
    fns = [fn for fn in fns if '.npz' in fn]
    random.shuffle(fns)
    dm = get_dm(wrapper, seq_len, na, data_dir, fns, not no_cor)

    tf_vrct_lr = tf.placeholder(tf.float32, shape=[])
    # learn from reconstruction.
    vaes, vcomps = build_vaes(n_tasks, na, z_size, seq_len, tf_vrct_lr, kl_tolerance)
    vae_losses = [vcomp.loss for vcomp in vcomps]
    transform_loss = get_transform_loss(vcomps[0], vaes[1], wrapper)

    old_vae0 = ConvVAE(name="old_vae0", z_size=z_size)
    old_vcomp0 = build_vae("old_vae0", old_vae0, na, z_size, seq_len, tf_vrct_lr, kl_tolerance)
    assign_old_eq_new = tf.group([tf.assign(oldv, newv) for (oldv, newv)
                                  in zip(old_vcomp0.var_list, vcomps[0].var_list)])

    vmmd_losses = get_vmmd_losses(n_tasks, old_vcomp0, vcomps, alpha, beta)
    vrec_ops = get_vae_rec_ops(n_tasks, vcomps, vmmd_losses, w_mmd)
    vrec_all_op = tf.group(vrec_ops)

    # Meta RNN.
    rnn = VRNN("rnn",
               max_seq_len,
               input_size,
               output_size,
               batch_size_per_task,
               rnn_size,
               layer_norm,
               recurrent_dp,
               input_dp,
               output_dp)

    global_step = tf.Variable(0, name='global_step', trainable=False)
    tf_rpred_lr = tf.placeholder(tf.float32, shape=[])
    rcomp0 = build_rnn("rnn", rnn, na, z_size, batch_size_per_task, seq_len)

    print("The basic rnn has been built")

    rcomps = build_rnns(n_tasks, rnn, vaes, vcomps, kl_tolerance)
    rnn_losses = [rcomp.loss for rcomp in rcomps]

    if rnn_mmd:
        rmmd_losses = get_rmmd_losses(n_tasks, old_vcomp0, vcomps, alpha, beta)
        for i in range(n_tasks):
            rnn_losses[i] += 0.1 * rmmd_losses[i]

    ptransform_loss = get_predicted_transform_loss(vcomps[0], rcomps[0], vaes[1],
                                                   wrapper, batch_size_per_task, seq_len)

    print("RNN has been connected to each VAE")

    rnn_total_loss = tf.reduce_mean(rnn_losses)
    rpred_opt = tf.train.AdamOptimizer(tf_rpred_lr, name="rpred_opt")
    gvs = rpred_opt.compute_gradients(rnn_total_loss, rcomp0.var_list)
    clip_gvs = [(tf.clip_by_value(grad, -grad_clip, grad_clip), var)
                for grad, var in gvs if grad is not None]
    rpred_op = rpred_opt.apply_gradients(clip_gvs, global_step=global_step, name='rpred_op')

    # VAE in prediction phase
    vpred_ops, tf_vpred_lrs = get_vae_pred_ops(n_tasks, vcomps, rnn_losses)
    vpred_all_op = tf.group(vpred_ops)

    rpred_lr = lr
    vrct_lr = v_lr
    vpred_lr = vr_lr
    sess.run(tf.global_variables_initializer())

    for i in range(num_steps):
        step = sess.run(global_step)
        rpred_lr = (rpred_lr - min_lr) * decay + min_lr
        vrct_lr = (vrct_lr - min_v_lr) * v_decay + min_v_lr
        vpred_lr = (vpred_lr - min_v_lr) * v_decay + min_v_lr

        ratio = 1.0
        data_buffer = []
        for it in range(config.psteps_per_it):
            raw_obs_list, raw_a_list = dm.random_batch(batch_size_per_task)
            data_buffer.append((raw_obs_list, raw_a_list))

            feed = {tf_rpred_lr: rpred_lr, tf_vrct_lr: vrct_lr,
                    tf_vpred_lrs[0]: vpred_lr, tf_vpred_lrs[1]: vpred_lr * ratio}
            feed[old_vcomp0.x] = raw_obs_list[0]
            for j in range(n_tasks):
                vcomp = vcomps[j]
                feed[vcomp.x] = raw_obs_list[j]
                feed[vcomp.a] = raw_a_list[j][:, :-1, :]

            (rnn_cost, rnn_cost2, vae_cost, vae_cost2,
             transform_cost, ptransform_cost, _, _) = sess.run(
                [rnn_losses[0], rnn_losses[1], vae_losses[0], vae_losses[1],
                 transform_loss, ptransform_loss, rpred_op, vpred_all_op], feed)
            ratio = rnn_cost2 / rnn_cost

            if i % config.log_interval == 0:
                output_log = get_output_log(step, rpred_lr, [vae_cost], [rnn_cost],
                                            [transform_cost], [ptransform_cost])
                lf.write(output_log)

        data_order = np.arange(len(data_buffer))
        nd = len(data_order)
        np.random.shuffle(data_order)

        for it in range(config.rsteps_per_it):
            if (it + 1) % nd == 0:
                np.random.shuffle(data_order)
            rid = data_order[it % nd]
            raw_obs_list, raw_a_list = data_buffer[rid]
            # raw_obs_list, raw_a_list = dm.random_batch(batch_size_per_task)

            feed = {tf_rpred_lr: rpred_lr, tf_vrct_lr: vrct_lr}
            feed[old_vcomp0.x] = raw_obs_list[0]
            for j in range(n_tasks):
                vcomp = vcomps[j]
                feed[vcomp.x] = raw_obs_list[j]
                feed[vcomp.a] = raw_a_list[j][:, :-1, :]

            (rnn_cost, rnn_cost2, vae_cost, vae_cost2,
             transform_cost, ptransform_cost, _) = sess.run(
                [rnn_losses[0], rnn_losses[1], vae_losses[0], vae_losses[1],
                 transform_loss, ptransform_loss, vrec_all_op], feed)

            if i % config.log_interval == 0:
                output_log = get_output_log(step, rpred_lr, [vae_cost], [rnn_cost],
                                            [transform_cost], [ptransform_cost])
                lf.write(output_log)
                lf.flush()

        if (i + 1) % config.target_update_interval == 0:
            sess.run(assign_old_eq_new)

        if i % config.model_save_interval == 0:
            tmp_dir = model_dir + '/it_%i' % i
            check_dir(tmp_dir)
            saveToFlat(rcomp0.var_list, tmp_dir + '/rnn.p')
            for j in range(n_tasks):
                vcomp = vcomps[j]
                saveToFlat(vcomp.var_list, tmp_dir + '/vae%i.p' % j)

    saveToFlat(rcomp0.var_list, model_dir + '/final_rnn.p')
    for i in range(n_tasks):
        vcomp = vcomps[i]
        saveToFlat(vcomp.var_list, model_dir + '/final_vae%i.p' % i)
def actor(): print(f"STARTING ACTOR with rank {rank}") sys.stdout.flush() # GAE hyper-parameters lam = 0.95 gamma = 0.99 # Build network architecture nav = Navigation(1, training=False) nav.call_build() # Get agent type agent_type = np.where(np.array(actors) == rank)[0][0] # Setup environment env = make_env() obs = env.reset() dones = False while True: weights = comm.recv(source=learners[agent_type]) nav.set_weights(weights) mb_rewards = np.zeros([nsteps, 1], dtype=np.float32) mb_values = np.zeros([nsteps, 1], dtype=np.float32) mb_neglogpacs = np.zeros([nsteps, 1], dtype=np.float32) mb_dones = np.zeros([nsteps, 1], dtype=np.float32) mb_obs = np.zeros([nsteps, 16], dtype=np.float32) mb_actions = { 'x1': np.zeros([nsteps, 1], dtype=np.int32), 'x2': np.zeros([nsteps, 1], dtype=np.int32), 'y1': np.zeros([nsteps, 1], dtype=np.int32), 'y2': np.zeros([nsteps, 1], dtype=np.int32), 'w1': np.zeros([nsteps, 1], dtype=np.int32), 'w2': np.zeros([nsteps, 1], dtype=np.int32) } mb_logits = { 'x1': np.zeros([nsteps, 21], dtype=np.float32), 'x2': np.zeros([nsteps, 21], dtype=np.float32), 'y1': np.zeros([nsteps, 21], dtype=np.float32), 'y2': np.zeros([nsteps, 21], dtype=np.float32), 'w1': np.zeros([nsteps, 21], dtype=np.float32), 'w2': np.zeros([nsteps, 21], dtype=np.float32) } for i in range(nsteps): # Get actions of training agent actions, neglogp, entropy, value, logits = nav( np.expand_dims(obs, axis=0)) mb_values[i] = value mb_neglogpacs[i] = neglogp mb_obs[i] = obs for k in actions.keys(): mb_actions[k][i] = actions[k] mb_logits[k][i] = logits[k] mb_dones[i] = dones # Take actions in env and look at the results actions = {k: (v[0] - 10) / 10 for k, v in actions.items()} agent_actions = np.array( [[actions['x1'], actions['y1'], actions['w1']], [actions['x2'], actions['y2'], actions['w2']]]) obs, rewards, dones, infos = env.step(agent_actions) # Handle rewards mb_rewards[i] = rewards if dones: obs = env.reset() # get last value for bootstrap _, _, _, last_values, _ = nav(np.expand_dims(obs, axis=0)) # discount/bootstrap off value fn mb_returns = np.zeros_like(mb_rewards) mb_advs = np.zeros_like(mb_rewards) lastgaelam = 0 # perform GAE calculation for t in reversed(range(nsteps)): if t == nsteps - 1: nextnonterminal = 1.0 - dones nextvalues = last_values else: nextnonterminal = 1.0 - mb_dones[t + 1] nextvalues = mb_values[t + 1] delta = mb_rewards[ t] + gamma * nextvalues * nextnonterminal - mb_values[t] mb_advs[ t] = lastgaelam = delta + gamma * lam * nextnonterminal * lastgaelam mb_returns = mb_advs + mb_values # Send trajectory to learner mb_values = np.squeeze(mb_values, axis=-1) mb_rewards = np.squeeze(mb_rewards, axis=-1) mb_neglogpacs = np.squeeze(mb_neglogpacs, axis=-1) mb_returns = np.squeeze(mb_returns, axis=-1) mb_dones = np.squeeze(mb_dones, axis=-1) trajectory = { 'mb_obs': mb_obs, 'mb_actions': mb_actions, 'mb_logits': mb_logits, 'mb_returns': mb_returns, 'mb_dones': mb_dones, 'mb_values': mb_values, 'mb_neglogpacs': mb_neglogpacs, 'mb_rewards': mb_rewards } comm.send(trajectory, dest=learners[agent_type])
def main(args):
    env_name = args.env_name
    total_episodes = args.total_episodes
    time_steps = args.time_steps
    render = args.render
    run_all_envs = args.run_all_envs
    action_refresh_rate = args.action_refresh_rate

    if run_all_envs:
        envs_to_generate = config.train_envs
    else:
        envs_to_generate = [env_name]

    for current_env_name in envs_to_generate:
        print("Generating data for env {}".format(current_env_name))

        env = make_env(current_env_name)  # <1>
        s = 0

        while s < total_episodes:
            episode_id = random.randint(0, 2**31 - 1)
            filename = DIR_NAME + str(episode_id) + ".npz"

            observation = env.reset()
            env.render()

            t = 0

            obs_sequence = []
            action_sequence = []
            reward_sequence = []
            done_sequence = []

            reward = -0.1
            done = False

            while t < time_steps:  # and not done:
                if t % action_refresh_rate == 0:
                    action = config.generate_data_action(t, env)  # <2>

                observation = config.adjust_obs(observation)  # <3>

                obs_sequence.append(observation)
                action_sequence.append(action)
                reward_sequence.append(reward)
                done_sequence.append(done)

                observation, reward, done, info = env.step(action)  # <4>

                t = t + 1

                if render:
                    env.render()

            print("Episode {} finished after {} timesteps".format(s, t))

            np.savez_compressed(filename,
                                obs=obs_sequence,
                                action=action_sequence,
                                reward=reward_sequence,
                                done=done_sequence)  # <4>

            s = s + 1

        env.close()
def main(args):
    env_name = args.env_name
    total_episodes = args.total_episodes
    start_batch = args.start_batch
    time_steps = args.time_steps
    render = args.render
    batch_size = args.batch_size
    run_all_envs = args.run_all_envs

    if run_all_envs:
        envs_to_generate = config.train_envs
    else:
        envs_to_generate = [env_name]

    for current_env_name in envs_to_generate:
        print("Generating data for env {}".format(current_env_name))

        env = make_env(current_env_name)
        s = 0
        batch = start_batch

        batch_size = min(batch_size, total_episodes)

        while s < total_episodes:
            obs_data = []
            action_data = []

            for i_episode in range(batch_size):
                print('-----')
                observation = env.reset()
                observation = config.adjust_obs(observation)
                # plt.imshow(observation)
                # plt.show()
                env.render()

                done = False
                action = env.action_space.sample()
                t = 0
                obs_sequence = []
                action_sequence = []

                while t < time_steps:  # and not done:
                    t = t + 1
                    action = config.generate_data_action(t, action)
                    obs_sequence.append(observation)
                    action_sequence.append(action)

                    observation, reward, done, info = env.step(action)
                    observation = config.adjust_obs(observation)

                    if render:
                        env.render()

                obs_data.append(obs_sequence)
                action_data.append(action_sequence)

                print("Batch {} Episode {} finished after {} timesteps".format(batch, i_episode, t + 1))
                print("Current dataset contains {} observations".format(sum(map(len, obs_data))))

                s = s + 1

            print("Saving dataset for batch {}".format(batch))
            np.save('./data/obs_data_' + current_env_name + '_' + str(batch), obs_data)
            np.save('./data/action_data_' + current_env_name + '_' + str(batch), action_data)

            batch = batch + 1

        env.close()