def restore_cpc(cpc, epoch, test_data, batch_size, n, k=1, folder=''):
    test_loss = 0
    test_loader = prep_data(test_data, batch_size, k, n)
    batch_num = test_loader[0].shape[0]
    batch_num = 1
    idx = 0
    obs, actions, obs_pos = test_loader[0][idx], test_loader[1][idx], test_loader[2][idx]
    obs_neg = get_neg_samples(obs, idx * batch_size, (idx + 1) * batch_size,
                              test_loader[0], n, cpc.type)
    obs, actions, obs_pos = np.concatenate(obs), np.concatenate(actions), np.concatenate(obs_pos)
    obs, actions, obs_pos, obs_neg = (obs[:real_batch_size], actions[:real_batch_size],
                                      obs_pos[:real_batch_size], obs_neg[:real_batch_size])
    obs_neg = obs_neg.reshape((-1, *obs_neg.shape[-2:]))
    test_loss = cpc.test(obs, obs_pos, actions, obs_neg)
    logger.logkv("cpc restored loss", test_loss)
    with open(folder + 'data.pickle', 'wb') as pickle_file:
        pickle.dump([obs, obs_pos, actions, obs_neg], pickle_file)
def test_decoder(decoder, encoder, epoch, test_data, batch_size, include_action, n, k=1):
    global real_batch_size
    test_loader = prep_data(test_data, batch_size, k, n, decode=True)
    total_loss = 0
    batch_num = test_loader[0].shape[0]
    for idx in range(batch_num):
        obs = test_loader[0][idx]
        obs = np.concatenate(obs)  # real_batch_size x fixed_num_of_contact x contact_dim
        obs = obs[:real_batch_size]
        obs = torch.from_numpy(obs)
        obs = obs.cuda()  # b x 9 * contact_dim
        recon = decoder(encoder(obs))
        object_info = test_loader[3][idx]
        object_info = torch.from_numpy(np.concatenate(object_info)[:real_batch_size]).cuda()
        loss = ((object_info - recon) ** 2).mean()
        total_loss += loss.item()
    logger.logkv("decoder testing loss", total_loss)
    return total_loss / batch_num
def test(self, input_data, position, log_info='', print_msg=False):
    feed_dict = {self.input: input_data, self.positions: position}
    position_loss = self.sess.run([self.position_loss], feed_dict=feed_dict)[0]
    logger.logkv(log_info + 'test_position_loss', position_loss)
    if print_msg:
        print(log_info + 'test_position_loss', position_loss)
def restore_predict(self, inputs, labels):
    prediction, test_loss = self.sess.run(
        [self.output_ph, self.loss_ph],
        feed_dict={self.input_ph: inputs, self.label_ph: labels})
    logger.logkv('test_loss', test_loss)
    return prediction, test_loss
def test_cpc(cpc, epoch, test_data, batch_size, n, k=1):
    start = time.time()
    test_loss = 0
    test_loader = prep_data(test_data, batch_size, k, n)
    batch_num = test_loader[0].shape[0]
    batch_num = 20  # cap evaluation at 20 batches
    for idx in range(batch_num):
        obs, actions, obs_pos = test_loader[0][idx], test_loader[1][idx], test_loader[2][idx]
        obs_neg = get_neg_samples(obs, idx * batch_size, (idx + 1) * batch_size,
                                  test_loader[0], n, cpc.type)  # n x 9 * contact_dim
        obs, actions, obs_pos = np.concatenate(obs), np.concatenate(actions), np.concatenate(obs_pos)  # b x 9 * contact_dim
        obs_neg = obs_neg[:real_batch_size]  # n x fixed_num_of_contact * contact_dim
        obs, actions, obs_pos = obs[:real_batch_size], actions[:real_batch_size], obs_pos[:real_batch_size]
        obs_neg = obs_neg.reshape((-1, *obs_neg.shape[-2:]))
        loss = cpc.test_encoder(obs, obs_pos, actions, obs_neg)
        test_loss += loss
    test_loss /= batch_num
    logger.logkv("cpc testing loss", test_loss)
    logger.logkv("cpc testing time", time.time() - start)
def train_cpc(cpc, epoch, train_data, batch_size, n, k=1):
    """Predict the next k steps."""
    start = time.time()
    train_losses = []
    train_loader = prep_data(train_data, batch_size, k, n)
    batch_num = train_loader[0].shape[0]
    batch_num = 100  # cap training at 100 batches per epoch
    for idx in range(batch_num):
        obs, actions, obs_pos = train_loader[0][idx], train_loader[1][idx], train_loader[2][idx]
        obs_neg = get_neg_samples(obs, idx * batch_size, (idx + 1) * batch_size,
                                  train_loader[0], n, cpc.type)
        obs, actions, obs_pos = np.concatenate(obs), np.concatenate(actions), np.concatenate(obs_pos)  # b x fixed_num_of_contact * contact_dim
        obs_neg = obs_neg[:real_batch_size]  # b x n x fixed_num_of_contact * contact_dim
        obs, actions, obs_pos = obs[:real_batch_size], actions[:real_batch_size], obs_pos[:real_batch_size]
        obs_neg = obs_neg.reshape((-1, *obs_neg.shape[-2:]))
        loss = cpc.train_encoder(obs, obs_pos, actions, obs_neg)
        train_losses.append(loss)
    losses = np.mean(train_losses[-50:])
    logger.logkv("cpc training loss", losses)
    logger.logkv("cpc training time", time.time() - start)
def log_diagnostics(self, paths, prefix=''):
    """Log extra information per iteration based on the collected paths."""
    log_stds = np.vstack([path["agent_infos"]["log_std"] for path in paths])
    logger.logkv(prefix + 'AveragePolicyStd', np.mean(np.exp(log_stds)))
def train(self, input_data, position):
    feed_dict = {self.input: input_data, self.positions: position}
    for _ in range(20):
        position_loss, _ = self.sess.run([self.position_loss, self.pos_op], feed_dict=feed_dict)
    # predictions = self.sess.run([self.predicted_pos], feed_dict=feed_dict)[0]
    logger.logkv('train_position_loss', position_loss)
def test(self, input_data, position, rot_matrix):
    feed_dict = {
        self.input: input_data,
        self.rotations: rot_matrix,  # batch * 3 * 3
        self.positions: position
    }
    position_loss, rotation_loss = self.sess.run(
        [self.position_cls_loss, self.rotation_cls_loss], feed_dict=feed_dict)
    logger.logkv('test_position_loss', position_loss)
    logger.logkv('test_rotation_loss', rotation_loss)
def test_cpc(encoder, trans, epoch, test_data, batch_size, n, k=1, include_action=True):
    global real_batch_size
    start = time.time()
    encoder.eval()
    trans.eval()
    test_loss = 0
    test_loader = prep_data(test_data, batch_size, k, n)
    batch_num = test_loader[0].shape[0]
    batch_num = 20  # cap evaluation at 20 batches
    for idx in range(batch_num):
        obs, obs_pos = test_loader[0][idx], test_loader[2][idx]
        obs, obs_pos = np.concatenate(obs), np.concatenate(obs_pos)  # real_batch_size x fixed_num_of_contact x contact_dim
        obs, obs_pos = obs[:real_batch_size], obs_pos[:real_batch_size]
        obs_neg = get_neg_samples(idx * batch_size, (idx + 1) * batch_size,
                                  test_loader[0], n=n, b=real_batch_size)  # b x n x fixed_num_of_contact x contact_dim
        obs, obs_pos, obs_neg = torch.from_numpy(obs), torch.from_numpy(obs_pos), torch.from_numpy(obs_neg)
        obs, obs_pos = obs.cuda(), obs_pos.cuda()  # b x 9 * contact_dim
        obs_neg = obs_neg.cuda()  # (b x n) x 9 * contact_dim
        if include_action:
            actions = test_loader[1][idx]
            actions = np.concatenate(actions)[:real_batch_size]
            actions = torch.from_numpy(actions)
            actions = actions.cuda()
            loss = compute_cpc_loss(obs, obs_pos, obs_neg, encoder, trans, actions=actions)
        else:
            loss = compute_cpc_loss(obs, obs_pos, obs_neg, encoder, trans, actions=None)
        test_loss += loss.item()
    avg_loss = test_loss / batch_num
    logger.logkv("cpc testing loss", avg_loss)
    logger.logkv("cpc testing time", time.time() - start)
def train(self, input_data, position, rot_matrix):
    feed_dict = {
        self.input: input_data,
        self.rotations: rot_matrix,  # batch * 3 * 3
        self.positions: position
    }
    position_loss, _, rotation_loss, _ = self.sess.run(
        [self.position_cls_loss, self.pos_op, self.rotation_cls_loss, self.rot_op],
        feed_dict=feed_dict)
    logger.logkv('train_position_loss', position_loss)
    logger.logkv('train_rotation_loss', rotation_loss)
def optimize_policy(self, buffer, timestep, grad_steps, log=True):
    sess = tf.get_default_session()
    for i in range(grad_steps):
        value_dict = buffer.random_batch(self.batch_size)
        feed_dict = create_feed_dict(placeholder_dict=self.op_phs_dict, value_dict=value_dict)
        sess.run(self.training_ops, feed_dict)
        if log:
            diagnostics = sess.run({**self.diagnostics_ops}, feed_dict)
            for k, v in diagnostics.items():
                logger.logkv(k, v)
        if timestep % self.target_update_interval == 0:
            self._update_target()
def restore_predict(self, inputs, position, rot_matrix):
    pos, rot, position_loss, rotation_loss = self.sess.run(
        [self.predicted_pos_ph, self.predicted_rot_ph,
         self.position_loss_ph, self.rotation_loss_ph],
        feed_dict={
            self.input_ph: inputs,
            self.rotations_ph: rot_matrix,  # batch * 3 * 3
            self.positions_ph: position
        })
    logger.logkv('restore_position_loss', position_loss)
    logger.logkv('restore_rotation_loss', rotation_loss)
def main(**kwargs):
    exp_dir = os.getcwd() + '/data/' + EXP_NAME + '/' + str(kwargs['seed'])
    logger.configure(dir=exp_dir, format_strs=['stdout', 'log', 'csv'], snapshot_mode='last')
    json.dump(kwargs, open(exp_dir + '/params.json', 'w'),
              indent=2, sort_keys=True, cls=ClassEncoder)
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    config.gpu_options.per_process_gpu_memory_fraction = kwargs.get('gpu_frac', 0.95)
    sess = tf.Session(config=config)
    with sess.as_default() as sess:
        folder = './data/policy/' + kwargs['env']
        paths = pickle.load(open(folder + '/paths.pickle', 'rb'))
        niters = paths.get_current_episode_size() // 100
        train_data, test_data = split_data(paths, niters)
        dimo = train_data[0]['o'].shape[-1]
        dims = [dimo]
        env = gym.make(kwargs['env'],
                       obs_type=kwargs['obs_type'],
                       fixed_num_of_contact=kwargs['fixed_num_of_contact'])
        feature_net = FeatureNet(dims,
                                 fixed_num_of_contact=kwargs['fixed_num_of_contact'],
                                 contact_dim=env.contact_dim,
                                 sess=sess,
                                 output=kwargs['prediction'],
                                 process_type=kwargs['process_type'],
                                 feature_dim=kwargs['feature_dim'],
                                 feature_layer=kwargs['feature_layer'])
        sess.run(tf.global_variables_initializer())
        for i in range(niters):
            start = timer.time()
            feature_net.train(train_data[i])
            feature_net.test(test_data[i])
            logger.logkv("iter", i)
            logger.logkv("iter_time", timer.time() - start)
            logger.dumpkvs()
            if i == 0:
                sess.graph.finalize()
def test_decoder(decoder, epoch, test_data, batch_size, n, k=1):
    start = time.time()
    decoder_test_loss = 0
    test_loader = prep_data(test_data, batch_size, k, n, decode=True)
    batch_num = test_loader[0].shape[0]
    batch_num = 50  # cap evaluation at 50 batches
    for idx in range(batch_num):
        obs = test_loader[0][idx]
        object_info = test_loader[3][idx]
        obs = np.concatenate(obs)[:real_batch_size]  # b x 9 * contact_dim
        object_info = np.concatenate(object_info)[:real_batch_size]
        loss = decoder.test(obs, object_info)
        decoder_test_loss += loss
    decoder_test_loss /= batch_num
    logger.logkv("decoder testing loss", decoder_test_loss)
    logger.logkv("decoder testing time", time.time() - start)
def train_decoder(decoder, epoch, train_data, batch_size, n, k=1):
    """Predict the next k steps."""
    start = time.time()
    train_decoder_losses = []
    train_loader = prep_data(train_data, batch_size, k, n, decode=True)
    batch_num = train_loader[0].shape[0]
    batch_num = 300  # cap training at 300 batches per epoch
    for idx in range(batch_num):
        obs = train_loader[0][idx]
        object_info = train_loader[3][idx]
        obs = np.concatenate(obs)[:real_batch_size]
        object_info = np.concatenate(object_info)[:real_batch_size]
        decoder_loss = decoder.train(obs, object_info)
        train_decoder_losses.append(decoder_loss)
    avg_loss = np.mean(train_decoder_losses[-50:])
    logger.logkv("decoder training loss", avg_loss)
    logger.logkv("decoder training time", time.time() - start)
def test(self, data):
    feed_dict = {self.o_tf: data['o'].reshape((-1, self.dimo))}
    accuracy = self.sess.run([self.pred_loss], feed_dict=feed_dict)
    logger.logkv('test_pred_loss', accuracy[0])
def train(self, data):
    feed_dict = {self.o_tf: data['o'].reshape((-1, self.dimo))}
    loss, _ = self.sess.run([self.total_loss, self.op], feed_dict=feed_dict)
    logger.logkv('train_classify_loss', loss)
def obtain_samples(self, log=False, log_prefix='', random=False, deterministic=False,
                   eval=False, multiple_trajectory=1, dynamics_model=None):
    """
    Collect batch_size trajectories from each task

    Args:
        log (boolean): whether to log sampling times
        log_prefix (str): prefix for logger
        random (boolean): whether the actions are random

    Returns:
        (dict): A dict of paths of size [meta_batch_size] x (batch_size) x [5] x (max_path_length)
    """
    # initial setup / preparation
    multiple_trajectories = []
    for _ in range(multiple_trajectory):
        paths = []
        n_samples = 0
        running_paths = _get_empty_running_paths_dict()
        if log:
            pbar = ProgBar(self.total_samples)
        policy_time, env_time = 0, 0
        policy = self.policy
        policy.reset(dones=[True])

        # initial reset of meta_envs
        obs = np.asarray(self.env.reset())
        ts = 0

        while n_samples < self.total_samples:
            # execute policy
            t = time.time()
            if eval:
                H = self.mpc.horizon
                mean_list = []
                std_list = []
                observation = obs
                # roll the learned dynamics model forward over the MPC horizon
                for _step in range(H + 1):
                    action, agent_info = policy.get_action(observation)
                    action = agent_info['mean']
                    mean_list.append(action)
                    std_list.append(agent_info['log_std'])
                    if self.policy.squashed:
                        action = np.tanh(action)
                    if observation.ndim == 1:
                        observation = observation[None]
                    if action.ndim == 1:
                        action = action[None]
                    observation = dynamics_model.predict(observation, action)
                    observation = observation.reshape((-1))
                action, _ = self.mpc.get_actions(obs[None], mean_list, std_list)
                if action.ndim == 2:
                    action = action[0]
            else:
                obs = obs.reshape((-1))
                if random:
                    action = self.env.action_space.sample()
                    agent_info = {}
                elif deterministic:
                    action, agent_info = policy.get_action(obs)
                    action = agent_info['mean']
                    if self.policy.squashed:
                        action = np.tanh(action)
                else:
                    action, agent_info = policy.get_action(obs)
                if action.ndim == 2:
                    action = action[0]
            policy_time += time.time() - t

            # step environments
            t = time.time()
            next_obs, reward, done, env_info = self.env.step(action)
            ts += 1
            env_time += time.time() - t

            new_samples = 0
            # append new samples to running paths
            if isinstance(reward, np.ndarray):
                reward = reward[0]
            running_paths["observations"].append(obs)
            running_paths["actions"].append(action)
            running_paths["rewards"].append(reward)
            running_paths["dones"].append(done)
            running_paths["env_infos"].append(env_info)
            running_paths["agent_infos"].append(agent_info)

            # if running path is done, add it to paths and empty the running path
            if done or ts >= self.max_path_length:
                paths.append(dict(
                    observations=np.asarray(running_paths["observations"]),
                    actions=np.asarray(running_paths["actions"]),
                    rewards=np.asarray(running_paths["rewards"]),
                    dones=np.asarray(running_paths["dones"]),
                    env_infos=[],
                    agent_infos=[],
                    # env_infos=utils.stack_tensor_dict_list(running_paths["env_infos"]),
                    # agent_infos=utils.stack_tensor_dict_list(running_paths["agent_infos"]),
                ))
                new_samples += len(running_paths["rewards"])
                running_paths = _get_empty_running_paths_dict()

            if done or ts >= self.max_path_length:
                next_obs = self.env.reset()
                ts = 0
            if log:
                pbar.update(new_samples)
            n_samples += new_samples
            obs = next_obs
        multiple_trajectories.append(paths)

        if log:
            pbar.stop()
        self.total_timesteps_sampled += self.total_samples
        if log:
            logger.logkv(log_prefix + "PolicyExecTime", policy_time)
            logger.logkv(log_prefix + "EnvExecTime", env_time)
    return multiple_trajectories
def _log_path_stats(self, multiple_trajectories, log=False, log_prefix='',
                    return_avg_return=False, trajectory_num=1):
    # compute log stats
    trajectory_num = len(multiple_trajectories)
    if trajectory_num == 1:
        paths = multiple_trajectories[0]
        average_discounted_return = np.mean([path["returns"][0] for path in paths])
        undiscounted_returns = [sum(path["rewards"]) for path in paths]
        if log == 'reward':
            logger.logkv(log_prefix + 'AverageReturn', np.mean(undiscounted_returns))
        elif log == 'all' or log is True:
            logger.logkv(log_prefix + 'AverageDiscountedReturn', average_discounted_return)
            logger.logkv(log_prefix + 'AverageReturn', np.mean(undiscounted_returns))
            logger.logkv(log_prefix + 'NumTrajs', len(paths))
            logger.logkv(log_prefix + 'StdReturn', np.std(undiscounted_returns))
            logger.logkv(log_prefix + 'MaxReturn', np.max(undiscounted_returns))
            logger.logkv(log_prefix + 'MinReturn', np.min(undiscounted_returns))
        return np.mean(undiscounted_returns)
    else:
        lst = [np.mean([path["returns"][0] for path in paths])
               for paths in multiple_trajectories]
        average_discounted_return = sum(lst) / len(lst)
        lst = [[sum(path["rewards"]) for path in paths]
               for paths in multiple_trajectories]
        maxreturn = [np.max(r) for r in lst]
        minreturn = [np.min(r) for r in lst]
        stdreturn = [np.std(r) for r in lst]
        meanreturn = [np.mean(r) for r in lst]
        if log == 'reward':
            # mean of per-trajectory-set undiscounted returns
            logger.logkv(log_prefix + 'AverageReturn', np.mean(meanreturn))
        elif log == 'all' or log is True:
            logger.logkv(log_prefix + 'AverageDiscountedReturn', average_discounted_return)
            logger.logkv(log_prefix + 'AverageReturn', np.mean(meanreturn))
            logger.logkv(log_prefix + 'NumTrajs',
                         np.mean([len(paths) for paths in multiple_trajectories]))
            logger.logkv(log_prefix + 'StdReturn', np.mean(stdreturn))
            logger.logkv(log_prefix + 'MaxReturn', np.mean(maxreturn))
            logger.logkv(log_prefix + 'MinReturn', np.mean(minreturn))
        return np.mean(meanreturn)
def main(**kwargs):
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    config.gpu_options.per_process_gpu_memory_fraction = kwargs.get('gpu_frac', 0.95)
    sess = tf.Session(config=config)
    exp_dir = os.getcwd() + '/data/feature_net/' + kwargs['input_label'][0] + kwargs['output_label'][0] + '/'
    mode = kwargs['mode'][0]
    if mode == 'restore':
        rotation_saver = tf.train.import_meta_graph(exp_dir + '-999.meta')
        rotation_saver.restore(sess, tf.train.latest_checkpoint(exp_dir))
        graph = tf.get_default_graph()
    with sess.as_default() as sess:
        input_label = kwargs['input_label'][0]
        output_label = kwargs['output_label'][0]
        buffer = {}
        name = '1'
        paths, fixed_num_of_contact = pickle.load(
            open('../saved/trained/SoftHandManipulateEgg-v080-' + name + '-dict.pickle', 'rb'))
        for key in paths:
            buffer[key] = paths[key]
        for name in [str(i) for i in range(2, 17)]:
            paths, fixed_num_of_contact = pickle.load(
                open('../saved/trained/SoftHandManipulateEgg-v080-' + name + '-dict.pickle', 'rb'))
            for key in paths:
                buffer[key] = np.concatenate([buffer[key], paths[key]], axis=0)
        env = gym.make(kwargs['env'][0],
                       obs_type=kwargs['obs_type'][0],
                       fixed_num_of_contact=fixed_num_of_contact)
        batch_size = 100
        paths = data_filter(buffer, fixed_num_of_contact, batch_size)
        niters = paths['positions'].shape[0] // batch_size
        print("total iteration: ", niters)
        print("total number of data: ", paths['positions'].shape[0])
        train_data, test_data, _, _ = split_data(paths, niters)
        train_data['object_position'] = train_data['object_position'][:, :, :3]
        test_data['object_position'] = test_data['object_position'][:, :, :3]
        labels_to_dims = {}
        labels_to_dims['positions'] = 3
        rotation_model = RotationModel(dims=[labels_to_dims[input_label]],
                                       sess=sess,
                                       fixed_num_of_contact=fixed_num_of_contact,
                                       feature_layers=kwargs['feature_layers'][0],
                                       output_layers=kwargs['output_layers'][0],
                                       learning_rate=kwargs['learning_rate'][0])
        if mode == 'train':
            sess.run(tf.global_variables_initializer())
            for i in range(niters):
                input, out = train_data[input_label][i], train_data[output_label][i]
                pred = rotation_model.train(input, out)
                logger.logkv("iter", i)
                logger.dumpkvs()
            rotation_model.save_model(exp_dir, 999)
        if mode == 'restore':
            rotation_model.restore()
            for i in range(1):
                logger.logkv("iter", i)
                _, _ = rotation_model.restore_predict(train_data[input_label][i],
                                                      train_data[output_label][i])
                logger.dumpkvs()
def test(self, input_data, labels):
    feed_dict = {self.input: input_data, self.labels: labels}
    accuracy = self.sess.run([self.geodesic_loss], feed_dict=feed_dict)[0]
    logger.logkv('test_pred_loss', accuracy)
def main(**kwargs):
    exp_dir = os.getcwd() + '/cpc_model/' + kwargs['process_type'][0] + '/n200-8'
    logger.configure(dir=exp_dir, format_strs=['stdout', 'log', 'csv'], snapshot_mode='last')
    json.dump(kwargs, open(exp_dir + '/params.json', 'w'),
              indent=2, sort_keys=True, cls=ClassEncoder)
    obs, acts, fixed_num_of_contact = pickle.load(
        open('../untrained/HandManipulateEgg-v0/5seeds-dict.pickle', 'rb'))
    include_action = kwargs['include_action'][0]
    env = gym.make(kwargs['env'][0],
                   obs_type=kwargs['obs_type'][0],
                   fixed_num_of_contact=[fixed_num_of_contact, True])
    ngeoms = env.sim.model.ngeom
    obs, object_info = expand_data(obs, ngeoms, fixed_num_of_contact)
    next_obs = obs[:, 1:]
    obs = obs[:, :-1]
    N, L, _, contact_point_dim = obs.shape
    N, L, action_dim = acts.shape
    obs_dim = (fixed_num_of_contact, contact_point_dim)

    z_dim = 8
    lr = 1e-3
    epochs = 100
    batch_size = 2
    n = 200
    k = 1

    encoder = Encoder(z_dim, obs_dim[1], fixed_num_of_contact).cuda()
    if include_action:
        trans = Transition(z_dim, action_dim).cuda()
    else:
        trans = Transition(z_dim, 0).cuda()
    decoder = Decoder(z_dim, 3).cuda()
    optim_cpc = optim.Adam(list(encoder.parameters()) + list(trans.parameters()), lr=lr)
    optim_dec = optim.Adam(decoder.parameters(), lr=lr)

    train_data, test_data = split_data([obs, acts, next_obs])
    for epoch in range(epochs):
        train_cpc(encoder, trans, optim_cpc, epoch, train_data, batch_size, n, k, include_action)
        test_cpc(encoder, trans, epoch, test_data, batch_size, n, k, include_action)
        logger.logkv("epoch", epoch)
        logger.dumpkvs()

    train_data, test_data = split_data([obs, acts, next_obs, object_info])
    for epoch in range(100):
        train_decoder(decoder, encoder, optim_dec, epoch, train_data, batch_size, include_action, n, k=1)
        test_decoder(decoder, encoder, epoch, test_data, batch_size, include_action, n, k=1)
        logger.logkv("epoch", epoch)
        logger.dumpkvs()
def main(**kwargs):
    z_dim = kwargs['z_dim']
    trans_mode = kwargs['trans_mode']
    epochs = kwargs['epochs']
    include_action = kwargs['include_action']
    label = kwargs['label']
    dataset = kwargs['data_path']
    feature_dims = kwargs['feature_dims']
    mode = kwargs['mode']
    n = kwargs['n']
    k = kwargs['k']
    encoder_lr = kwargs['encoder_lr']
    decoder_lr = kwargs['decoder_lr']
    decoder_feature_dims = kwargs['decoder_feature_dims']
    process_type = kwargs['process_type']

    if kwargs['data_path'] == '../dataset/sequence/HandManipulateEgg-v0/5seeds-dict.pickle':
        kwargs['dataset'] = 'trained_5seeds'
    elif kwargs['data_path'] == '../dataset/untrained/HandManipulateEgg-v0/5seeds-dict.pickle':
        kwargs['dataset'] = 'untrained_5seeds'
    elif kwargs['data_path'] == '../dataset/HandManipulateEgg-v09-dict.pickle':
        kwargs['dataset'] = 'trained_1seed'

    exp_dir = os.getcwd() + '/data/' + EXP_NAME + '/' + str(kwargs['seed'])
    if kwargs['debug']:
        save_dir = '../saved_cpc/' + str(label) + '/' + str(kwargs['normalize_data']) + '/' + str(process_type) + '/trained/debug'
        # save_dir = '../saved_cpc/' + str(label) + '/' + str(process_type) + '/trained/debug'
    else:
        save_dir = '../saved_cpc/' + str(label) + '/' + str(kwargs['normalize_data']) + '/' + str(process_type) + '/trained'
        # save_dir = '../saved_cpc/' + str(label) + '/' + str(process_type) + '/trained'
    logger.configure(dir=exp_dir, format_strs=['stdout', 'log', 'csv'], snapshot_mode='last')
    json.dump(kwargs, open(exp_dir + '/params.json', 'w'),
              indent=2, sort_keys=True, cls=ClassEncoder)

    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    config.gpu_options.per_process_gpu_memory_fraction = kwargs.get('gpu_frac', 0.95)
    sess = tf.Session(config=config)

    obs, acts, fixed_num_of_contact = pickle.load(open(dataset, 'rb'))
    env = gym.make(kwargs['env'],
                   obs_type=kwargs['obs_type'],
                   fixed_num_of_contact=[fixed_num_of_contact, True])
    ngeoms = env.sim.model.ngeom
    obs, object_info = expand_data(obs, ngeoms, fixed_num_of_contact)
    if kwargs['normalize_data']:
        obs = normalize_obs(obs)
    next_obs = obs[:, 1:]
    obs = obs[:, :-1]
    N, L, _, contact_point_dim = obs.shape
    N, L, action_dim = acts.shape
    obs_dim = (fixed_num_of_contact, contact_point_dim)
    train_data, test_data = split_data([obs, acts, next_obs, object_info])
    batch_size = 2

    if mode in ['restore', 'store_weights']:
        saver = tf.train.import_meta_graph(save_dir + '-999.meta')
        pur_save_dir = save_dir[:-8]
        saver.restore(sess, tf.train.latest_checkpoint(pur_save_dir))
        graph = tf.get_default_graph()

    with sess.as_default() as sess:
        encoder = Encoder(z_dim, fixed_num_of_contact, contact_point_dim, feature_dims)
        trans = Transition(z_dim, action_dim, mode=trans_mode)
        cpc = CPC(sess, encoder, trans, encoder_lr,
                  fixed_num_of_contact, contact_point_dim, action_dim,
                  include_action=include_action,
                  type=1 * (label == 'cpc1') + 2 * (label == 'cpc2'),
                  n_neg=n,
                  process_type=process_type,
                  mode=mode)
        cpc_epochs, decoder_epochs = epochs
        if mode == 'train':
            sess.run(tf.global_variables_initializer())
            logger.log("training started")
            for epoch in range(cpc_epochs):
                # train_cpc(cpc, epoch, train_data, batch_size, n, k)
                test_cpc(cpc, epoch, test_data, batch_size, n, k)
                logger.logkv("epoch", epoch)
                logger.dumpkvs()
            cpc.save_model(save_dir, 999)

            """decoder"""
            logger.log("Done with cpc training.")
            decoder = Decoder(cpc, sess, z_dim, decoder_feature_dims,
                              fixed_num_of_contact, contact_point_dim, decoder_lr)
            uninit_vars = [var for var in tf.global_variables()
                           if not sess.run(tf.is_variable_initialized(var))]
            sess.run(tf.variables_initializer(uninit_vars))
            for epoch in range(decoder_epochs):
                train_decoder(decoder, epoch, train_data, batch_size, n, k)
                test_decoder(decoder, epoch, test_data, batch_size, n, k)
                logger.logkv("epoch", (epoch + cpc_epochs))
                logger.dumpkvs()
            print("model saved in", save_dir)
        elif mode == 'restore':
            decoder = Decoder(cpc, sess, z_dim, decoder_feature_dims,
                              fixed_num_of_contact, contact_point_dim, decoder_lr)
            uninit_vars = [var for var in tf.global_variables()
                           if not sess.run(tf.is_variable_initialized(var))]
            sess.run(tf.variables_initializer(uninit_vars))
            print("initialized")
            for epoch in range(100):
                train_decoder(decoder, epoch, train_data, batch_size, n, k)
                test_decoder(decoder, epoch, test_data, batch_size, n, k)
                logger.logkv("epoch", epoch)
                logger.dumpkvs()
            print("logging to", exp_dir)
        elif mode == 'store_weights':
            old = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope='')
            old = sess.run(old)
            save_dir = './saved_model/' + str(label) + '/' + str(process_type) + '/trained/'
            with open(save_dir + 'weights.pickle', 'wb') as pickle_file:
                pickle.dump(old, pickle_file)
            print("weights saved to", save_dir)
            save_dir = '/home/vioichigo/try/tactile-baselines/saved_model/cpc2/trained'
            with open(save_dir + 'params.pickle', 'wb') as pickle_file:
                pickle.dump([z_dim, fixed_num_of_contact, contact_point_dim, action_dim,
                             encoder_lr, feature_dims, trans_mode, label, include_action],
                            pickle_file)
            tf.reset_default_graph()
            print("graph reset successfully")
def learn(*, network, env, total_timesteps, eval_env=None, seed=None, nsteps=2048,
          ent_coef=0.0, lr=3e-4, vf_coef=0.5, max_grad_norm=0.5, gamma=0.99, lam=0.95,
          log_interval=10, nminibatches=4, noptepochs=4, cliprange=0.2, save_interval=0,
          load_path=None, model_fn=None, update_fn=None, init_fn=None, mpi_rank_weight=1,
          comm=None, **network_kwargs):
    '''
    Learn policy using the PPO algorithm (https://arxiv.org/abs/1707.06347)

    Parameters:
    ----------

    network: policy network architecture. Either a string (mlp, lstm, lnlstm, cnn_lstm, cnn,
        cnn_small, conv_only - see baselines.common/models.py for the full list) specifying a
        standard network architecture, or a function that takes a tensorflow tensor as input
        and returns a tuple (output_tensor, extra_feed), where output_tensor is the last
        network layer output and extra_feed is None for feed-forward nets or a dictionary
        describing how to feed state into the network for recurrent nets.
        See common/models.py/lstm for more details on using recurrent nets in policies.

    env: baselines.common.vec_env.VecEnv environment. Needs to be vectorized for parallel
        environment simulation. The environments produced by gym.make can be wrapped using
        the baselines.common.vec_env.DummyVecEnv class.

    nsteps: int, number of steps of the vectorized environment per update (i.e. the batch
        size is nsteps * nenv where nenv is the number of environment copies simulated in
        parallel)

    total_timesteps: int, number of timesteps (i.e. number of actions taken in the environment)

    ent_coef: float, policy entropy coefficient in the optimization objective

    lr: float or function, learning rate, constant or a schedule function [0,1] -> R+ where 1
        is the beginning of training and 0 is the end of training.

    vf_coef: float, value function loss coefficient in the optimization objective

    max_grad_norm: float or None, gradient norm clipping coefficient

    gamma: float, discounting factor

    lam: float, advantage estimation discounting factor (lambda in the paper)

    log_interval: int, number of timesteps between logging events

    nminibatches: int, number of training minibatches per update. For recurrent policies,
        should be smaller than or equal to the number of environments run in parallel.

    noptepochs: int, number of training epochs per update

    cliprange: float or function, clipping range, constant or schedule function [0,1] -> R+
        where 1 is the beginning of training and 0 is the end of training

    save_interval: int, number of timesteps between saving events

    load_path: str, path to load the model from

    **network_kwargs: keyword arguments to the policy / network builder. See
        baselines.common/policies.py/build_policy and the arguments to a particular type of
        network. For instance, the 'mlp' architecture has arguments num_hidden and num_layers.
    '''
    set_global_seeds(seed)

    if isinstance(lr, float):
        lr = constfn(lr)
    else:
        assert callable(lr)
    if isinstance(cliprange, float):
        cliprange = constfn(cliprange)
    else:
        assert callable(cliprange)
    total_timesteps = int(total_timesteps)

    policy = build_policy(env, network, **network_kwargs)

    # Get the nb of env
    nenvs = env.num_envs

    # Get state_space and action_space
    ob_space = env.observation_space
    ac_space = env.action_space

    # Calculate the batch_size
    nbatch = nenvs * nsteps
    nbatch_train = nbatch // nminibatches
    is_mpi_root = (MPI is None or MPI.COMM_WORLD.Get_rank() == 0)

    # Instantiate the model object (that creates act_model and train_model)
    if model_fn is None:
        from tactile_baselines.ppo2.model import Model
        model_fn = Model

    model = model_fn(policy=policy, ob_space=ob_space, ac_space=ac_space, nbatch_act=nenvs,
                     nbatch_train=nbatch_train, nsteps=nsteps, ent_coef=ent_coef,
                     vf_coef=vf_coef, max_grad_norm=max_grad_norm, comm=comm,
                     mpi_rank_weight=mpi_rank_weight)

    if load_path is not None:
        model.load(load_path)

    # Instantiate the runner object
    runner = Runner(env=env, model=model, nsteps=nsteps, gamma=gamma, lam=lam)
    if eval_env is not None:
        eval_runner = Runner(env=eval_env, model=model, nsteps=nsteps, gamma=gamma, lam=lam)

    epinfobuf = deque(maxlen=100)
    if eval_env is not None:
        eval_epinfobuf = deque(maxlen=100)

    if init_fn is not None:
        init_fn()

    # Start total timer
    tfirststart = time.perf_counter()

    nupdates = total_timesteps // nbatch
    for update in range(1, nupdates + 1):
        assert nbatch % nminibatches == 0
        # Start timer
        tstart = time.perf_counter()
        frac = 1.0 - (update - 1.0) / nupdates
        # Calculate the learning rate
        lrnow = lr(frac)
        # Calculate the cliprange
        cliprangenow = cliprange(frac)

        if update % log_interval == 0 and is_mpi_root:
            logger.info('Stepping environment...')

        # Get minibatch
        obs, returns, masks, actions, values, neglogpacs, states, epinfos = runner.run()  # pylint: disable=E0632
        if eval_env is not None:
            eval_obs, eval_returns, eval_masks, eval_actions, eval_values, eval_neglogpacs, eval_states, eval_epinfos = eval_runner.run()  # pylint: disable=E0632

        if update % log_interval == 0 and is_mpi_root:
            logger.info('Done.')

        epinfobuf.extend(epinfos)
        if eval_env is not None:
            eval_epinfobuf.extend(eval_epinfos)

        # Here what we're going to do is for each minibatch calculate the loss and append it.
        mblossvals = []
        if states is None:  # nonrecurrent version
            # Index of each element of batch_size
            # Create the indices array
            inds = np.arange(nbatch)
            for _ in range(noptepochs):
                # Randomize the indexes
                np.random.shuffle(inds)
                # 0 to batch_size with batch_train_size step
                for start in range(0, nbatch, nbatch_train):
                    end = start + nbatch_train
                    mbinds = inds[start:end]
                    slices = (arr[mbinds] for arr in (obs, returns, masks, actions, values, neglogpacs))
                    mblossvals.append(model.train(lrnow, cliprangenow, *slices))
        else:  # recurrent version
            assert nenvs % nminibatches == 0
            envsperbatch = nenvs // nminibatches
            envinds = np.arange(nenvs)
            flatinds = np.arange(nenvs * nsteps).reshape(nenvs, nsteps)
            for _ in range(noptepochs):
                np.random.shuffle(envinds)
                for start in range(0, nenvs, envsperbatch):
                    end = start + envsperbatch
                    mbenvinds = envinds[start:end]
                    mbflatinds = flatinds[mbenvinds].ravel()
                    slices = (arr[mbflatinds] for arr in (obs, returns, masks, actions, values, neglogpacs))
                    mbstates = states[mbenvinds]
                    mblossvals.append(model.train(lrnow, cliprangenow, *slices, mbstates))

        # Feedforward --> get losses --> update
        lossvals = np.mean(mblossvals, axis=0)
        # End timer
        tnow = time.perf_counter()
        # Calculate the fps (frames per second)
        fps = int(nbatch / (tnow - tstart))

        if update_fn is not None:
            update_fn(update)

        if update % log_interval == 0 or update == 1:
            # Calculates whether the value function is a good predictor of the returns (ev > 1)
            # or if it's just worse than predicting nothing (ev =< 0)
            ev = explained_variance(values, returns)
            logger.logkv("misc/serial_timesteps", update * nsteps)
            logger.logkv("misc/nupdates", update)
            logger.logkv("misc/total_timesteps", update * nbatch)
            logger.logkv("fps", fps)
            logger.logkv("misc/explained_variance", float(ev))
            logger.logkv('eprewmean', safemean([epinfo['r'] for epinfo in epinfobuf]))
            logger.logkv('eplenmean', safemean([epinfo['l'] for epinfo in epinfobuf]))
            if eval_env is not None:
                logger.logkv('eval_eprewmean', safemean([epinfo['r'] for epinfo in eval_epinfobuf]))
                logger.logkv('eval_eplenmean', safemean([epinfo['l'] for epinfo in eval_epinfobuf]))
            logger.logkv('misc/time_elapsed', tnow - tfirststart)
            for (lossval, lossname) in zip(lossvals, model.loss_names):
                logger.logkv('loss/' + lossname, lossval)
            logger.dumpkvs()

        if save_interval and (update % save_interval == 0 or update == 1) and logger.get_dir() and is_mpi_root:
            checkdir = osp.join(logger.get_dir(), 'checkpoints')
            os.makedirs(checkdir, exist_ok=True)
            savepath = osp.join(checkdir, '%.5i' % update)
            print('Saving to', savepath)
            model.save(savepath)

    return model
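# Minimal usage sketch for learn() above (illustrative only, not a script from this repo).
# It assumes the DummyVecEnv wrapper mentioned in the docstring; the environment id and
# hyperparameter values are arbitrary choices, and _example_ppo_run is a hypothetical helper.
def _example_ppo_run():
    import gym
    from baselines.common.vec_env import DummyVecEnv

    # any flat-observation Gym env works here; Pendulum-v0 is just for illustration
    venv = DummyVecEnv([lambda: gym.make('Pendulum-v0')])
    model = learn(network='mlp', env=venv, total_timesteps=100_000,
                  nsteps=2048, nminibatches=4, noptepochs=4,
                  lr=3e-4, cliprange=0.2, log_interval=10)
    return model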
def train(*, policy, rollout_worker, evaluator, n_epochs, n_test_rollouts, n_cycles,
          n_batches, policy_save_interval, save_path, demo_file, exp_dir, **kwargs):
    rank = MPI.COMM_WORLD.Get_rank()

    logger.info("Training...")
    best_success_rate = -1

    # num_timesteps = n_epochs * n_cycles * rollout_length * number of rollout workers
    if policy.pre_train_model == 'supervised':
        # test_input, test_output = pickle.load(open(policy.feature_net_path + 'data.pickle', 'rb'))
        stored_weights = pickle.load(open(policy.feature_net_path + 'weights.pickle', 'rb'))
        restored_weights = [tf.constant(w) for w in stored_weights]

        """assign weights for main"""
        new_scope_main = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES,
                                           scope='ddpg/main/pi/process/predicted_pos')
        update_weights_main = [tf.assign(new, old)
                               for (new, old) in zip(new_scope_main, restored_weights)]
        policy.sess.run(update_weights_main)

        """assign weights for target"""
        new_scope_target = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES,
                                             scope='ddpg/target/pi/process/predicted_pos')
        update_weights_target = [tf.assign(new, old)
                                 for (new, old) in zip(new_scope_target, restored_weights)]
        policy.sess.run(update_weights_target)
    elif policy.pre_train_model == 'cpc':
        path = '/home/vioichigo/try/tactile-baselines/saved_model/cpc2/max_pool/trained/'
        stored_weights = pickle.load(open(path + 'weights.pickle', 'rb'))
        restored_weights = [tf.constant(w) for w in stored_weights]

        """assign weights for main"""
        new_scope_main = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES,
                                           scope='ddpg/main/pi/process/new_cpc')
        update_weights_main = [tf.assign(new, old)
                               for (new, old) in zip(new_scope_main, restored_weights)]
        policy.sess.run(update_weights_main)

        """assign weights for target"""
        new_scope_target = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES,
                                             scope='ddpg/target/pi/process/new_cpc')
        update_weights_target = [tf.assign(new, old)
                                 for (new, old) in zip(new_scope_target, restored_weights)]
        policy.sess.run(update_weights_target)

    for epoch in range(n_epochs):  # 200
        if policy.pre_train_model != 'none':
            auxiliary_loss = []
        start_time = time.time()

        # train
        rollout_worker.clear_history()
        for n_cycle in range(n_cycles):  # 50
            episode = rollout_worker.generate_rollouts()
            obs = policy.store_episode(episode)
            start_here = time.time()
            for i in range(n_batches):  # 40
                if policy.pre_train_model == 'none':
                    policy.train()
                else:
                    _, _, loss = policy.train()
                    auxiliary_loss.append(loss)
            policy.update_target_net()

        # test
        evaluator.clear_history()
        for _ in range(n_test_rollouts):
            evaluator.generate_rollouts()

        # record logs
        logger.record_tabular('epoch', epoch)
        for key, val in evaluator.logs('test'):
            logger.record_tabular(key, mpi_average(val))
        for key, val in rollout_worker.logs('train'):
            logger.record_tabular(key, mpi_average(val))
        for key, val in policy.logs():
            logger.record_tabular(key, mpi_average(val))
        logger.logkv('itr time', time.time() - start_time)
        if policy.pre_train_model == 'supervised':
            logger.logkv('auxiliary loss', np.array(auxiliary_loss).mean())
        if rank == 0:
            log_dict = dict([])
            for k in logger.Logger.CURRENT.name2val:
                value = logger.Logger.CURRENT.name2val[k]
                log_dict[k] = np.mean([value])
            wandb.log(log_dict)
        if rank == 0:
            logger.dump_tabular()

        # save the policy if it's better than the previous ones
        success_rate = mpi_average(evaluator.current_success_rate())
        # can't pickle SwigPyObject objects
        # if rank == 0 and success_rate >= best_success_rate and save_path and epoch % 10 == 0:
        #     best_success_rate = success_rate
        #     batch = policy.sample_batch()
        #     policy.sess.run(policy.stage_op, feed_dict=dict(zip(policy.buffer_ph_tf, batch)))
        #     critic_loss, actor_loss, Q_grad, pi_grad = policy._grads()
        #     print("calculated")
        #     if policy.pre_train_model == 'supervised':
        #         policy.sess.run(policy.stage_op, feed_dict=dict(zip(policy.buffer_ph_tf, batch)))
        #         feature_loss, feature_grad = policy.sess.run([policy.feature_loss_tf, policy.feature_grad_tf])
        #         with open(save_path + '/stats.pickle', 'wb') as pickle_file:
        #             pickle.dump([batch, critic_loss, actor_loss, Q_grad, pi_grad, feature_loss, feature_grad], pickle_file)
        #     else:
        #         with open(save_path + '/stats.pickle', 'wb') as pickle_file:
        #             pickle.dump([batch, critic_loss, actor_loss, Q_grad, pi_grad], pickle_file)
        #     policy.o_stats.save(save_path + '/o-stats' + str(epoch) + '.pickle')
        #     if policy.pre_train_model == 'cpc':
        #         policy.feature_stats.save(save_path + '/feature-stats' + str(epoch) + '.pickle')
        #     print("model saved")
        #     # actually includes the two steps above
        #     tf_util.save_variables(save_path + '/saved' + str(epoch) + '.pkl', sess=policy.sess)

        if save_path and success_rate >= best_success_rate and epoch % 10 == 0:
            best_success_rate = success_rate
            tf_util.save_variables(save_path + '/saved' + str(epoch) + '-seed' + str(rank) + '.pkl',
                                   sess=policy.sess)
            # print("vars saved")

        policy.sess.run(policy.increment_global_step)

    # make sure that different threads have different seeds
    local_uniform = np.random.uniform(size=(1,))
    root_uniform = local_uniform.copy()
    MPI.COMM_WORLD.Bcast(root_uniform, root=0)
    if rank != 0:
        assert local_uniform[0] != root_uniform[0]

    return policy
def train_cpc(encoder, trans, optimizer, epoch, train_data, batch_size, n, k=1, include_action=True):
    """Predict the next k steps."""
    global real_batch_size
    start = time.time()
    encoder.train()
    trans.train()
    train_losses = []
    train_loader = prep_data(train_data, batch_size, k, n)
    batch_num = train_loader[0].shape[0]
    batch_num = 100  # cap training at 100 batches per epoch
    for idx in range(batch_num):
        obs, obs_pos = train_loader[0][idx], train_loader[2][idx]
        # add batch here, so that each sample in the batch gets different neg samples
        obs, obs_pos = np.concatenate(obs), np.concatenate(obs_pos)  # real_batch_size x fixed_num_of_contact x contact_dim
        obs, obs_pos = obs[:real_batch_size], obs_pos[:real_batch_size]
        obs_neg = get_neg_samples(idx * batch_size, (idx + 1) * batch_size,
                                  train_loader[0], n=n, b=real_batch_size)  # b x n x fixed_num_of_contact x contact_dim
        obs, obs_pos, obs_neg = torch.from_numpy(obs), torch.from_numpy(obs_pos), torch.from_numpy(obs_neg)
        obs, obs_pos = obs.cuda(), obs_pos.cuda()  # b x 9 * contact_dim
        obs_neg = obs_neg.cuda()  # (b x n) x 9 * contact_dim
        if include_action:
            actions = train_loader[1][idx]
            actions = np.concatenate(actions)
            actions = actions[:real_batch_size]
            actions = torch.from_numpy(actions)
            actions = actions.cuda()
            loss = compute_cpc_loss(obs, obs_pos, obs_neg, encoder, trans, actions=actions)
        else:
            loss = compute_cpc_loss(obs, obs_pos, obs_neg, encoder, trans, actions=None)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        train_losses.append(loss.item())
    avg_loss = np.mean(train_losses[-50:])
    logger.logkv("cpc training loss", avg_loss)
    logger.logkv("cpc training time", time.time() - start)
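# compute_cpc_loss is not defined in this file. The sketch below is only an assumption of
# what a CPC-style InfoNCE objective over (obs, obs_pos, obs_neg) could look like; the
# calling conventions of `encoder` and `trans` (and the flattened (b*n) negative layout)
# are guesses, and compute_cpc_loss_sketch is a hypothetical name, not the repo's function.
def compute_cpc_loss_sketch(obs, obs_pos, obs_neg, encoder, trans, actions=None):
    import torch
    import torch.nn.functional as F

    b = obs.shape[0]
    z = encoder(obs)                                          # b x z_dim
    z_pos = encoder(obs_pos)                                  # b x z_dim
    z_neg = encoder(obs_neg).reshape(b, -1, z.shape[-1])      # b x n x z_dim

    # predict the next latent from the current one (and the action, if provided)
    z_pred = trans(z, actions) if actions is not None else trans(z)   # b x z_dim

    pos_logits = (z_pred * z_pos).sum(-1, keepdim=True)               # b x 1
    neg_logits = torch.bmm(z_neg, z_pred.unsqueeze(-1)).squeeze(-1)   # b x n
    logits = torch.cat([pos_logits, neg_logits], dim=1)               # b x (1 + n)

    # the positive sample sits at index 0 of every row
    labels = torch.zeros(b, dtype=torch.long, device=logits.device)
    return F.cross_entropy(logits, labels)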
def train(self, input_data, labels):
    feed_dict = {self.input: input_data, self.labels: labels}
    loss, _ = self.sess.run([self.geodesic_loss, self.op], feed_dict=feed_dict)
    logger.logkv('train_classify_loss', loss)
def train(self):
    """
    Trains policy on env using algo

    Pseudocode:
        for itr in n_itr:
            for step in num_inner_grad_steps:
                sampler.sample()
                algo.compute_updated_dists()
            algo.optimize_policy()
            sampler.update_goals()
    """
    with self.sess.as_default() as sess:
        # initialize uninitialized vars (only initialize vars that were not loaded)
        sess.run(tf.global_variables_initializer())
        start_time = time.time()

        if self.start_itr == 0:
            self.algo._update_target(tau=1.0)
            if self.n_initial_exploration_steps > 0:
                while self.replay_buffer._size < self.n_initial_exploration_steps:
                    paths = self.sampler.obtain_samples(log=True, log_prefix='train-', random=True)
                    samples_data = self.sample_processor.process_samples(
                        paths, log='all', log_prefix='train-')[0]
                    self.replay_buffer.add_samples(
                        samples_data['observations'],
                        samples_data['actions'],
                        samples_data['rewards'],
                        samples_data['dones'],
                        samples_data['next_observations'],
                    )

        for itr in range(self.start_itr, self.n_itr):
            itr_start_time = time.time()
            logger.log("\n ---------------- Iteration %d ----------------" % itr)
            logger.log("Sampling set of tasks/goals for this meta-batch...")

            """ -------------------- Sampling --------------------------"""
            logger.log("Obtaining samples...")
            time_env_sampling_start = time.time()
            paths = self.sampler.obtain_samples(log=True, log_prefix='train-')
            sampling_time = time.time() - time_env_sampling_start

            """ ----------------- Processing Samples ---------------------"""
            # check how the samples are processed
            logger.log("Processing samples...")
            time_proc_samples_start = time.time()
            samples_data = self.sample_processor.process_samples(
                paths, log='all', log_prefix='train-')[0]
            self.replay_buffer.add_samples(
                samples_data['observations'],
                samples_data['actions'],
                samples_data['rewards'],
                samples_data['dones'],
                samples_data['next_observations'],
            )
            proc_samples_time = time.time() - time_proc_samples_start

            paths = self.sampler.obtain_samples(log=True, log_prefix='eval-', deterministic=True)
            _ = self.sample_processor.process_samples(paths, log='all', log_prefix='eval-')[0]

            # self.log_diagnostics(paths, prefix='train-')

            """ ------------------ Policy Update ---------------------"""
            logger.log("Optimizing policy...")
            # This needs to take all samples_data so that it can construct graph for meta-optimization.
            time_optimization_step_start = time.time()
            self.algo.optimize_policy(self.replay_buffer,
                                      itr * self.epoch_length,
                                      self.num_grad_steps)

            """ ------------------- Logging Stuff --------------------------"""
            logger.logkv('Itr', itr)
            logger.logkv('n_timesteps', self.sampler.total_timesteps_sampled)
            logger.logkv('Time-Optimization', time.time() - time_optimization_step_start)
            logger.logkv('Time-SampleProc', np.sum(proc_samples_time))
            logger.logkv('Time-Sampling', sampling_time)
            logger.logkv('Time', time.time() - start_time)
            logger.logkv('ItrTime', time.time() - itr_start_time)
            logger.dumpkvs()
            if itr == 0:
                sess.graph.finalize()

    logger.log("Training finished")
    self.sess.close()
def obtain_samples(self, log=False, log_prefix='', random=False, deterministic=False, verbose=False):
    """
    Collect batch_size trajectories from each task

    Args:
        log (boolean): whether to log sampling times
        log_prefix (str): prefix for logger
        random (boolean): whether the actions are random

    Returns:
        (dict): A dict of paths of size [meta_batch_size] x (batch_size) x [5] x (max_path_length)
    """
    # initial setup / preparation
    paths = []
    n_samples = 0
    running_paths = [_get_empty_running_paths_dict() for _ in range(self.vec_env.num_envs)]

    if verbose:
        pbar = ProgBar(self.total_samples)
    policy_time, env_time = 0, 0

    policy = self.policy
    policy.reset(dones=[True] * self.vec_env.num_envs)

    # initial reset of meta_envs
    obses = np.asarray(self.vec_env.reset())

    while n_samples < self.total_samples:
        # execute policy
        t = time.time()
        if self.vae is not None:
            obses = np.array(obses)
            obses = self.vae.encode(obses)
        if random:
            actions = np.stack([self.env.action_space.sample()
                                for _ in range(self.vec_env.num_envs)], axis=0)
            agent_infos = {}
        elif deterministic:
            actions, agent_infos = policy.get_actions(obses)
            actions = [a_i['mean'] for a_i in agent_infos]
            if self.policy.squashed:
                actions = np.tanh(actions)
        else:
            actions, agent_infos = policy.get_actions(obses)
        policy_time += time.time() - t

        # step environments
        t = time.time()
        next_obses, rewards, dones, env_infos = self.vec_env.step(actions)
        env_time += time.time() - t

        # stack agent_infos and if no infos were provided (--> None) create empty dicts
        agent_infos, env_infos = self._handle_info_dicts(agent_infos, env_infos)

        new_samples = 0
        for idx, observation, action, reward, env_info, agent_info, done in zip(
                itertools.count(), obses, actions, rewards, env_infos, agent_infos, dones):
            # append new samples to running paths
            if isinstance(reward, np.ndarray):
                reward = reward[0]
            running_paths[idx]["observations"].append(observation)
            running_paths[idx]["actions"].append(action)
            running_paths[idx]["rewards"].append(reward)
            running_paths[idx]["dones"].append(done)
            running_paths[idx]["env_infos"].append(env_info)
            running_paths[idx]["agent_infos"].append(agent_info)

            # if running path is done, add it to paths and empty the running path
            if done:
                paths.append(dict(
                    observations=np.asarray(running_paths[idx]["observations"]),
                    actions=np.asarray(running_paths[idx]["actions"]),
                    rewards=np.asarray(running_paths[idx]["rewards"]),
                    dones=np.asarray(running_paths[idx]["dones"]),
                    env_infos=utils.stack_tensor_dict_list(running_paths[idx]["env_infos"]),
                    agent_infos=utils.stack_tensor_dict_list(running_paths[idx]["agent_infos"]),
                ))
                new_samples += len(running_paths[idx]["rewards"])
                running_paths[idx] = _get_empty_running_paths_dict()

        if verbose:
            pbar.update(self.vec_env.num_envs)
        n_samples += new_samples
        obses = next_obses
    if verbose:
        pbar.stop()

    self.total_timesteps_sampled += self.total_samples
    if log:
        logger.logkv(log_prefix + "PolicyExecTime", policy_time)
        logger.logkv(log_prefix + "EnvExecTime", env_time)

    return paths
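# Illustrative only: one way the path dicts returned by obtain_samples are consumed
# downstream (it mirrors what _log_path_stats reads from each path). `sampler` stands in
# for any object exposing obtain_samples, and _example_consume_paths is a hypothetical
# helper name, not part of the sampler class.
def _example_consume_paths(sampler):
    paths = sampler.obtain_samples(log=True, log_prefix='train-')
    # each path dict stacks per-step arrays, so undiscounted returns can be read off directly
    undiscounted_returns = [path["rewards"].sum() for path in paths]
    return np.mean(undiscounted_returns), len(paths)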