def __init__(self, args):
    self.args = args

    ######### Initialize the Multiagent Team of agents ########
    if self.args.ps == 'full' or self.args.ps == 'trunk':
        # A single agent whose parameters are shared across the whole team
        self.agents = [Agent(self.args, 0)]
    elif self.args.ps == 'none':
        # A separate neural network for each agent
        self.agents = [Agent(self.args, id) for id in range(self.args.config.num_agents)]
    else:
        sys.exit('Incorrect PS choice')
    self.test_agent = TestAgent(self.args, 991)

    ###### Buffer and Model Bucket as references to the corresponding agent's attributes ####
    if args.ps == 'trunk':
        self.buffer_bucket = [buffer.tuples for buffer in self.agents[0].buffer]
    else:
        self.buffer_bucket = [ag.buffer.tuples for ag in self.agents]

    # Three different sets of networks: for evo, PG, and test rollouts
    self.popn_bucket = [ag.popn for ag in self.agents]
    self.rollout_bucket = [ag.rollout_actor for ag in self.agents]
    self.test_bucket = self.test_agent.rollout_actor

    ######### EVOLUTIONARY WORKERS ############
    if self.args.popn_size > 0:
        # One rollout per (population member, evaluation) pair for computing fitness;
        # popn_bucket holds the networks used for evo rollouts
        self.evo_task_pipes = [Pipe() for _ in range(args.popn_size * args.num_evals)]
        self.evo_result_pipes = [Pipe() for _ in range(args.popn_size * args.num_evals)]
        self.evo_workers = [Process(target=rollout_worker,
                                    args=(self.args, i, 'evo', self.evo_task_pipes[i][1],
                                          self.evo_result_pipes[i][0], self.buffer_bucket,
                                          self.popn_bucket, True, RANDOM_BASELINE))
                            for i in range(args.popn_size * args.num_evals)]
        for worker in self.evo_workers:
            worker.start()

    ######### POLICY GRADIENT WORKERS ############
    if self.args.rollout_size > 0:
        # rollout_bucket holds the networks used for PG rollouts
        self.pg_task_pipes = Pipe()
        self.pg_result_pipes = Pipe()
        self.pg_workers = [Process(target=rollout_worker,
                                   args=(self.args, 0, 'pg', self.pg_task_pipes[1],
                                         self.pg_result_pipes[0], self.buffer_bucket,
                                         self.rollout_bucket, self.args.rollout_size > 0,
                                         RANDOM_BASELINE))]
        for worker in self.pg_workers:
            worker.start()

    ######### TEST WORKERS ############
    # test_bucket holds the networks used for test rollouts
    self.test_task_pipes = Pipe()
    self.test_result_pipes = Pipe()
    self.test_workers = [Process(target=rollout_worker,
                                 args=(self.args, 0, 'test', self.test_task_pipes[1],
                                       self.test_result_pipes[0], None, self.test_bucket,
                                       False, RANDOM_BASELINE))]
    for worker in self.test_workers:
        worker.start()

    #### STATS AND TRACKING WHICH ROLLOUT IS DONE ######
    self.best_score = -999
    self.total_frames = 0
    self.gen_frames = 0
    self.test_trace = []
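The evo/PG/test workers above communicate exclusively over multiprocessing pipes: the driver keeps pipe[0] of each task pipe and pipe[1] of each result pipe, while the workers hold the opposite ends. A minimal sketch of one generation's handshake, assuming `rollout_worker` accepts an integer task id and replies with a fitness (the message format here is an assumption, not taken from the source):

def hypothetical_evo_generation(self):
    # Dispatch one rollout request per (population member, evaluation) pair
    for i, task_pipe in enumerate(self.evo_task_pipes):
        task_pipe[0].send(i)  # driver keeps the send end of each task pipe
    # Block until every worker reports a fitness on its result pipe
    return [result_pipe[1].recv() for result_pipe in self.evo_result_pipes]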
def gen_traj_loop():
    n = 100
    agent = Agent(env_factory, policy_list[0],
                  running_state=running_state_list[0],
                  render=args.render,
                  num_threads=args.num_threads,
                  mode_list=args.mode_list,
                  state_type=args.state_type,
                  num_steps_per_mode=args.num_steps_per_mode)
    env_data_dict = {'num_goals': 1}
    expert_data_dict = {}
    i_iter = 0
    print('Writing to h5 file ...')
    while i_iter < n:
        # vid_folder = str(i_iter)
        vid_folder = None
        path_key = str(i_iter) + '_0'
        returned_dict, save_flag = agent.generate_mixed_expert_trajs(
            policy_list, running_state_list, vid_folder=vid_folder)
        if save_flag:
            expert_data_dict[path_key] = returned_dict
            i_iter += 1
            print(i_iter)
    save_expert_traj_dict_to_h5(expert_data_dict, args.traj_save_dir)
def setUp(self):
    time_step = 0.05
    self.agents = [Agent(0, 0, time_step)]
    self.arena = Arena(10, 20)
    self.biased_grid = BiasedGrid(self.arena.get_dimensions())
    self.algorithm = AStar(self.arena, self.agents, self.biased_grid)
    self.cli = CommandLine(self.arena, self.agents)
def abort():
    job_id = request.args.get('job_id')
    # todo - need to fix this, it isn't being processed properly (parallelism?)
    # set_job(job_id, {'job_status': 'requested_abort'})
    Agent(job_id=job_id, role='execute').report_job(job_id, 'requested aborting')
    return {}
def learn_pairs(label_interest=5, n_jumps_test=50):
    """
    :param label_interest: MNIST label of interest
    :param n_jumps_test: how many test saccades to make for one image;
        as we increase `n_jumps_test`, we expect the overlap with the L23
        train history to decrease over time, since during training we observe
        only the most significant features of an image. Ideally, we'd like
        the overlap not to decrease much over time.
    """
    images, labels = load_mnist.load_images(images_number=100)
    world = World()
    poppy = Agent()
    images_interest = images[labels == label_interest]
    for image in images_interest:
        world.add_image(image)
        poppy.cortex.reset_activations()
        l23_train = poppy.learn_pairs(world, label_interest)
        world.reset()
        if n_jumps_test == 0:
            l23_test = poppy.learn_pairs(world, label=label_interest)
        else:
            l23_test = []
            poppy.sense_data(world)
            for saccade in range(n_jumps_test):
                poppy.sense_data(world)
                l23_test.append(poppy.cortex.V1.layers['L23'].cells.copy())
            l23_test = np.vstack(l23_test)
        overlap = np.dot(l23_train, l23_test.T)
        overlap = (overlap * 255 / poppy.cortex.V1.layers['L23'].n_active).astype(np.uint8)
        cv2.imshow('overlap', overlap)
        cv2_step()
def disable_agent():
    set_global('agent_status', 'disabled')
    for job in get_db('jobs'):
        if job['status'] != 'completed':
            agent = Agent(job['id'])
            assign_agent = job['assign_agent']
            agent.deorchestrate(assign_agent['url'], assign_agent['port'])
    return {}
def main(args):
    logger = create_logger(build_expname, args)
    initialize_logger(logger)

    """create agent"""
    agent = Agent(env, policy_net, device,
                  running_state=running_state,
                  render=args.render,
                  num_threads=args.num_threads)
    exp = Experiment(agent, env, logger, args)
    exp.main_loop()
def _read_json(self, filepath):
    print('Importing file', filepath)
    with open(filepath) as f:
        data = json.load(f)

    for varname in data['variables']:
        name = varname
        domain = data['variables'][varname]['domain']
        id = data['variables'][varname]['id']
        constr = data['variables'][varname]['cons']
        type = data['variables'][varname]['type']
        agent = data['variables'][varname]['agent']
        self.variables[name] = Variable(name=name, domain=domain, type='decision')

    for con in data['constraints']:
        name = con
        scope = data['constraints'][con]['scope']
        costs = data['constraints'][con]['vals']
        domains = [self.variables[vname].domain for vname in scope]
        all_tuples = list(product(*domains))
        assert len(all_tuples) == len(costs)
        con_values = {all_tuples[i]: costs[i] for i in range(len(all_tuples))}
        self.constraints[name] = Constraint(
            name,
            scope=[self.variables[vid] for vid in scope],
            values=con_values)
        # add constraint to variables
        for vid in scope:
            self.variables[vid].addConstraint(self.constraints[name])

    for agt in data['agents']:
        name = agt
        var_names = data['agents'][agt]['vars']
        # collect the agent's constraints without duplicates
        agt_constraints = []
        for vid in var_names:
            for c in self.variables[vid].constraints:
                if c not in agt_constraints:
                    agt_constraints.append(c)
        self.agents[name] = Agent(
            name,
            variables=[self.variables[vid] for vid in var_names],
            constraints=agt_constraints)
        for vid in var_names:
            self.variables[vid].setOwner(self.agents[name])

    # Connect neighbors:
    for con in self.constraints:
        clique = [var.controlled_by for var in self.constraints[con].scope]
        for ai, aj in permutations(clique, 2):
            ai.addNeighbor(aj, self.constraints[con])
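A hypothetical input illustrating the JSON schema `_read_json` consumes; the key names mirror the lookups above, while the concrete names and values are invented. Note that 'vals' must enumerate one cost per tuple in the cross-product of the scope's domains (2 x 2 = 4 here):

example_data = {
    'variables': {
        'v_0': {'domain': [0, 1], 'id': 0, 'cons': ['c_0'], 'type': 'decision', 'agent': 'a_0'},
        'v_1': {'domain': [0, 1], 'id': 1, 'cons': ['c_0'], 'type': 'decision', 'agent': 'a_1'},
    },
    'constraints': {
        'c_0': {'scope': ['v_0', 'v_1'], 'vals': [0, 3, 3, 0]},
    },
    'agents': {
        'a_0': {'vars': ['v_0']},
        'a_1': {'vars': ['v_1']},
    },
}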
def init_agent():
    services, workers, session, _ = parse_old_config()
    endpoint = Service('cmd_responder', EventSetOutputConnector('cmd_responder').send,
                       StateManager.save_dialog_dict, 1, ['responder'])
    input_srv = Service('input', None, StateManager.add_human_utterance_simple_dict,
                        1, ['input'])
    pipeline = Pipeline(services)
    pipeline.add_responder_service(endpoint)
    pipeline.add_input_service(input_srv)
    agent = Agent(pipeline, StateManager())
    return agent, session
def run():
    from core.agent import Agent
    from core.state_manager import StateManager
    from core.skill_manager import SkillManager
    from core.rest_caller import RestCaller
    from core.service import Service
    from core.postprocessor import DefaultPostprocessor
    from core.response_selector import ConfidenceResponseSelector
    from core.skill_selector import ChitchatQASelector
    from core.config import MAX_WORKERS, ANNOTATORS, SKILL_SELECTORS, SKILLS
    import logging

    logging.getLogger('requests.packages.urllib3.connectionpool').setLevel(logging.WARNING)

    state_manager = StateManager()

    anno_names, anno_urls = zip(*[(annotator['name'], annotator['url'])
                                  for annotator in ANNOTATORS])
    preprocessor = Service(
        rest_caller=RestCaller(max_workers=MAX_WORKERS, names=anno_names, urls=anno_urls))
    postprocessor = DefaultPostprocessor()
    skill_caller = RestCaller(max_workers=MAX_WORKERS)
    response_selector = ConfidenceResponseSelector()
    ss_names, ss_urls = zip(*[(annotator['name'], annotator['url'])
                              for annotator in SKILL_SELECTORS])
    skill_selector = ChitchatQASelector(
        rest_caller=RestCaller(max_workers=MAX_WORKERS, names=ss_names, urls=ss_urls))
    skill_manager = SkillManager(
        skill_selector=skill_selector,
        response_selector=response_selector,
        skill_caller=skill_caller,
        profile_handlers=[skill['name'] for skill in SKILLS
                          if skill.get('profile_handler')])
    agent = Agent(state_manager, preprocessor, postprocessor, skill_manager)

    def infer(messages: Collection[Message], dialog_ids):
        utterances: List[Optional[str]] = [message.text for message in messages]
        tg_users: List[User] = [message.from_user for message in messages]
        u_tg_ids = [str(user.id) for user in tg_users]
        u_tg_data = [{'id': user.id,
                      'username': user.username,
                      'first_name': user.first_name,
                      'last_name': user.last_name}
                     for user in tg_users]
        u_d_types = [None] * len(messages)
        date_times = [datetime.utcnow()] * len(messages)
        locations: List[Optional[Location]] = [message.location for message in messages]
        ch_types = ['telegram'] * len(messages)
        answers = agent(utterances=utterances, user_telegram_ids=u_tg_ids,
                        user_device_types=u_d_types, date_times=date_times,
                        locations=locations, channel_types=ch_types)
        return answers

    return infer
def _create_agents(self, n):
    """
    Create the agent that owns variable `v_n` and register it as that
    variable's owner.

    :param n: index used to derive the agent and variable names
    """
    name, vid = 'a_' + str(n), 'v_' + str(n)
    self.agents[name] = Agent(name,
                              variables=[self.variables[vid]],
                              constraints=self.variables[vid].constraints)
    self.variables[vid].setOwner(self.agents[name])
def setup(self):
    nbr_agents = SimEngine.get_gui_value('nbr_agents')
    for _ in range(nbr_agents):
        # When created, an agent adds itself to self.agents and to its patch's list of Agents.
        # self.agent_class(scale=1)
        Agent(scale=1)
    initial_velocities = cycle([Velocity((-1, -1)), Velocity((-1, 1)),
                                Velocity((0, 0)),
                                Velocity((1, -1)), Velocity((1, 1))])
    for (agent, vel) in zip(World.agents, initial_velocities):
        agent.set_velocity(vel)
def orchestrate(job):
    job_id = job.job_id
    agent = Agent(job_id=job_id, role='orchestrate')
    job.set('start_time', time.time())
    git_repo = job.get('git_repo')
    file_name = job.get('file_name')
    agent.log('orchestrating', report=True, job_id=job_id)

    exec_agents = json.loads(
        requests.get(f'http://{get_global("tracker_host")}:3000/assign_agents',
                     params={
                         'source': job.get('submitter_name'),
                         'orchestrator': get_global('agent_name'),
                         'required': 2
                     }).content.decode('ascii'))
    job.set('executors', exec_agents)
    agent.report_job(job_id, f'executors: {exec_agents}')

    for exec_agent in exec_agents:
        agent.log(f'sending to executor: {exec_agent["name"]}', report=True, job_id=job_id)
        time.sleep(5)
        try:
            # Fire-and-forget: the near-zero timeout deliberately triggers a
            # ReadTimeout so we never block on the executor's response.
            requests.get(
                f'http://{exec_agent["url"]}:{exec_agent["port"]}/execute',
                params={
                    'git_repo': git_repo,
                    'file_name': file_name,
                    'job_id': job_id,
                    'submission_time': job.get('submission_time'),
                    'submitter_name': job.get('submitter_name'),
                    'submitter_url': job.get('submitter_url'),
                    'submitter_port': job.get('submitter_port'),
                    'orchestrator_name': get_global('agent_name'),
                    'orchestrator_url': get_global('agent_url'),
                    'orchestrator_port': get_global('agent_port')
                },
                timeout=0.0000000001)
        except requests.exceptions.ReadTimeout:
            pass

    time.sleep(3)
    Thread(target=sync, kwargs={'job_id': job_id}).start()
    set_global('agent_status', 'connected')
    return f'sent job {job.job_id} to executors: {exec_agents}'
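The near-zero timeout plus the swallowed ReadTimeout above amounts to fire-and-forget HTTP; the same trick appears in `submit` below. A hypothetical helper, not present in the source, that names the pattern:

def fire_and_forget(url, params):
    # Issue the GET but never wait for (or read) the reply; the tiny timeout
    # raises ReadTimeout almost immediately, which we deliberately ignore.
    try:
        requests.get(url, params=params, timeout=0.0000000001)
    except requests.exceptions.ReadTimeout:
        pass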
def prepare_agent(services, endpoint: Service, input_serv: Service, use_response_logger: bool):
    pipeline = Pipeline(services)
    pipeline.add_responder_service(endpoint)
    pipeline.add_input_service(input_serv)
    if use_response_logger:
        response_logger_callable = response_logger
    else:
        response_logger_callable = None
    agent = Agent(pipeline, StateManager(),
                  response_logger_callable=response_logger_callable)
    return agent.register_msg, agent.process
def submit(job):
    agent = Agent(job_id=job.job_id, role='submit')
    try:
        agent.report_job(job.job_id, 'submitting')
        # todo - add tracker object
        orchestrator_agent = request_orchestrator(agent, 1, job_id=job.job_id)
        job.set('assigned_agent', orchestrator_agent)
        # agent.report(f'sending job: {job.job_id}, to orchestrator: {orchestrator_agent}', job_id=job.job_id)
        agent.report_job(job.job_id, f'sending to orchestrator: {orchestrator_agent}')
        submission_time = str(datetime.datetime.now())
        # todo - make sure api call is not waiting for response, then have job.set_many after call
        job_params = {
            'job_status': 'submitted',
            'submission_time': submission_time,
        }
        job.set_many(job_params)
        # todo - this is an agent skill
        # todo - handle async
        try:
            # Fire-and-forget: the near-zero timeout raises ReadTimeout immediately
            # so we don't wait for the orchestrator's response.
            requests.get(
                f'http://{orchestrator_agent["url"]}:{orchestrator_agent["port"]}/orchestrate',
                params={
                    'git_repo': job.get('git_repo'),
                    'file_name': job.get('file_name'),
                    'job_id': job.job_id,
                    'submission_time': submission_time,
                    'submitter_name': get_global('agent_name'),
                    'submitter_url': get_global('agent_url'),
                    'submitter_port': get_global('agent_port')
                },
                timeout=0.0000000001)
        except requests.exceptions.ReadTimeout:
            pass
        agent.set('agent_status', 'connected')
    except Exception as e:
        agent.log(e)
        return f'error submitting job {job.job_id}: {e}'
    return {
        'status': 'submitted',
        'timestamp': submission_time,
        'orchestrator': orchestrator_agent,
        'job_id': job.job_id
    }
def __init__(self, args):
    self.args = args

    ######### Initialize the Multiagent Team of agents ########
    self.agents = Agent(self.args, 0)  # single shared predator team (agent id 0)
    self.prey_agent = PreyAgent(self.args, -1)
    self.test_agent = TestAgent(self.args, 991)

    ###### Buffer and Model Bucket as references to the corresponding agent's attributes ####
    self.predator_buffer_bucket = [buffer.tuples for buffer in self.agents.buffer]
    self.prey_buffer_bucket = [self.prey_agent.buffer[0].tuples]
    self.popn_bucket = self.agents.popn
    self.predator_rollout_bucket = self.agents.rollout_actor
    self.prey_rollout_bucket = self.prey_agent.rollout_actor
    self.predator_test = self.test_agent.predator
    self.prey_test = self.test_agent.prey

    ######### EVOLUTIONARY WORKERS ############
    if self.args.popn_size > 0:
        self.evo_task_pipes = [Pipe() for _ in range(args.popn_size * args.num_evals)]
        self.evo_result_pipes = [Pipe() for _ in range(args.popn_size * args.num_evals)]
        self.evo_workers = [Process(target=rollout_worker,
                                    args=(self.args, i, 'evo', self.evo_task_pipes[i][1],
                                          self.evo_result_pipes[i][0],
                                          self.predator_buffer_bucket, self.prey_buffer_bucket,
                                          self.popn_bucket, self.prey_rollout_bucket,
                                          True, args.config.config))
                            for i in range(args.popn_size * args.num_evals)]
        for worker in self.evo_workers:
            worker.start()

    ######### POLICY GRADIENT WORKERS ############
    if self.args.rollout_size > 0:
        self.pg_task_pipes = Pipe()
        self.pg_result_pipes = Pipe()
        self.pg_workers = [Process(target=rollout_worker,
                                   args=(self.args, 0, 'pg', self.pg_task_pipes[1],
                                         self.pg_result_pipes[0],
                                         self.predator_buffer_bucket, self.prey_buffer_bucket,
                                         self.predator_rollout_bucket, self.prey_rollout_bucket,
                                         self.args.rollout_size > 0, args.config.config))]
        for worker in self.pg_workers:
            worker.start()

    ######### TEST WORKERS ############
    self.test_task_pipes = Pipe()
    self.test_result_pipes = Pipe()
    self.test_workers = [Process(target=rollout_worker,
                                 args=(self.args, 0, 'test', self.test_task_pipes[1],
                                       self.test_result_pipes[0], None, None,
                                       self.predator_test, self.prey_test,
                                       False, args.config.config))]
    for worker in self.test_workers:
        worker.start()

    #### STATS AND TRACKING WHICH ROLLOUT IS DONE ######
    self.best_score = -999
    self.total_frames = 0
    self.gen_frames = 0
    self.test_trace = []
def create_agents(self):
    """
    Create the agent list.

    :return: list containing all generated agents
    """
    # create agents
    for count in range(self.number_of_agents):
        self.agent_list.append(Agent(count, True))
    logging.debug("agent list created with " + str(len(self.agent_list)) + " agents")
    return self.agent_list
def one_image(label_interest=5):
    images, labels = load_mnist.load_images(images_number=100)
    world = World()
    poppy = Agent()
    image_interest = images[labels == label_interest][0]
    world.add_image(image_interest)
    poppy.cortex.reset_activations()
    poppy.cortex.display = True
    while True:
        poppy.sense_data(world)
        poppy.cortex.associate(label=label_interest)
def main(cfg):
    env_name, gamma, tau, policy_state, filter_state = \
        cfg.require("env name", "advantage gamma", "advantage tau",
                    "policy state dict", "filter state dict")

    filter_op = ZFilter(gamma, tau)
    # env = FakeGym(env_name)
    env = FakeRLBench(env_name)
    policy = Policy(cfg, env.info())
    agent = Agent(cfg, env, policy, filter_op)

    # ---- start replaying ---- #
    if policy_state is not None:
        agent.policy().reset(policy_state)
    if filter_state is not None:
        agent.filter().reset(filter_state)

    print("Info: Start replaying saved model")
    replay_loop(cfg, agent)
    print("Done")
def main(cfg):
    env_name, use_zf, gamma, tau, policy_state, filter_state = \
        cfg.require("env name", "use zfilter", "advantage gamma", "advantage tau",
                    "policy state dict", "filter state dict")

    logger = Logger()
    logger.init(cfg)

    filter_op = ZFilter(gamma, tau, enable=use_zf)
    env = FakeGym(env_name)
    policy = Policy(cfg, env.info())
    agent = Agent(cfg, env, policy, filter_op)

    # ---- start training ---- #
    if policy_state is not None:
        agent.policy().reset(policy_state)
    if filter_state is not None:
        agent.filter().reset(filter_state)

    train_loop(cfg, agent, logger)
    print("Done")
def complete():
    try:
        job_id = request.args.get('job_id')
        completing_agent = request.args.get('agent_name')
        job_params = {
            'job_status': 'completed',
            'completion_time': request.args.get('completion_time'),
            'executor_name': request.args.get('executor_name'),
            'executor_url': request.args.get('executor_url'),
            'executor_port': request.args.get('executor_port')
        }
        set_job(job_id, job_params)
        if get_job(job_id)['role'] == 'orchestrate':
            agent = Agent(job_id)
            # Abort the job on every executor other than the one that finished it
            for executor in list(get_job(job_id)['executors']):
                if executor['name'] != completing_agent:
                    agent.request_abort(job_id, executor['url'], executor['port'])
        return str(job_params)
    except Exception as e:
        log.exception('unable to complete')
        return {}
def main(cfg):
    env_name, action_mode, policy_state, filter_state = \
        cfg.require("env name", "action mode", "policy state dict", "filter state dict")

    logger = Logger()
    logger.init(cfg)

    filter_op = Filter()
    # env = FakeGym(env_name)
    env = FakeRLBench(env_name, action_mode=action_mode)
    policy = Policy(cfg, env.info())
    agent = Agent(cfg, env, policy, filter_op)

    # ---- start training ---- #
    if policy_state is not None:
        agent.policy().reset(policy_state)
    if filter_state is not None:
        agent.filter().reset(filter_state)

    train_loop(cfg, agent, logger)
    print("Done")
def test(world, restore=False, show=True):
    """
    Run BECCA with a world.

    If restore is True, this method loads a saved agent if it can find one;
    otherwise it creates a new one. It connects the agent and the world
    together and runs them for as long as the world dictates.
    """
    agent_name = '_'.join((world.name, 'agent'))
    agent = Agent(world.num_sensors, world.num_actions,
                  agent_name=agent_name, show=show)
    if restore:
        agent = agent.restore()
    actions = np.zeros((world.num_actions, 1))
    # Repeat the loop through the duration of the existence of the world
    while world.is_alive():
        sensors, reward = world.step(actions)
        world.visualize(agent)
        actions = agent.step(sensors, reward)
    return agent.report_performance()
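A minimal sketch, not from the BECCA source, of the interface `test` assumes of a world: `is_alive` bounds the loop, and `step` consumes the previous actions and returns `(sensors, reward)`. The class name, lifespan, and random sensors are illustrative:

import numpy as np

class DummyWorld:
    """Hypothetical world exercising the agent/world handshake in `test`."""
    name = 'dummy'
    num_sensors = 4
    num_actions = 2

    def __init__(self, lifespan=100):
        self.timestep = 0
        self.lifespan = lifespan

    def is_alive(self):
        return self.timestep < self.lifespan

    def step(self, actions):
        # Consume last step's actions, advance time, emit new sensors and a reward
        self.timestep += 1
        sensors = np.random.rand(self.num_sensors, 1)
        reward = float(actions.sum())
        return sensors, reward

    def visualize(self, agent):
        pass  # no-op in this sketch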
print("Device: " + str(args.device)) print("Seed: " + str(args.seed)) print("Number of CPU threads: " + str(args.num_threads)) print("Number of Expert Trajectories: " + str(args.num_trajs)) print("---------------------------------------") print("Training DRIL Imitator for {} Epochs".format(args.max_iter_num)) # create imitator imitator = DRIL(args, state_dim, action_dim, is_disc_action) imitator.set_expert(expert_traj, args.num_trajs) print("Starting Ensemble!") imitator.train_ensemble() print("Finished Ensemble!") # create agent agent = Agent(env, imitator.policy.actor, args.device, running_state=running_state, render=args.render, num_threads=args.num_threads) log_list = {"bc_loss": [], "uncertainty_cost":[], "avg_reward": [], "std_reward": []} total_timesteps = 0 for i_iter in range(args.max_iter_num): batch, log = agent.collect_samples(args.min_batch_size) # train DRIL t0 = time.time() loss = imitator.train(batch) t1 = time.time()
        env_dummy.action_space.shape[0], log_std=args.log_std)
    value_net = Value(state_dim)
else:
    policy_net, value_net, running_state = pickle.load(
        open(args.model_path, "rb"))
policy_net.to(device)
value_net.to(device)
del env_dummy

optimizer_policy = torch.optim.Adam(policy_net.parameters(), lr=0.01)
optimizer_value = torch.optim.Adam(value_net.parameters(), lr=0.01)

"""create agent"""
agent = Agent(env_factory, policy_net, device,
              running_state=running_state, render=args.render,
              num_threads=args.num_threads)


def update_params(batch):
    states = torch.from_numpy(np.stack(batch.state)).to(dtype).to(device)
    actions = torch.from_numpy(np.stack(batch.action)).to(dtype).to(device)
    rewards = torch.from_numpy(np.stack(batch.reward)).to(dtype).to(device)
    masks = torch.from_numpy(np.stack(batch.mask)).to(dtype).to(device)
    with torch.no_grad():
        values = value_net(states)

    """get advantage estimation from the trajectories"""
    advantages, returns = estimate_advantages(rewards, masks, values,
                                              args.gamma, args.tau, device)

    """perform TRPO update"""
if args.use_running_state:
    # running estimate of state statistics, giving access to a precise mean and std
    running_state = ZFilter((state_dim,), clip=5)
else:
    running_state = None

"""seeding"""
np.random.seed(args.seed)
torch.manual_seed(args.seed)
env.seed(args.seed)

policy_net = Policy(state_dim, action_dim, log_std=args.log_std)
value_net = Value(state_dim)
policy_net.to(device)
value_net.to(device)

agent_trpo = Agent(env, policy_net, device, running_state=running_state,
                   render=args.render, num_threads=1)


def update_params_trpo(batch):  # (3)
    states = torch.from_numpy(np.stack(batch.state)).to(args.dtype).to(device)
    actions = torch.from_numpy(np.stack(batch.action)).to(args.dtype).to(device)
    rewards = torch.from_numpy(np.stack(batch.reward)).to(args.dtype).to(device)
    masks = torch.from_numpy(np.stack(batch.mask)).to(args.dtype).to(device)
    with torch.no_grad():
        # estimate the value function of each state with the NN
        values = value_net(states)

    """get advantage estimation from the trajectories"""
    advantages, returns = estimate_advantages(rewards, masks, values,
                                              args.gamma, args.tau, device)

    """perform TRPO update"""
    trpo_step(policy_net, value_net, states, actions, returns, advantages,
              args.max_kl_trpo, args.damping, args.l2_reg)
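`estimate_advantages` is called but not shown; a plausible sketch of what it computes, assuming Generalized Advantage Estimation (the actual helper may differ in normalization and device handling). `masks` is 0 at episode boundaries so bootstrapping stops across trajectory ends; all tensors are assumed to share the shape (N, 1):

import torch

def estimate_advantages_sketch(rewards, masks, values, gamma, tau, device):
    advantages = torch.zeros_like(rewards)
    prev_value, prev_advantage = 0.0, 0.0
    # Backward recursion: delta_t = r_t + gamma * V(s_{t+1}) - V(s_t),
    # then A_t = delta_t + gamma * tau * A_{t+1}
    for i in reversed(range(rewards.size(0))):
        delta = rewards[i] + gamma * prev_value * masks[i] - values[i]
        advantages[i] = delta + gamma * tau * prev_advantage * masks[i]
        prev_value = values[i]
        prev_advantage = advantages[i]
    returns = values + advantages
    advantages = (advantages - advantages.mean()) / advantages.std()
    return advantages.to(device), returns.to(device)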
    policy_net, value_net, running_state = pickle.load(
        open(args.model_path, "rb"))
policy_mgr.to(device)
policy_wrk.to(device)
value_mgr.to(device)
value_wrk.to(device)

# optim_policy_m = torch.optim.Adam(policy_mgr.parameters(), lr=0.01)
# optim_policy_w = torch.optim.Adam(policy_wrk.parameters(), lr=0.01)
# optim_value_m = torch.optim.Adam(value_mgr.parameters(), lr=0.01)
# optim_value_w = torch.optim.Adam(value_wrk.parameters(), lr=0.01)

"""create agent"""
agent = Agent(env, policy_mgr, policy_wrk, device,
              running_state=running_state, render=args.render,
              num_threads=args.num_threads)


def update_params(batch_mgr, batch_wrk):
    states_mgr = torch.from_numpy(np.stack(batch_mgr.state)).to(dtype).to(device)
    directions = torch.from_numpy(np.stack(batch_mgr.action)).to(dtype).to(device)
    rewards_mgr = torch.from_numpy(np.stack(batch_mgr.reward)).to(dtype).to(device)
    masks_mgr = torch.from_numpy(np.stack(batch_mgr.mask)).to(dtype).to(device)
    states_wrk = torch.from_numpy(np.stack(
policy_net = policy_net.cuda()
value_net = value_net.cuda()
del env_dummy

optimizer_policy = torch.optim.Adam(policy_net.parameters(), lr=args.learning_rate)
optimizer_value = torch.optim.Adam(value_net.parameters(), lr=args.learning_rate)

# optimization epoch number and batch size for PPO
optim_epochs = 5
optim_batch_size = 1000

"""create agent"""
agent = Agent(env_factory, policy_net, running_state=running_state,
              render=args.render, num_threads=args.num_threads,
              logger=logger_data)


def update_params(batch, i_iter):
    states = torch.from_numpy(np.stack(batch.state))
    actions = torch.from_numpy(np.stack(batch.action))
    rewards = torch.from_numpy(np.stack(batch.reward))
    masks = torch.from_numpy(np.stack(batch.mask).astype(np.float64))
    if use_gpu:
        states, actions, rewards, masks = \
            states.cuda(), actions.cuda(), rewards.cuda(), masks.cuda()
    values = value_net(Variable(states, volatile=True)).data
    fixed_log_probs = policy_net.get_log_prob(Variable(states, volatile=True),
                                              Variable(actions)).data
optim_batch_size = 64

# load trajectory
expert_traj, running_state = pickle.load(open(args.expert_traj_path, "rb"))


def expert_reward(state, action):
    state_action = Tensor(np.hstack([state, action]))
    return -math.log(
        discrim_net(Variable(state_action, volatile=True)).data.numpy()[0])


"""create agent"""
agent = Agent(env_factory, policy_net, custom_reward=expert_reward,
              running_state=running_state, render=args.render,
              num_threads=args.num_threads)


def update_params(batch, i_iter):
    states = Tensor(batch.state)
    actions = ActionTensor(batch.action)
    rewards = Tensor(batch.reward)
    masks = Tensor(batch.mask)
    values = value_net(Variable(states, volatile=True)).data
    fixed_log_probs = policy_net.get_log_prob(Variable(states, volatile=True),
                                              Variable(actions)).data

    """get advantage estimation from the trajectories"""
    advantages, returns = estimate_advantages(rewards, masks, values,
                                              args.gamma, args.tau, Tensor)
random.seed(args.seed)

state_dim = env.observation_space.shape[0]
action_dim = env.action_space.n

# load trajectory
args.expert_traj_path = "assets/expert_traj/{}_ppo_0.p".format(args.env_name)
expert_trajs, _, _ = pickle.load(open(args.expert_traj_path, "rb"))

imitator = GAILAtari(args, state_dim, action_dim)
imitator.set_expert(expert_trajs, args.num_trajs)

# create agent
agent = Agent(env, imitator.policy.actor, args.device,
              custom_reward=imitator.expert_reward,
              render=args.render, num_threads=args.num_threads)

print("=======================================")
print("Task: " + args.env_name)
print("Settings: " + args.expert_traj_path)
print("Action Dimension: " + str(action_dim))
print("---------------------------------------")
print("Device: " + str(args.device))
print("Seed: " + str(args.seed))
print("Number of Expert Trajectories: " + str(args.num_trajs))
print("---------------------------------------")
print("Training Behavior Cloning Imitator {} Epochs".format(args.max_iter_num))