def __init__(self, task):
    self.task = task
    self.state_size = task.state_size
    self.action_size = task.action_size
    self.action_low = task.action_low
    self.action_high = task.action_high

    # Actor (Policy) Model
    self.actor_local = Actor(self.state_size, self.action_size,
                             self.action_low, self.action_high)
    self.actor_target = Actor(self.state_size, self.action_size,
                              self.action_low, self.action_high)

    # Critic (Value) Model
    self.critic_local = Critic(self.state_size, self.action_size)
    self.critic_target = Critic(self.state_size, self.action_size)

    # Initialize target model parameters with local model parameters
    self.critic_target.model.set_weights(self.critic_local.model.get_weights())
    self.actor_target.model.set_weights(self.actor_local.model.get_weights())

    # Noise process
    self.exploration_mu = 0
    self.exploration_theta = 0.15
    self.exploration_sigma = 0.2
    self.noise = OUNoise(self.action_size, self.exploration_mu,
                         self.exploration_theta, self.exploration_sigma)

    # Replay memory
    self.buffer_size = 1000000
    self.batch_size = 64
    self.memory = ReplayBuffer(self.buffer_size, self.batch_size)

    # Algorithm parameters
    self.gamma = 0.99  # discount factor
    self.tau = 0.01    # for soft update of target parameters
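# The __init__ above wires an OUNoise(mu, theta, sigma) process into the agent
# for exploration. For reference, here is a minimal sketch of the
# Ornstein-Uhlenbeck process such a class commonly implements; the project's
# actual OUNoise may differ in details (e.g. a dt term or seeding).
import numpy as np

class OUNoiseSketch:
    def __init__(self, size, mu=0.0, theta=0.15, sigma=0.2):
        self.mu = mu * np.ones(size)
        self.theta = theta
        self.sigma = sigma
        self.reset()

    def reset(self):
        # restart the process at its mean
        self.state = np.copy(self.mu)

    def sample(self):
        # mean-reverting step plus Gaussian noise: dx = theta*(mu - x) + sigma*N(0, 1)
        dx = self.theta * (self.mu - self.state) \
             + self.sigma * np.random.standard_normal(len(self.state))
        self.state = self.state + dx
        return self.state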
def delete_actor(data):
    """
    Delete actor by id
    """
    # data = get_request_data()
    ### YOUR CODE HERE ###
    row_id = int(data['id'])  # delete the requested record, not a hard-coded id
    Actor.delete(row_id)

    # use this for 200 response code
    msg = 'Record successfully deleted'
    return make_response(jsonify(message=msg), 200)
def __init__(self, children):
    self.children = set()
    for child in children:
        if isinstance(child, Actor):
            self.children.add(child)
        elif isinstance(child, User):
            self.children.add(Actor.by_user(child))
        elif isinstance(child, Server):
            self.children.add(Actor.by_server(child))
        elif isinstance(child, ActorCollection):
            self.children |= child.children  # sets do not support +=; merge with |=
        else:
            raise TypeError("Don't know what to do with %s" % child.__class__)
def delete_actor():
    """
    Delete actor by id
    """
    data = get_request_data()
    if 'id' in data.keys():
        try:
            row_id = int(data['id'])
        except (TypeError, ValueError):
            err = 'Id must be integer'
            return make_response(jsonify(error=err), 400)

        Actor.delete(row_id)
        msg = 'Record successfully deleted'
        return make_response(jsonify(message=msg), 200)
    else:
        err = 'No id specified'
        return make_response(jsonify(error=err), 400)
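# A usage sketch for the handler above via Flask's test client. The app module
# and the '/api/actor' route are hypothetical, and get_request_data() is assumed
# to read ids from the query string or form data.
from app import app  # hypothetical application module

with app.test_client() as client:
    resp = client.delete('/api/actor', query_string={'id': 1})
    print(resp.status_code, resp.get_json())  # 200 {'message': 'Record successfully deleted'}

    resp = client.delete('/api/actor', query_string={'id': 'abc'})
    print(resp.status_code, resp.get_json())  # 400 {'error': 'Id must be integer'}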
def from_user(self, receivers=None, text=None, *_):
    if receivers is None:
        return ERR_NORECIPIENT(self.command, self.actor)
    if text is None:
        return ERR_NOTEXTTOSEND(self.actor)
    resp = []
    # TODO: check for ERR_TOOMANYTARGETS
    for receiver in receivers.split(','):
        if Channel.exists(receiver):
            users = [user for user in Channel.get(receiver).users
                     if user is not self.user]
            resp.append(M(ActorCollection(users),
                          self.command, str(receiver), text,
                          prefix=str(self.user)))
        elif User.exists(receiver):
            resp.append(M(Actor.by_user(User.get(receiver)),
                          self.command, str(receiver), text,
                          prefix=str(self.user)))
        # TODO: Implement wildcards
        # TODO: check for ERR_WILDTOPLEVEL, RPL_AWAY, ERR_NOTOPLEVEL
        else:
            resp.append(ERR_NOSUCHNICK(receiver, self.actor))
    return resp
def actor_clear_relations():
    """
    Clear all relations by id
    """
    data = get_request_data()
    if 'id' in data.keys():
        try:
            actor_id = int(data['id'])
        except (TypeError, ValueError):
            err = 'Id must be integer'
            return make_response(jsonify(error=err), 400)

        actor = Actor.clear_relations(actor_id)
        try:
            rel_actor = {k: v for k, v in actor.__dict__.items()
                         if k in ACTOR_FIELDS}
        except AttributeError:
            err = 'Record with such id does not exist'
            return make_response(jsonify(error=err), 400)

        rel_actor['filmography'] = str(actor.filmography)
        return make_response(jsonify(rel_actor), 200)
    else:
        err = 'No id specified'
        return make_response(jsonify(error=err), 400)
def actor_add_relation():
    """
    Add a movie to actor's filmography
    """
    data = get_request_data()

    VALID_KEYS = ['id', 'relation_id']
    for key in data:
        if key not in VALID_KEYS:
            err = 'Wrong key'
            return make_response(jsonify(error=err), 400)

    ### YOUR CODE HERE ###
    if data.get('id'):
        try:
            int(data['id'])
        except (TypeError, ValueError):
            err = 'Id must be integer'
            return make_response(jsonify(error=err), 400)

        obj = Actor.query.filter_by(id=data['id']).first()
        if obj is None:
            err = 'Actor with such id does not exist'
            return make_response(jsonify(error=err), 400)

        if data.get('relation_id'):
            try:
                int(data['relation_id'])
            except (TypeError, ValueError):
                err = 'Id must be integer'
                return make_response(jsonify(error=err), 400)

            related_movie = Movie.query.filter_by(id=data['relation_id']).first()
            if related_movie is None:
                err = 'Movie with such id does not exist'
                return make_response(jsonify(error=err), 400)
        else:
            err = 'No relation_id specified'
            return make_response(jsonify(error=err), 400)

        actor = Actor.add_relation(data['id'], related_movie)  # add relation here
        rel_actor = {k: v for k, v in actor.__dict__.items()
                     if k in ACTOR_FIELDS}
        rel_actor['filmography'] = str(actor.filmography)
        return make_response(jsonify(rel_actor), 200)
    else:
        err = 'No id specified'
        return make_response(jsonify(error=err), 400)
def add_actor():
    """
    Add new actor
    """
    data = get_request_data()

    for key in data:
        if key not in set(ACTOR_FIELDS):
            err = 'Wrong key'
            return make_response(jsonify(error=err), 400)

    # if 'name' in data.keys():
    #     if data['name'].isdigit():
    #         err = 'Name must be string'
    #         return make_response(jsonify(error=err), 400)
    #     if len(data['name']) > 50:
    #         err = 'Name must be less than 50 characters'
    #         return make_response(jsonify(error=err), 400)
    #     if not data['name']:
    #         err = 'Name cannot be null'
    #         return make_response(jsonify(error=err), 400)
    #
    # if 'gender' in data.keys():
    #     if data['gender'] not in ('male', 'female'):
    #         err = 'There are only two genders'
    #         return make_response(jsonify(error=err), 400)

    # check that the key exists before reading it to avoid a KeyError
    if 'date_of_birth' not in data.keys() or not data['date_of_birth']:
        err = 'enter date'
        return make_response(jsonify(error=err), 400)
    try:
        dt.strptime(data['date_of_birth'], DATE_FORMAT)
    except ValueError:
        err = 'incorrect date format(d.m.y)'
        return make_response(jsonify(error=err), 400)

    # act_dict = get_dict_of_actors()
    # for a in act_dict:
    #     if a['name'] == data['name']:
    #         err = 'actor with that name is already exist'
    #         return make_response(jsonify(error=err), 400)

    new_record = data
    new_record['date_of_birth'] = dt.strptime(data['date_of_birth'], DATE_FORMAT)
    new_actor = Actor.create(**new_record)
    new_record['id'] = new_actor.id
    return make_response(jsonify(new_record), 200)
def dispatch(self, socket, message):
    actor = Actor.by_socket(socket)
    message.target = config.get('server', 'servername')
    if message.command not in self.handlers:
        try:
            self.register(message.command)
        except ImportError as e:
            log.warning('Unknown command %s. Message was: %s. Error: %s'
                        % (message.command, repr(message), e))
            return
    return self.handlers[message.command].handle(actor, message)
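# dispatch() lazily imports a handler the first time a command is seen, which
# is why an unknown command surfaces as an ImportError. A sketch of what such a
# register() method might look like on the dispatcher class; the module layout
# ('commands.<name>' exposing a <Name>Command class) is an assumption, not the
# project's confirmed structure.
def register(self, command):
    module = __import__('commands.%s' % command.lower(),
                        fromlist=[command.lower()])
    handler_cls = getattr(module, '%sCommand' % command.capitalize())
    self.handlers[command] = handler_cls()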
def create_actors_acts_in_and_directors():
    movies = Movies.load_all()
    for movie in movies:
        moviee = tmdb.Movies(movie.id)
        moviee.credits()  # populates moviee.cast / moviee.crew
        for person in moviee.crew:
            if person['job'] == 'Director':
                if not Director.find_by_movie_id(movie.id):
                    director = Director(movie.id, person['name'],
                                        person['profile_path'])
                    director.save_to_db()
        for person in moviee.cast[:4]:
            if not Actor.load_by_id(person['id']):
                actor = Actor(person['id'], person['name'],
                              person['profile_path'])
                actor.save_to_db()
            # link the movie to the actor even when the actor already exists
            relation = Movie_Actors(movie.id, person['id'])
            relation.save_to_db()
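# The loader above relies on a tmdbsimple convention: calling credits() both
# returns the response dict and sets its keys ('cast', 'crew') as attributes on
# the Movies instance, which is why moviee.cast / moviee.crew work afterwards.
# A minimal standalone sketch (the API key is a placeholder):
import tmdbsimple as tmdb

tmdb.API_KEY = 'YOUR_API_KEY'
movie = tmdb.Movies(603)  # 603 = The Matrix
response = movie.credits()
print([p['name'] for p in movie.cast[:4]])  # same data as response['cast'][:4]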
def add_actor(data):
    """
    Add new actor
    """
    # data = get_request_data()
    ### YOUR CODE HERE ###

    # use this for 200 response code
    new_record = Actor.create(**data)
    new_actor = {k: v for k, v in new_record.__dict__.items()
                 if k in ACTOR_FIELDS}
    return make_response(jsonify(new_actor), 200)
def update_actor(data):
    """
    Update actor record by id
    """
    # data = get_request_data()
    ### YOUR CODE HERE ###

    # use this for 200 response code
    row_id = int(data.pop('id'))  # update the requested record, not a hard-coded id
    upd_record = Actor.update(row_id, **data)
    upd_actor = {k: v for k, v in upd_record.__dict__.items()
                 if k in ACTOR_FIELDS}
    return make_response(jsonify(upd_actor), 200)
def del_actor():
    data = get_request_data()
    if 'id' in data.keys():  # check the key exists before reading it
        try:
            row_id = int(data['id'])
        except (TypeError, ValueError):
            err = 'Id must be integer'
            return make_response(jsonify(error=err), 400)

        obj = Actor.query.filter_by(id=row_id).first()
        if obj is None:
            err = 'Record with such id does not exist'
            return make_response(jsonify(error=err), 400)

        Actor.delete(row_id)
        message = 'Record with id {} has been successfully deleted'.format(row_id)
        return make_response(jsonify(message), 200)
    else:
        err = 'No id specified'
        return make_response(jsonify(error=err), 400)
def __init__(self, env, hp):
    self.env = env
    self.hp = hp
    self.critic = Critic(env.observation_space.shape[0],
                         env.action_space.shape[0], hp)
    self.target_critic = Critic(env.observation_space.shape[0],
                                env.action_space.shape[0], hp)
    self.actor = Actor(env.observation_space.shape[0],
                       env.action_space.shape[0],
                       env.action_space.high[0], hp)
    self.target_actor = Actor(env.observation_space.shape[0],
                              env.action_space.shape[0],
                              env.action_space.high[0], hp)
    self.dataset = ReplayBuffer(self.hp['batch_size'],
                                self.hp['max_buffer_size'])
    self.noise = OrnsteinUhlenbeckProcess(env.action_space.shape[0],
                                          sigma=self.hp['noise_sigma'])
    self.noise.reset_states()
def add_actor():
    """
    Add new actor
    """
    data = get_request_data()
    ### YOUR CODE HERE ###

    # parse the date of birth and render it in RFC 1123 style
    cr_date = dt.strptime(data['date_of_birth'], '%d.%m.%Y')
    cr_date = cr_date.strftime('%a, %d %b %Y %H:%M:%S GMT')

    # use this for 200 response code
    new_record = Actor(name=data['name'], gender=data['gender'],
                       date_of_birth=cr_date)
    new_actor = {k: v for k, v in new_record.__dict__.items()
                 if k in ACTOR_FIELDS}
    return make_response(jsonify(new_actor), 200)
def from_user(self, receivers=None, text=None, *_):
    if receivers is None:
        return ERR_NORECIPIENT(self.command, self.actor)
    if text is None:
        return ERR_NOTEXTTOSEND(self.actor)
    resp = []
    # TODO: check for ERR_TOOMANYTARGETS
    for receiver in receivers.split(','):
        if Channel.exists(receiver):
            channel_log = '%s/%s.log' % (config.get('server', 'channel_log_dir'),
                                         receiver.replace('#', ''))
            with open(channel_log, 'a') as f:
                f.write('%s::%s::%s::%s\n' % (time.time(),
                                              time.strftime('%Y-%m-%d %H:%M:%S'),
                                              self.user.nickname, text))
            users = [user for user in Channel.get(receiver).users
                     if user is not self.user]
            resp.append(M(ActorCollection(users),
                          self.command, str(receiver), text,
                          prefix=str(self.user)))
        elif User.exists(receiver):
            resp.append(M(Actor.by_user(User.get(receiver)),
                          self.command, str(receiver), text,
                          prefix=str(self.user)))
        # TODO: Implement wildcards
        # TODO: check for ERR_WILDTOPLEVEL, RPL_AWAY, ERR_NOTOPLEVEL
        else:
            resp.append(ERR_NOSUCHNICK(receiver, self.actor))
    return resp
def update_actor():
    """
    Update actor record by id
    """
    data = get_request_data()
    if 'id' in data.keys():
        try:
            row_id = int(data['id'])
        except (TypeError, ValueError):
            err = 'Id must be integer'
            return make_response(jsonify(error=err), 400)
    else:
        err = 'No id specified'
        return make_response(jsonify(error=err), 400)

    keys = list(data.keys())
    keys.remove('id')
    for key in keys:
        if key == 'date_of_birth':
            try:
                data['date_of_birth'] = dt.strptime(data['date_of_birth'],
                                                    '%d.%m.%Y').date()
            except ValueError:
                err = 'Wrong data format'
                return make_response(jsonify(error=err), 400)
        elif key in ('name', 'gender'):
            continue
        else:
            err = 'Wrong keys'
            return make_response(jsonify(error=err), 400)

    try:
        upd_record = Actor.update(row_id, **data)
        upd_actor = {k: v for k, v in upd_record.__dict__.items()
                     if k in ACTOR_FIELDS}
        return make_response(jsonify(upd_actor), 200)
    except Exception:
        err = 'Record with such id does not exist'
        return make_response(jsonify(error=err), 400)
def actor_add_relation():
    """
    Add a movie to actor's filmography
    """
    data = get_request_data()
    if 'id' in data.keys():
        try:
            row_id = int(data['id'])
        except (TypeError, ValueError):
            err = 'Id must be integer'
            return make_response(jsonify(error=err), 400)

        try:
            relation_id = data['relation_id']
            obj_movie = Movie.query.filter_by(id=relation_id).first()
            actor = Actor.add_relation(row_id, obj_movie)
            rel_actor = {k: v for k, v in actor.__dict__.items()
                         if k in ACTOR_FIELDS}
            rel_actor['filmography'] = str(actor.filmography)
            return make_response(jsonify(rel_actor), 200)
        except Exception:
            err = 'Record with such id does not exist'
            return make_response(jsonify(error=err), 400)
    else:
        err = 'No id specified'
        return make_response(jsonify(error=err), 400)
def load(self, transformed_response):
    print("loading...\n")
    result = {}
    if transformed_response.get('Person'):
        self.entities += [Person().extract(transformed_response['Person'])]
    if transformed_response.get('Author'):
        self.entities += [Author().extract(transformed_response['Author'])]
    if transformed_response.get('Actor'):
        self.entities += [Actor().extract(transformed_response['Actor'])]
    if transformed_response.get('BusinessPerson'):
        self.entities += [BusinessPerson().extract(
            transformed_response['BusinessPerson'])]
    if transformed_response.get('League'):
        self.entities += [League().extract(transformed_response['League'])]
    if transformed_response.get('SportsTeam'):
        self.entities += [SportsTeam().extract(
            transformed_response['SportsTeam'])]
    if transformed_response.get('Description'):
        self.entities += [Description().extract(
            transformed_response['Description'])]

    header = str(self.query) + '('
    for entity in self.entities:
        if entity.__class__.__name__ != 'Description':
            header = header + ' ' + str(entity.__class__.__name__)
    header = header + ')'

    print('----------------------------------')
    print(header)
    print('----------------------------------')
    for entity in self.entities:
        entity.print_box()
        result[entity.__class__.__name__] = entity
    return result
def add_actor():
    """
    Add new actor
    """
    ### YOUR CODE HERE ###
    data = get_request_data()
    if 'name' not in data.keys():
        err = 'No name specified'
        return make_response(jsonify(error=err), 400)
    if 'date_of_birth' not in data.keys():
        err = 'No date_of_birth specified'
        return make_response(jsonify(error=err), 400)
    if 'gender' not in data.keys():
        err = 'No gender specified'
        return make_response(jsonify(error=err), 400)

    try:
        data['date_of_birth'] = dt.strptime(data['date_of_birth'],
                                            '%d.%m.%Y').date()
    except ValueError:
        err = 'Wrong data format'
        return make_response(jsonify(error=err), 400)

    if not data['gender'].isalpha():
        err = 'Wrong gender format'
        return make_response(jsonify(error=err), 400)

    new_record = Actor.create(**data)
    new_actor = {k: v for k, v in new_record.__dict__.items()
                 if k in ACTOR_FIELDS}
    return make_response(jsonify(new_actor), 200)
def actor_add_relation():
    """
    Add a movie to actor's filmography
    """
    data = get_request_data()
    if 'id' in data.keys():
        if 'relation_id' in data.keys():
            try:
                actor_id = int(data['id'])
            except (TypeError, ValueError):
                err = 'actor_id must be an integer'
                return make_response(jsonify(error=err), 400)
            try:
                row_m_id = int(data['relation_id'])
            except (TypeError, ValueError):
                err = 'movie_id must be integer'
                return make_response(jsonify(error=err), 400)

            movie = Movie.query.filter_by(id=row_m_id).first()
            actor = Actor.add_relation(actor_id, movie)
            try:
                rel_actor = {k: v for k, v in actor.__dict__.items()
                             if k in ACTOR_FIELDS}
            except AttributeError:
                err = 'Record with such id does not exist'
                return make_response(jsonify(error=err), 400)

            rel_actor['filmography'] = str(actor.filmography)
            return make_response(jsonify(rel_actor), 200)
        else:
            err = 'No relation_id specified'
            return make_response(jsonify(error=err), 400)
    else:
        err = 'No id specified'
        return make_response(jsonify(error=err), 400)
def actor_add_relation():
    """
    Add a movie to actor's filmography
    """
    # data = get_request_data()
    # the sample payload mixes actor and movie fields; a dict cannot hold two
    # 'name' keys, so split it into one record per model
    actor_data = {
        'name': 'Megan Fox',
        'gender': 'female',
        'date_of_birth': dt.strptime('16.05.1986', '%d.%m.%Y').date(),
    }
    movie_data = {
        'name': 'Transformers',
        'genre': 'action',
        'year': 2007,
    }
    ### YOUR CODE HERE ###
    actor = Actor.create(**actor_data)
    movie = Movie.create(**movie_data)

    # use this for 200 response code
    actor = Actor.add_relation(actor.id, movie)  # add relation here
    rel_actor = {k: v for k, v in actor.__dict__.items()
                 if k in ACTOR_FIELDS}
    rel_actor['filmography'] = str(actor.filmography)
    return make_response(jsonify(rel_actor), 200)
hyper_ps = {
    # (earlier hyper-parameters elided)
    'critic_threshold': 17.5,
    'critic_suffices_required': 1,
    'critic_steps_start': 200,
    'critic_steps_end': 200,
    'actor_steps_start': 1000,
    'actor_steps_end': 1000,
    'batch_size': 256,
    'seed': 123456,
    'replay_fill_threshold': 1.,
    'random_exploration': True,
    'test_iterations': 30,
    'validation_epoch_mod': 3,
}

# configuring the environment
environment = gym.make('Humanoid-v3')
# environment._max_episode_steps = 600

# setting up the training components
agent = AWRAgent  # the agent class (not an instance) is passed to Training.train
actor = Actor()
critic = Critic()

# training and testing
Training.train(
    (actor, critic),
    agent,
    environment,
    hyper_ps,
    save=True,
    debug_type=DebugType.NONE,
)
class DDPG():
    """Reinforcement Learning agent that learns using DDPG."""

    def __init__(self, task):
        self.task = task
        self.state_size = task.state_size
        self.action_size = task.action_size
        self.action_low = task.action_low
        self.action_high = task.action_high

        # Actor (Policy) Model
        self.actor_local = Actor(self.state_size, self.action_size,
                                 self.action_low, self.action_high)
        self.actor_target = Actor(self.state_size, self.action_size,
                                  self.action_low, self.action_high)

        # Critic (Value) Model
        self.critic_local = Critic(self.state_size, self.action_size)
        self.critic_target = Critic(self.state_size, self.action_size)

        # Initialize target model parameters with local model parameters
        self.critic_target.model.set_weights(self.critic_local.model.get_weights())
        self.actor_target.model.set_weights(self.actor_local.model.get_weights())

        # Noise process
        self.exploration_mu = 0
        self.exploration_theta = 0.15
        self.exploration_sigma = 0.2
        self.noise = OUNoise(self.action_size, self.exploration_mu,
                             self.exploration_theta, self.exploration_sigma)

        # Replay memory
        self.buffer_size = 1000000
        self.batch_size = 64
        self.memory = ReplayBuffer(self.buffer_size, self.batch_size)

        # Algorithm parameters
        self.gamma = 0.99  # discount factor
        self.tau = 0.001   # for soft update of target parameters

        self.score = 0
        self.best_score = -np.inf

    def reset_episode(self):
        self.noise.reset()
        state = self.task.reset()
        self.last_state = state
        return state

    def step(self, action, reward, next_state, done):
        if done:
            reward = self.eval_episode(reward)
        self.add_score(reward)

        # Save experience / reward
        self.memory.add(self.last_state, action, reward, next_state, done)

        # Learn, if enough samples are available in memory
        if len(self.memory) > self.batch_size:
            experiences = self.memory.sample()
            self.learn(experiences)

        # Roll over last state and action
        self.last_state = next_state

    def add_score(self, reward):
        self.score += reward
        if self.best_score < self.score:
            self.best_score = self.score

    def reset_score(self):
        self.score = 0

    def acceptable_episode(self):
        # print(self.task.sim.pose[:3] - self.task.target_pos)
        print(np.linalg.norm(self.task.sim.pose[:3] - self.task.target_pos))

    def eval_episode(self, episode_reward):
        x = self.task.sim.pose[0]
        y = self.task.sim.pose[1]
        z = self.task.sim.pose[2]
        if z <= 0:
            episode_reward -= 35
        elif z >= 145:
            episode_reward -= 25
        if (90 <= z <= 110) and (-20 <= x <= 20) and (-20 <= y <= 20):
            episode_reward += 40
        elif (65 <= z < 130) and (-50 <= x <= 50) and (-50 <= y <= 50):
            episode_reward += 50
        return episode_reward

    def act(self, state):
        """Returns actions for given state(s) as per current policy."""
        state = np.reshape(state, [-1, self.state_size])
        action = self.actor_local.model.predict(state)[0]
        return list(action + self.noise.sample())  # add some noise for exploration

    def learn(self, experiences):
        """Update policy and value parameters using given batch of experience tuples."""
        # Convert experience tuples to separate arrays for each element
        # (states, actions, rewards, etc.)
        states = np.vstack([e.state for e in experiences if e is not None])
        actions = np.array([e.action for e in experiences if e is not None]
                           ).astype(np.float32).reshape(-1, self.action_size)
        rewards = np.array([e.reward for e in experiences if e is not None]
                           ).astype(np.float32).reshape(-1, 1)
        dones = np.array([e.done for e in experiences if e is not None]
                         ).astype(np.uint8).reshape(-1, 1)
        next_states = np.vstack([e.next_state for e in experiences if e is not None])

        # Get predicted next-state actions and Q values from target models
        #     Q_targets_next = critic_target(next_state, actor_target(next_state))
        actions_next = self.actor_target.model.predict_on_batch(next_states)
        Q_targets_next = self.critic_target.model.predict_on_batch(
            [next_states, actions_next])

        # Compute Q targets for current states and train critic model (local)
        Q_targets = rewards + self.gamma * Q_targets_next * (1 - dones)
        self.critic_local.model.train_on_batch(x=[states, actions], y=Q_targets)

        # Train actor model (local)
        action_gradients = np.reshape(
            self.critic_local.get_action_gradients([states, actions, 0]),
            (-1, self.action_size))
        self.actor_local.train_fn([states, action_gradients, 1])  # custom training function

        # Soft-update target models
        self.soft_update(self.critic_local.model, self.critic_target.model)
        self.soft_update(self.actor_local.model, self.actor_target.model)

    def soft_update(self, local_model, target_model):
        """Soft update model parameters."""
        local_weights = np.array(local_model.get_weights())
        target_weights = np.array(target_model.get_weights())

        assert len(local_weights) == len(target_weights), \
            "Local and target model parameters must have the same size"

        new_weights = self.tau * local_weights + (1 - self.tau) * target_weights
        target_model.set_weights(new_weights)
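# A minimal episode-loop sketch driving the DDPG agent above. The task is
# assumed to follow the interface the class already uses (reset() plus a
# step(action) returning (next_state, reward, done)); the episode count is
# illustrative, not a tuned value.
agent = DDPG(task)
for episode in range(1, 1001):
    state = agent.reset_episode()
    agent.reset_score()
    while True:
        action = agent.act(state)
        next_state, reward, done = task.step(action)
        agent.step(action, reward, next_state, done)
        state = next_state
        if done:
            break
    print('episode {:4d}  score {:8.2f}  best {:8.2f}'.format(
        episode, agent.score, agent.best_score))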
def main():
    env = DialogEnvironment()
    experiment_name = args.logdir.split('/')[1]  # model name

    torch.manual_seed(args.seed)

    # TODO
    actor = Actor(hidden_size=args.hidden_size, num_layers=args.num_layers,
                  device='cuda', input_size=args.input_size,
                  output_size=args.input_size)
    actor.to(device)

    actor_optim = optim.Adam(actor.parameters(), lr=args.learning_rate)

    # load demonstrations
    writer = SummaryWriter(args.logdir)

    if args.load_model is not None:  # TODO
        saved_ckpt_path = os.path.join(os.getcwd(), 'save_model',
                                       str(args.load_model))
        ckpt = torch.load(saved_ckpt_path)
        actor.load_state_dict(ckpt['actor'])

    episodes = 0
    for iter in range(args.max_iter_num):
        # collect a batch of (state, expert action) pairs from the environment
        states = []
        expert_actions = []
        for _ in range(args.batch_size):
            state, expert_action, raw_state, raw_expert_action = env.reset()
            state = state[:args.seq_len, :].to(device)
            expert_action = expert_action[:args.seq_len, :].to(device)
            states.append(state)
            expert_actions.append(expert_action)
            episodes += 1

        states = torch.stack(states)
        expert_actions = torch.stack(expert_actions)

        # behavioural cloning: regress the actor's outputs onto the expert
        # actions; this is basically all we need to do
        actor.train()
        actions_pred, _ = actor(states)
        loss = F.mse_loss(actions_pred, expert_actions)
        actor_optim.zero_grad()
        loss.backward()
        actor_optim.step()

        with torch.no_grad():
            similarity_scores = [
                get_cosine_sim(expert=expert, action=action, seq_len=5)
                for expert, action in zip(expert_actions, actions_pred)
            ]
        similarity_score_avg = np.mean(similarity_scores)
        print('{}:: {} episode similarity score is {:.2f}'.format(
            iter, episodes, similarity_score_avg))

        writer.add_scalar('log/loss', loss.item(), iter)
        writer.add_scalar('log/similarity_score', float(similarity_score_avg), iter)
        writer.add_text('log/raw_state', raw_state[0], iter)
        raw_action = get_raw_action(actions_pred[-1])  # TODO
        writer.add_text('log/raw_action', raw_action, iter)
        writer.add_text('log/raw_expert_action', raw_expert_action, iter)

        if iter % 100 == 0:
            # append a sample to the experiment log
            with open(experiment_name + '.txt', 'a') as file_object:
                result_str = (str(iter) + '|' + raw_state[0] + '|' +
                              raw_action + '|' + raw_expert_action + '\n')
                file_object.write(result_str)

            model_path = os.path.join(os.getcwd(), 'save_model')
            if not os.path.isdir(model_path):
                os.makedirs(model_path)

            ckpt_path = os.path.join(
                model_path,
                experiment_name + '_ckpt_' + str(float(similarity_score_avg)) + '.pth.tar')
            save_checkpoint({
                'actor': actor.state_dict(),
                'args': args,
                'score': float(similarity_score_avg),
            }, filename=ckpt_path)
def main():
    env = DialogEnvironment()
    experiment_name = args.logdir.split('/')[1]  # model name

    torch.manual_seed(args.seed)

    # TODO
    actor = Actor(hidden_size=args.hidden_size, num_layers=args.num_layers,
                  device='cuda', input_size=args.input_size,
                  output_size=args.input_size)
    critic = Critic(hidden_size=args.hidden_size, num_layers=args.num_layers,
                    input_size=args.input_size, seq_len=args.seq_len)
    discrim = Discriminator(hidden_size=args.hidden_size, num_layers=args.num_layers,
                            input_size=args.input_size, seq_len=args.seq_len)

    actor.to(device), critic.to(device), discrim.to(device)

    actor_optim = optim.Adam(actor.parameters(), lr=args.learning_rate)
    critic_optim = optim.Adam(critic.parameters(), lr=args.learning_rate,
                              weight_decay=args.l2_rate)
    discrim_optim = optim.Adam(discrim.parameters(), lr=args.learning_rate)

    # load demonstrations
    writer = SummaryWriter(args.logdir)

    if args.load_model is not None:  # TODO
        saved_ckpt_path = os.path.join(os.getcwd(), 'save_model',
                                       str(args.load_model))
        ckpt = torch.load(saved_ckpt_path)

        actor.load_state_dict(ckpt['actor'])
        critic.load_state_dict(ckpt['critic'])
        discrim.load_state_dict(ckpt['discrim'])

    episodes = 0
    train_discrim_flag = True

    for iter in range(args.max_iter_num):
        actor.eval(), critic.eval()
        memory = deque()

        steps = 0
        scores = []
        similarity_scores = []

        while steps < args.total_sample_size:
            state, expert_action, raw_state, raw_expert_action = env.reset()
            score = 0
            similarity_score = 0

            state = state[:args.seq_len, :].to(device)
            expert_action = expert_action[:args.seq_len, :].to(device)

            for _ in range(10000):
                steps += 1

                mu, std = actor(state.view(1, args.seq_len, args.input_size))
                action = get_action(mu.cpu(), std.cpu())[0]

                for i in range(5):
                    emb_sum = expert_action[i, :].sum().cpu().item()
                    if emb_sum == 0:
                        action[i:, :] = 0  # manual padding
                        break

                done = env.step(action)
                irl_reward = get_reward(discrim, state, action, args)
                mask = 0 if done else 1

                memory.append([state, torch.from_numpy(action).to(device),
                               irl_reward, mask, expert_action])

                score += irl_reward
                similarity_score += get_cosine_sim(expert=expert_action,
                                                   action=action.squeeze(),
                                                   seq_len=5)
                if done:
                    break

            episodes += 1
            scores.append(score)
            similarity_scores.append(similarity_score)

        score_avg = np.mean(scores)
        similarity_score_avg = np.mean(similarity_scores)
        print('{}:: {} episode score is {:.2f}'.format(iter, episodes, score_avg))
        print('{}:: {} episode similarity score is {:.2f}'.format(
            iter, episodes, similarity_score_avg))

        actor.train(), critic.train(), discrim.train()
        if train_discrim_flag:
            expert_acc, learner_acc = train_discrim(discrim, memory,
                                                    discrim_optim, args)
            print("Expert: %.2f%% | Learner: %.2f%%"
                  % (expert_acc * 100, learner_acc * 100))
            writer.add_scalar('log/expert_acc', float(expert_acc), iter)
            writer.add_scalar('log/learner_acc', float(learner_acc), iter)
            writer.add_scalar('log/avg_acc', float(learner_acc + expert_acc) / 2, iter)
            if args.suspend_accu_exp is not None:  # only if not None do we check
                if expert_acc > args.suspend_accu_exp and learner_acc > args.suspend_accu_gen:
                    train_discrim_flag = False

        train_actor_critic(actor, critic, memory, actor_optim, critic_optim, args)

        writer.add_scalar('log/score', float(score_avg), iter)
        writer.add_scalar('log/similarity_score', float(similarity_score_avg), iter)
        writer.add_text('log/raw_state', raw_state[0], iter)
        raw_action = get_raw_action(action)  # TODO
        writer.add_text('log/raw_action', raw_action, iter)
        writer.add_text('log/raw_expert_action', raw_expert_action, iter)

        if iter % 100 == 0:
            score_avg = int(score_avg)

            # append a sample to the experiment log
            with open(experiment_name + '.txt', 'a') as file_object:
                result_str = (str(iter) + '|' + raw_state[0] + '|' +
                              raw_action + '|' + raw_expert_action + '\n')
                file_object.write(result_str)

            model_path = os.path.join(os.getcwd(), 'save_model')
            if not os.path.isdir(model_path):
                os.makedirs(model_path)

            ckpt_path = os.path.join(
                model_path,
                experiment_name + '_ckpt_' + str(score_avg) + '.pth.tar')
            save_checkpoint({
                'actor': actor.state_dict(),
                'critic': critic.state_dict(),
                'discrim': discrim.state_dict(),
                'args': args,
                'score': score_avg,
            }, filename=ckpt_path)
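# A hedged sketch of the discriminator update that train_discrim above is
# assumed to perform in this GAIL-style setup: binary cross-entropy pushing
# D toward 1 on learner rollouts and 0 on expert actions. The tuple indices
# match the memory layout used above; the real labels and shapes may differ.
import torch

def train_discrim_sketch(discrim, memory, discrim_optim, args):
    states = torch.stack([sample[0] for sample in memory])
    actions = torch.stack([sample[1] for sample in memory])
    expert_actions = torch.stack([sample[4] for sample in memory])

    learner = discrim(torch.cat([states, actions], dim=-1))
    expert = discrim(torch.cat([states, expert_actions], dim=-1))

    criterion = torch.nn.BCELoss()
    loss = criterion(learner, torch.ones_like(learner)) + \
           criterion(expert, torch.zeros_like(expert))
    discrim_optim.zero_grad()
    loss.backward()
    discrim_optim.step()

    # accuracy: how often D labels expert data as expert and learner data as learner
    expert_acc = (expert < 0.5).float().mean().item()
    learner_acc = (learner > 0.5).float().mean().item()
    return expert_acc, learner_acc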
class DDPGAgent:
    """Interacts with and learns from the environment."""

    def __init__(self, state_size, action_size, random_seed):
        """Initialize an Agent object.

        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            random_seed (int): random seed
        """
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(random_seed)
        np.random.seed(random_seed)  # set the numpy seed

        # Actor Network (w/ Target Network)
        self.actor_local = Actor(state_size, action_size, random_seed).to(device)
        self.actor_target = Actor(state_size, action_size, random_seed).to(device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=LR_ACTOR)

        # Critic Network (w/ Target Network)
        self.critic_local = Critic(state_size, action_size, random_seed).to(device)
        self.critic_target = Critic(state_size, action_size, random_seed).to(device)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(),
                                           lr=LR_CRITIC, weight_decay=WEIGHT_DECAY)

        # Replay memory
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE,
                                   random_seed, device)

        # add OU noise for exploration
        self.noise = OUNoise(action_size, scale=1.0, sigma=.1)

    def reset(self):
        self.noise.reset()

    def step(self, states, actions, rewards, next_states, dones, time_step):
        """Save experience in replay memory, and use random sample from buffer to learn."""
        # Save experience / reward (for each agent)
        for state, action, reward, next_state, done in zip(states, actions, rewards,
                                                           next_states, dones):
            self.memory.add(state, action, reward, next_state, done)

        # Learn, if enough samples are available in memory and every 20 steps
        if len(self.memory) > BATCH_SIZE and time_step % LEARN_STEPS == 0:
            for _ in range(N_UPDATES):  # generate n experiences and realize n updates
                experiences = self.memory.sample()
                self.learn(experiences, GAMMA)

    def act(self, states, epsilon=0.0, add_noise=True):
        """Returns actions for given state as per current policy."""
        states = torch.from_numpy(states).float().to(device)
        self.actor_local.eval()
        with torch.no_grad():
            actions = self.actor_local(states).cpu().data.numpy()
        self.actor_local.train()
        if add_noise:
            # add a noise (based on normal distribution) to exploration
            actions += self.noise.noise() * epsilon
        return np.clip(actions, -1, 1)

    def learn(self, experiences, gamma):
        """Update policy and value parameters using given batch of experience tuples.
        Q_targets = r + γ * critic_target(next_state, actor_target(next_state))
        where:
            actor_target(state) -> action
            critic_target(state, action) -> Q-value

        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples
            gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones = experiences

        self.__update_critic_local(actions, dones, gamma, next_states, rewards, states)
        self.__update_actor_local(states)

        # ----------------------- update target networks ----------------------- #
        self.soft_update(self.critic_local, self.critic_target, TAU)
        self.soft_update(self.actor_local, self.actor_target, TAU)

    def __update_critic_local(self, actions, dones, gamma, next_states, rewards, states):
        # Get predicted next-state actions and Q values from target models
        actions_next = self.actor_target(next_states)
        Q_targets_next = self.critic_target(next_states, actions_next)
        # Compute Q targets for current states (y_i)
        Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))
        # Compute critic loss
        Q_expected = self.critic_local(states, actions)
        critic_loss = F.mse_loss(Q_expected, Q_targets)
        # Minimize the loss
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        torch.nn.utils.clip_grad_norm_(self.critic_local.parameters(), 1)
        self.critic_optimizer.step()

    def __update_actor_local(self, states):
        # Compute actor loss
        actions_pred = self.actor_local(states)
        actor_loss = -self.critic_local(states, actions_pred).mean()
        # Minimize the loss
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model: PyTorch model (weights will be copied from)
            target_model: PyTorch model (weights will be copied to)
            tau (float): interpolation parameter
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)

    def network_summary(self):
        print('- Actor Summary (both local and target): ')
        self.actor_local.to(device).summary()
        print('- Critic Summary (both local and target): ')
        self.critic_local.to(device).summary()  # was actor_local: summarize the critic here

    def save(self, checkpoint_actor_name='checkpoint_actor',
             checkpoint_critic_name='checkpoint_critic'):
        """Save the actor and critic network weights"""
        torch.save(self.actor_local.state_dict(),
                   path_result_folder(f'{checkpoint_actor_name}.pth'))
        torch.save(self.critic_local.state_dict(),
                   path_result_folder(f'{checkpoint_critic_name}.pth'))

    @staticmethod
    def load(env: UnityEnvironment, random_seed=0,
             checkpoint_actor_name='checkpoint_actor',
             checkpoint_critic_name='checkpoint_critic'):
        """Load the actor and critic network weights"""
        # get the default brain
        brain_name = env.brain_names[0]
        brain = env.brains[brain_name]
        env_info = env.reset(train_mode=True)[brain_name]

        state_size = len(env_info.vector_observations[0])
        action_size = brain.vector_action_space_size

        loaded_agent = DDPGAgent(state_size, action_size, random_seed)
        loaded_agent.actor_local.load_state_dict(
            torch.load(path_result_folder(f'{checkpoint_actor_name}.pth')))
        loaded_agent.critic_local.load_state_dict(
            torch.load(path_result_folder(f'{checkpoint_critic_name}.pth')))
        return loaded_agent
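# An episode-loop sketch for DDPGAgent with a Unity ML-Agents environment,
# following the brain conventions DDPGAgent.load already uses. env, brain_name,
# state_size and action_size are assumed to be set up as in load(); the episode
# and step counts are illustrative.
agent = DDPGAgent(state_size, action_size, random_seed=0)
for episode in range(200):
    env_info = env.reset(train_mode=True)[brain_name]
    states = env_info.vector_observations
    agent.reset()
    for t in range(1000):
        actions = agent.act(states, epsilon=1.0)
        env_info = env.step(actions)[brain_name]
        next_states = env_info.vector_observations
        rewards = env_info.rewards
        dones = env_info.local_done
        agent.step(states, actions, rewards, next_states, dones, t)
        states = next_states
        if np.any(dones):
            break
agent.save()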
class Agent():
    """ Interacts with and learn from the environment """

    def __init__(self, state_size, action_size, random_seed,
                 actor_layers, critic_layers):
        """ Initialize an Agent object.

        Params
        ======
            state_size (int): size of the environment state
            action_size (int): size of the environment action
            random_seed (int): seed for the random
            actor_layers (array[int]): array containing the size of each layer of the actor network
            critic_layers (array[int]): array containing the size of each layer of the critic network
        """
        self.state_size = state_size
        self.action_size = action_size
        self.random_seed = random_seed
        random.seed(random_seed)
        np.random.seed(random_seed)

        # Actor
        print(f'Agent running on {DEVICE}')
        self.actor_local = Actor(self.state_size, self.action_size,
                                 self.random_seed, *actor_layers).to(DEVICE)
        self.actor_target = Actor(self.state_size, self.action_size,
                                  self.random_seed, *actor_layers).to(DEVICE)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=LR_ACTOR)

        # Critic
        self.critic_local = Critic(self.state_size, self.action_size,
                                   self.random_seed, *critic_layers).to(DEVICE)
        self.critic_target = Critic(self.state_size, self.action_size,
                                    self.random_seed, *critic_layers).to(DEVICE)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(),
                                           lr=LR_CRITIC, weight_decay=WEIGHT_DECAY)

        # Noise
        self.noise = OrsnteinUhlenbeck(self.action_size, self.random_seed)

        # Replay Buffer
        self.memory = ReplayBuffer(self.action_size, BUFFER_SIZE,
                                   BATCH_SIZE, self.random_seed)

    def step(self, states, actions, rewards, next_states, dones, time_step):
        """ Save experience in replay memory, and use random sample from buffer to learn """
        for state, action, reward, next_state, done in zip(states, actions, rewards,
                                                           next_states, dones):
            self.memory.add(state, action, reward, next_state, done)

        # Learn only if there is enough samples on memory
        if len(self.memory) > BATCH_SIZE and time_step % LEARN_STEPS == 0:
            for _ in range(N_UPDATES):
                experiences = self.memory.sample()
                self.learn(experiences, GAMMA)

    def act(self, state, add_noise=True, epsilon=1.0):
        """ Returns actions for given state as per current policy """
        state = torch.from_numpy(state).float().to(DEVICE)
        self.actor_local.eval()
        with torch.no_grad():
            actions = self.actor_local(state).cpu().data.numpy()
        self.actor_local.train()
        if add_noise:
            # actions += self.noise.sample()
            actions += np.random.normal(0, .3) * epsilon
        return np.clip(actions, -1, 1)

    def reset(self):
        self.noise.reset()

    def learn(self, experiences, gamma):
        """ Update policy and value parameters using given batch of experience tuples
        Q_targets = r + γ * critic_target(next_state, actor_target(next_state))
        where:
            actor_target(state) -> action
            critic_target(state, action) -> Q-value

        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples
            gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones = experiences

        # Critic update
        actions_next = self.actor_target(next_states)
        q_targets_next = self.critic_target(next_states, actions_next)
        Q_targets = rewards + (gamma * q_targets_next * (1 - dones))
        Q_expected = self.critic_local(states, actions)
        critic_loss = F.mse_loss(Q_expected, Q_targets)
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        torch.nn.utils.clip_grad_norm_(self.critic_local.parameters(), 1)
        self.critic_optimizer.step()

        # Actor update
        actions_pred = self.actor_local(states)
        actor_loss = -self.critic_local(states, actions_pred).mean()
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()
        # Update weights
        self.soft_update(self.actor_local, self.actor_target, TAU)
        self.soft_update(self.critic_local, self.critic_target, TAU)

    def soft_update(self, local_model, target_model, tau):
        """ Soft update model parameters
        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model (PyTorch model): weights will be copied from
            target_model (PyTorch model): weights will be copied to
            tau (float): interpolation parameter
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1 - tau) * target_param.data)
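# Quick numeric check of the soft-update rule θ_target = τ*θ_local + (1 - τ)*θ_target
# on two toy linear layers (illustrative only; not part of the agent above).
import torch
import torch.nn as nn

local, target = nn.Linear(2, 2), nn.Linear(2, 2)
nn.init.constant_(local.weight, 1.0)
nn.init.constant_(target.weight, 0.0)

tau = 0.01
for target_param, local_param in zip(target.parameters(), local.parameters()):
    target_param.data.copy_(tau * local_param.data + (1 - tau) * target_param.data)

print(target.weight.data[0, 0].item())  # 0.01: target moved 1% of the way toward local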