def __init__(self, controller, comm_disabled=False, iteration=1000, time=0.1,
             name='unnamed_trial', preset='M&N, 2003', env_height=270, env_width=270,
             targets=True, verbose=False):
    """Initialize a trial."""
    super().__init__()
    self.name = name                    # name for the trial
    self.preset = preset                # preset name
    self.iteration = iteration          # total number of iterations
    self.step_time = time               # time for each iteration step in seconds
    self.step_fitness = []              # fitness at each timestep
    self.fitness = 0                    # total fitness
    self.comm_disabled = comm_disabled
    self.verbose = verbose

    # initialize environment
    self.env = environment(width=env_width, height=env_height, targets=targets)

    # the same ann used for this trial
    self.ann = deepcopy(controller)

    # initialize agents for the trial
    self.env.agents = [
        agent(name=self.name + 'agent0', color='red'),
        agent(name=self.name + 'agent1', color='orange'),
        agent(name=self.name + 'agent2', color='cyan'),
        agent(name=self.name + 'agent3', color='green')
    ]

def run_experiments(env_info, agent_info, num_episodes=500, experiment_name=None):
    env = environment()
    age = agent()
    env.env_init(env_info)
    age.agent_init(agent_info)

    # run the episodes: the environment supplies states and rewards,
    # the agent supplies actions until a terminal state is reached
    for i in range(num_episodes):
        terminal = False
        last_state = env.env_start()
        last_action = age.agent_start(last_state)
        total_reward = 0
        while not terminal:
            (reward, last_state, terminal) = env.env_step(last_action)
            total_reward += reward
            if terminal:
                age.agent_end(reward)
            else:
                last_action = age.agent_step(reward, last_state)

    # print the learned state values as a height x width grid
    values = age.agent_values()
    print("VALUE FUNCTION", end="\n\n")
    if experiment_name is not None:
        print(experiment_name)
    for i in range(env_info.get("height", 4)):
        for j in range(env_info.get("width", 12)):
            print("%7.2f" % values[i * env_info.get("width", 12) + j], end=' ')
        print()

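# A minimal usage sketch for run_experiments, assuming the standard env_init/agent_init
# dictionary interface. Only "height" and "width" are implied by the defaults read above
# (4 x 12 suggests a cliff-walking style grid); the agent_info keys are hypothetical.
if __name__ == '__main__':
    env_info = {"height": 4, "width": 12}
    agent_info = {"discount": 1.0, "step_size": 0.5, "epsilon": 0.1}   # hypothetical keys
    run_experiments(env_info, agent_info, num_episodes=500,
                    experiment_name="TD agent on a 4x12 grid")
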
import gym
from Agent import agent
from DQN import DQN

if __name__ == '__main__':
    mainAgent = agent(18)
    targetAgent = agent(18)
    dqn = DQN(mainAgent, targetAgent, 0.99, 0.1, 200, 32, 64, True)
    dqn.train('Boxing-v0', True)

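# A quick sanity check of the action-space size assumed above (needs the gym Atari
# extras installed): Boxing-v0 exposes a Discrete(18) action space, which is why the
# main and target agents are built with 18 outputs.
import gym
print(gym.make('Boxing-v0').action_space)   # Discrete(18)
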
import gym
from Agent import agent
from DQN import DQN

if __name__ == '__main__':
    mainAgent = agent(4)
    targetAgent = agent(4)
    dqn = DQN(mainAgent, targetAgent, 0.99, 0.01, 2000, 120, 400, True)
    dqn.train('CarRacing-v0', True)

from Environment import Grid
from Cell import Cell
from Agent import agent
import pygame, time

dim = 15
size = width, height = 900, 900
screen = pygame.display.set_mode(size)
environment = Grid(dim)
agent = agent(environment)
(x, y) = (agent.agent_x, agent.agent_y)

# terrain type constants
FLAT = 0.1
HILL = 0.3
FOREST = 0.7
CAVE = 0.9

def update_ui():
    for i in range(dim):
        for j in range(dim):
            # print(i, j, (i*(width/dim), j*(height/dim)))
            if environment.field[i][j].terrain_type == FLAT:
                # if flat, draw a white square
                pygame.draw.rect(screen, (255, 255, 255),
                                 (j * (width / dim), i * (height / dim),
                                  width / dim, height / dim))
            elif environment.field[i][j].terrain_type == HILL:
                # if hill, draw a light green square
                pygame.draw.rect(screen, (130, 227, 2),
                                 (j * (width / dim), i * (height / dim),
                                  width / dim, height / dim))

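# A minimal pygame event loop sketch so the grid drawn by update_ui() is actually shown;
# the snippet above ends before any event or display handling, so this part is assumed.
running = True
while running:
    for event in pygame.event.get():
        if event.type == pygame.QUIT:
            running = False
    update_ui()
    pygame.display.flip()   # push the drawn squares to the window
    time.sleep(0.05)        # `time` is already imported above
pygame.quit()
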
def train(task_relation="<diedIn>", rootpath=None, epoch=5):
    datapath = {'type2id': rootpath + 'type2id.json',
                'relation2id': rootpath + 'relation2id.json',
                'graph': rootpath + 'graph.pkl',
                'ent2type': rootpath + 'ent2type.json',
                'entity2id': rootpath + 'entity2id.json'}
    Env_a = env(datapath)
    Env_a.init_relation_query_state(task_relation)
    batchsize = 20
    maxlen = 5
    po = Policy_memory(Env_a, 300, 100, Env_a.rel_num)

    # build the query pairs and split them into train / test / validation
    Env_a.filter_query(maxlen, 5000)
    pairs = Env_a.filter_query
    random.shuffle(pairs)
    training_pairs = pairs
    test_pairs = pairs[:int(len(pairs) * 0.5)]
    reward_record = []
    success_record = []
    path_length = 0
    valid_paris = pairs[int(len(pairs) * 0.5):int(len(pairs) * 0.6)]
    print('Train pairs:', len(training_pairs))
    print('valid pairs:', len(valid_paris))
    print('Test pairs:', len(test_pairs))

    agent_a = agent(po, Env_a, policymethod='GRU')
    if global_device == 'cuda:0':
        po = po.cuda()

    try_count, batch_loss, ave_reward, ave_success = 0, 0, 0, 0
    # optimise both the agent's and the environment's trainable parameters
    opt = torch.optim.Adam(list(agent_a.parameters()) + list(Env_a.parameters()), lr=0.001)

    for ep in range(epoch):
        opt.zero_grad()
        random.shuffle(training_pairs)
        for query in training_pairs:
            try:
                e1, e2 = query[0], query[1]
                e1, e2 = Env_a.entity2id[e1], Env_a.entity2id[e2]
                with torch.no_grad():
                    traj, success = agent_a.trajectory(e1, e2, max_len=maxlen)
                try_count += 1
            except KeyError:
                continue
            logger.MARK(Env_a.traj_for_showing(traj))

            # accumulate policy loss over the sampled trajectory
            traj_loss = 0
            po.zero_history()
            traj_reward = 0
            for i in traj:
                ave_reward += i[4]
                traj_reward += i[4]
                loss = agent_a.update_memory_policy(i)
                loss.backward()
                traj_loss += loss.cpu()
            if success:
                ave_success += 1
                path_length += len(traj) - 1
                success_record.append(1)
            else:
                success_record.append(0)
            reward_record.append(traj_reward)
            batch_loss += traj_loss / len(traj)

            # step the optimizer once per batch of trajectories
            if try_count % batchsize == 0 and try_count > 0:
                opt.step()
                opt.zero_grad()
                logger.info(
                    '|%d epoch|%d episode|Batch_loss:%.4f|Ave_reward:%.3f|Ave_success:%%%.2f|ave path length:%.2f|'
                    % (ep, try_count, batch_loss * 100 / batchsize, ave_reward / batchsize,
                       ave_success * 100 / batchsize,
                       path_length / max(ave_success, 1)))   # guard against division by zero
                batch_loss, ave_reward, ave_success, path_length = 0, 0, 0, 0
            if try_count % (20 * batchsize) == 0 and try_count > 0:
                valid(valid_paris, Env_a, agent_a, batchsize, maxlen)

    generate_paths(Env_a, agent_a, test_pairs, rootpath + task_relation + '.paths', maxlen)

    # smooth and dump the learning curves and the test split
    success = ave_smooth(success_record, 20)
    reward = ave_smooth(reward_record, 20)
    with open(rootpath + task_relation + 'sucess_record_without.txt', 'w') as fin:
        wstr = '\n'.join([str(i) for i in success])
        fin.write(wstr)
    with open(rootpath + task_relation + 'reward_record_without.txt', 'w') as fin:
        wstr = '\n'.join([str(i) for i in reward])
        fin.write(wstr)
    with open(rootpath + task_relation + 'test_positive_pairs', 'w') as fin:
        wstr = []
        for i in test_pairs:
            wstr.append(str(i[0]) + '\t' + str(i[1]))
        wstr = '\n'.join(wstr)
        fin.write(wstr)

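# `ave_smooth` is defined elsewhere in the project; a minimal sketch under the assumption
# that it smooths a record with a fixed-size moving average (window=20 in the call above),
# which is how the success/reward learning curves are written out.
def ave_smooth(record, window):
    smoothed = []
    for i in range(len(record)):
        lo = max(0, i - window + 1)      # average over at most the last `window` entries
        chunk = record[lo:i + 1]
        smoothed.append(sum(chunk) / float(len(chunk)))
    return smoothed
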
def train(task_relation="<diedIn>", rootpath=None, epoch=5):
    datapath = {'type2id': rootpath + 'type2id.json',
                'relation2id': rootpath + 'relation2id.json',
                'graph': rootpath + 'graph.pkl',
                'ent2type': rootpath + 'ent2type.json',
                'entity2id': rootpath + 'entity2id.json'}
    Env_a = env(datapath)
    Env_a.init_relation_query_state(task_relation)
    batchsize = 20
    maxlen = 5
    po = Policy_memory(Env_a, 300, 100, Env_a.rel_num)

    # this variant loads pre-built train/valid splits from disk instead of
    # filtering and splitting the query pairs on the fly
    # Env_a.filter_query(maxlen, 5000)
    # pairs = Env_a.filter_query
    # random.shuffle(pairs)
    # training_pairs = pairs
    # test_pairs = pairs[:int(len(pairs) * 0.5)]
    # valid_paris = pairs[int(len(pairs) * 0.5):int(len(pairs) * 0.6)]
    train_path = rootpath + '/' + task_relation + 'train_pairs'
    valid_path = rootpath + '/' + task_relation + 'valid_pairs'
    training_pairs = load_pair(train_path)
    valid_paris = load_pair(valid_path)
    # `test_pairs` was left undefined in this variant; assuming a *test_pairs file
    # saved alongside the train/valid splits (hypothetical filename)
    test_path = rootpath + '/' + task_relation + 'test_pairs'
    test_pairs = load_pair(test_path)
    print('Train pairs:', len(training_pairs))
    print('valid pairs:', len(valid_paris))
    # print('Test pairs:', len(test_pairs))

    agent_a = agent(po, Env_a, policymethod='GRU')
    if global_device == 'cuda:0':
        po = po.cuda()

    try_count, batch_loss, ave_reward, ave_success = 0, 0, 0, 0
    opt = torch.optim.Adam(list(agent_a.parameters()) + list(Env_a.parameters()), lr=0.001)

    for ep in range(epoch):
        opt.zero_grad()
        random.shuffle(training_pairs)
        for query in training_pairs:
            try:
                e1, e2 = query[0], query[1]
                e1, e2 = Env_a.entity2id[e1], Env_a.entity2id[e2]
                with torch.no_grad():
                    traj, success = agent_a.trajectory(e1, e2, max_len=maxlen)
                try_count += 1
            except KeyError:
                continue
            logger.MARK(Env_a.traj_for_showing(traj))

            traj_loss = 0
            po.zero_history()
            traj_reward = 0
            for i in traj:
                ave_reward += i[4]
                traj_reward += i[4]
                loss = agent_a.update_memory_policy(i)
                loss.backward()
                traj_loss += loss.cpu()
            if success:
                ave_success += 1
            batch_loss += traj_loss / len(traj)

            if try_count % batchsize == 0 and try_count > 0:
                opt.step()
                opt.zero_grad()
                logger.info('|%d epoch|%d episode|Batch_loss:%.4f|Ave_reward:%.3f|Ave_success:%%%.2f|'
                            % (ep, try_count, batch_loss * 100 / batchsize,
                               ave_reward / batchsize, ave_success * 100 / batchsize))
                batch_loss, ave_reward, ave_success = 0, 0, 0
            if try_count % (20 * batchsize) == 0 and try_count > 0:
                valid(valid_paris, Env_a, agent_a, batchsize, maxlen)

    generate_paths(Env_a, agent_a, test_pairs, rootpath + task_relation + '.paths', maxlen)

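# `load_pair` is defined elsewhere; a minimal sketch under the assumption that each line
# of the pair files holds a tab-separated (head, tail) entity pair, matching the format
# written out by the other train() variant (str(i[0]) + '\t' + str(i[1])).
def load_pair(path):
    pairs = []
    with open(path) as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            e1, e2 = line.split('\t')[:2]
            pairs.append((e1, e2))
    return pairs
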
x.create_common_transition("Deterministic")   # ("Bernoulli", 0.7)) # "Deterministic"
import Rewards
sparse_reward = Rewards.Reward(grid, actions)
sparse_reward.common_reward("sparse")

discount = 0.2
policy = np.ones((len(grid.states), len(actions))) * 0.25   # uniform policy (0.25 each)

# approximate the true state values by iterating the Bellman expectation equation
state_values = iterative_bellman_equation(grid, actions, policy, discount=discount)

from Agent import Agent as agent
# generate the episodes
episodes = agent(grid, actions, policy).sample_episode(1000, terminal_state=16)

# initial estimates should be a numpy array
initial_states = np.zeros(len(grid.states))
mc_estimates = monte_carlo(initial_states, episodes, discount=discount)
n_estimates = n_step_td(initial_states, 50, episodes, discount=discount, learning_rate=0.001)
# lambda = 0; remaining arguments assumed to mirror the n_step_td call above
lambda_estimate = td_lambda(initial_states, 0, episodes, discount=discount, learning_rate=0.001)

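# One way to compare the sampled estimates against the iterative Bellman values computed
# above; an illustrative sketch that assumes each estimate is array-like and has the same
# length as `state_values`.
import numpy as np

def rmse(estimate, reference):
    # root-mean-square error between an estimated and a reference value function
    return float(np.sqrt(np.mean((np.asarray(estimate) - np.asarray(reference)) ** 2)))

for label, est in [("Monte Carlo", mc_estimates),
                   ("n-step TD", n_estimates),
                   ("TD(lambda)", lambda_estimate)]:
    print("%-12s RMSE vs Bellman values: %.4f" % (label, rmse(est, state_values)))
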
from time import time
# imports assumed to match the rest of the project (same Grid / agent classes used above)
from Environment import Grid
from Agent import agent

# per-terrain result buckets (one sub-list per agent variant)
hill = [[], [], []]
frst = [[], [], []]
cave = [[], [], []]

outer_loop = 15
inner_loop = 30
start = time()
for i in range(outer_loop):
    env = Grid(50)
    sum1 = 0
    sum2 = 0
    sum3 = 0
    print(i)
    for j in range(inner_loop):
        agent1 = agent(env)
        agent2 = agent(env)
        agent3 = agent(env)
        temp1 = None
        temp2 = None
        temp3 = None
        # rerun each agent until it produces a (non-None) result
        while temp1 is None:
            temp1 = agent1.basic_agent(1)
        while temp2 is None:
            temp2 = agent2.basic_agent(2)
        while temp3 is None:
            temp3 = agent3.advanced_agent(2)