import queue
import time

import numpy as np
from termcolor import colored

# Monitor, FloatTensor, is_corner, and update_ccs are assumed to be provided
# by this repo's own utility modules (they are not defined in this file).


def train(env, agent, args):
    monitor = Monitor(train=True, spec="-{}".format(args.method))
    monitor.init_log(args.log, "roi_m.{}_e.{}_n.{}".format(args.model, args.env_name, args.name))
    env.reset()

    S = set()
    corWs = queue.Queue()
    # add two extreme points
    corWs.put(FloatTensor([1.0, 0.0]))
    corWs.put(FloatTensor([0.0, 1.0]))

    # outer loop: sweep over corner weights
    for _ in range(args.ws):
        print(colored("size of corWs: {}".format(corWs.qsize()), "green"))
        if corWs.qsize() == 0:
            corWs.put(FloatTensor([1.0, 0.0]))
            corWs.put(FloatTensor([0.0, 1.0]))

        # pop candidate weights until one is still a corner of the current S
        corner_w = corWs.get_nowait()
        while not is_corner(corner_w, S) and corWs.qsize() > 0:
            corner_w = corWs.get_nowait()
            print(colored("{} left....".format(corWs.qsize()), "green"))

        if not is_corner(corner_w, S):
            print(colored("no more corner w...", "green"))
            print(colored("Final S contains", "green"))
            for s in S:
                print(colored(s, "green"))
            break

        print(colored("solve for w: {}".format(corner_w), "green"))

        # inner loop: train the agent on the chosen corner weight
        for num_eps in range(int(args.episode_num / args.ws)):
            terminal = False
            env.reset()
            loss = 0
            cnt = 0
            tot_reward = 0
            tot_reward_mo = 0

            probe = None
            if args.env_name == "dst":
                probe = corner_w
            elif args.env_name in ['ft', 'ft5', 'ft7']:
                probe = FloatTensor([0.8, 0.2, 0.0, 0.0, 0.0, 0.0])

            while not terminal:
                state = env.observe()
                action = agent.act(state, corner_w)
                agent.w_kept = corner_w
                next_state, reward, terminal = env.step(action)
                if args.log:
                    monitor.add_log(state, action, reward, terminal, agent.w_kept)
                agent.memorize(state, action, next_state, reward, terminal, roi=True)
                loss += agent.learn(corner_w)
                if cnt > 100:
                    terminal = True
                    agent.reset()
                tot_reward = tot_reward + (probe.cpu().numpy().dot(reward)) * np.power(args.gamma, cnt)
                tot_reward_mo = tot_reward_mo + reward * np.power(args.gamma, cnt)
                cnt = cnt + 1

            _, q = agent.predict(probe)

            if args.env_name == "dst":
                act_1 = q[0, 3]
                act_2 = q[0, 1]
            elif args.env_name in ['ft', 'ft5', 'ft7']:
                act_1 = q[0, 1]
                act_2 = q[0, 0]

            if args.method == "crl-naive":
                act_1 = act_1.data.cpu()
                act_2 = act_2.data.cpu()
            elif args.method == "crl-envelope":
                act_1 = probe.dot(act_1.data)
                act_2 = probe.dot(act_2.data)
            elif args.method == "crl-energy":
                act_1 = probe.dot(act_1.data)
                act_2 = probe.dot(act_2.data)

            print("end of eps %d with total reward (1) %0.2f (%0.2f, %0.2f), the Q is %0.2f | %0.2f; loss: %0.4f" % (
                num_eps,
                tot_reward,
                tot_reward_mo[0],
                tot_reward_mo[1],
                act_1,
                act_2,
                # q__max,
                loss / cnt))

            monitor.update(num_eps,
                           tot_reward,
                           act_1,
                           act_2,
                           # q__max,
                           loss / cnt)

        # evaluation rollout with the policy trained for this corner weight
        # agent.is_train = False
        terminal = False
        env.reset()
        cnt = 0
        tot_reward_mo = 0
        while not terminal:
            state = env.observe()
            action = agent.act(state, corner_w)
            agent.w_kept = corner_w
            next_state, reward, terminal = env.step(action)
            if cnt > 100:
                terminal = True
                agent.reset()
            tot_reward_mo = tot_reward_mo + reward * np.power(args.gamma, cnt)
            cnt = cnt + 1
        agent.is_train = True

        # update the solution set and the queue of candidate corner weights
        S, corWs = update_ccs(S, corWs, tot_reward_mo)
        print(colored("----------------\n", "red"))
        print(colored("Current S contains", "red"))
        for s in S:
            print(colored(s, "red"))
        print(colored("----------------\n", "red"))

    # if (num_eps + 1) % 100 == 0:
    #     agent.save(args.save, args.model + args.name + "_tmp_{}".format(number))
    agent.save(args.save, "roi_m.{}_e.{}_n.{}".format(args.model, args.env_name, args.name))
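# ---------------------------------------------------------------------------
# A minimal sketch (an assumption for illustration, not part of the repo) of
# the return bookkeeping used inside the episode loop above: the
# multi-objective return is discounted per step, and the scalar `tot_reward`
# is simply the probe/corner weight dotted with that discounted reward vector.
def discounted_returns(rewards, w, gamma):
    """rewards: per-step reward vectors; w: weight vector (both array-like)."""
    ret_mo = np.zeros(len(rewards[0]), dtype=np.float64)
    for t, r in enumerate(rewards):
        ret_mo += np.asarray(r, dtype=np.float64) * gamma ** t
    return float(np.dot(np.asarray(w, dtype=np.float64), ret_mo)), ret_mo

# e.g. discounted_returns([[1, 0], [0, 2]], [0.8, 0.2], 0.99)
#      -> (1.196, array([1.  , 1.98]))
# ---------------------------------------------------------------------------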
def train(env, agent, args):
    monitor = Monitor(train=True, spec="-{}".format(args.method))
    monitor.init_log(args.log, "m.{}_e.{}_n.{}".format(args.model, args.env_name, args.name))
    env.reset()

    for num_eps in range(args.episode_num):
        terminal = False
        env.reset()
        loss = 0
        cnt = 0
        tot_reward = 0

        probe = None
        if args.env_name == "dst":
            probe = FloatTensor([0.8, 0.2])
        elif args.env_name in ['ft', 'ft5', 'ft7']:
            probe = FloatTensor([0.8, 0.2, 0.0, 0.0, 0.0, 0.0])

        while not terminal:
            state = env.observe()
            action = agent.act(state)
            next_state, reward, terminal = env.step(action)
            if args.log:
                monitor.add_log(state, action, reward, terminal, agent.w_kept)
            agent.memorize(state, action, next_state, reward, terminal)
            loss += agent.learn()
            if cnt > 100:
                terminal = True
                agent.reset()
            tot_reward = tot_reward + (probe.cpu().numpy().dot(reward)) * np.power(args.gamma, cnt)
            cnt = cnt + 1

        _, q = agent.predict(probe)

        if args.env_name == "dst":
            act_1 = q[0, 3]
            act_2 = q[0, 1]
        elif args.env_name in ['ft', 'ft5', 'ft7']:
            act_1 = q[0, 1]
            act_2 = q[0, 0]

        if args.method == "crl-naive":
            act_1 = act_1.data.cpu()
            act_2 = act_2.data.cpu()
        elif args.method == "crl-envelope":
            act_1 = probe.dot(act_1.data)
            act_2 = probe.dot(act_2.data)
        elif args.method == "crl-energy":
            act_1 = probe.dot(act_1.data)
            act_2 = probe.dot(act_2.data)

        print("end of eps %d with total reward (1) %0.2f, the Q is %0.2f | %0.2f; loss: %0.4f" % (
            num_eps,
            tot_reward,
            act_1,
            act_2,
            # q__max,
            loss / cnt))

        monitor.update(num_eps,
                       tot_reward,
                       act_1,
                       act_2,
                       # q__max,
                       loss / cnt)

        # checkpoint every 500 episodes
        if (num_eps + 1) % 500 == 0:
            agent.save(args.save, "m.{}_e.{}_n.{}".format(args.model, args.env_name, args.name))
def train(env, agent, args):
    monitor = Monitor(train=True, spec="-{}".format(args.method))
    monitor.init_log(args.log, "m.{}_e.{}_n.{}".format(args.model, args.env_name, args.name))
    env.reset()
    initial_state = env.observe()

    for num_eps in range(args.episode_num):
        terminal = False
        env.reset()
        loss = 0
        cnt = 0
        act1 = 0
        act2 = 0
        tot_reward = 0
        tot_reward_nc = 0
        tot_reward_dist = 0
        mask = None
        next_mask = None

        probe = None
        if args.env_name == "dst":
            probe = FloatTensor([0.8, 0.2])
        elif args.env_name == "crp":
            probe = FloatTensor([0.5, 0.5])
        elif args.env_name in ['ft', 'ft5', 'ft7']:
            probe = FloatTensor([0.8, 0.2, 0.0, 0.0, 0.0, 0.0])

        while not terminal:
            # per-step timing instrumentation (reported in the commented-out print below)
            t_now = time.time()
            state = env.observe()
            t_obs = time.time() - t_now

            t_now = time.time()
            if args.env_name == "crp":
                mask = env.env.get_action_out_mask()
            action = agent.act(state, mask=mask)
            t_policy = time.time() - t_now

            t_now = time.time()
            next_state, reward, terminal = env.step(action, step=0.5)
            t_step = time.time() - t_now

            if args.env_name == "crp":
                next_mask = env.env.get_action_out_mask()
            if args.log:
                monitor.add_log(state, action, reward, terminal, agent.w_kept)

            t_now = time.time()
            agent.memorize(state, action, next_state, reward, terminal, mask, next_mask)
            t_mem = time.time() - t_now

            t_now = time.time()
            loss += agent.learn()
            t_learn = time.time() - t_now

            if terminal:
                # terminal = True
                t_now = time.time()
                agent.reset()
                t_reset = time.time() - t_now

            tot_reward = tot_reward + (probe.cpu().numpy().dot(reward))
            act1 += reward[0]
            act2 += reward[1]
            tot_reward_nc = tot_reward_nc + 1 - reward[0]
            tot_reward_dist = tot_reward_dist + env.env.get_distortion(absolute=True, tollerance=0) / 10
            cnt = cnt + 1

        # _, q = agent.predict(probe, initial_state=initial_state)
        # if args.env_name == "dst":
        #     act_1 = q[0, 3]
        #     act_2 = q[0, 1]
        if args.env_name == "crp":
            act_1 = act1
            act_2 = act2
        # elif args.env_name in ['ft', 'ft5', 'ft7']:
        #     act_1 = q[0, 1]
        #     act_2 = q[0, 0]

        # if args.method == "crl-naive":
        #     act_1 = act_1.data.cpu()
        #     act_2 = act_2.data.cpu()
        # elif args.method == "crl-envelope":
        #     act_1 = probe.dot(act_1.data)
        #     act_2 = probe.dot(act_2.data)
        # elif args.method == "crl-energy":
        #     act_1 = probe.dot(act_1.data)
        #     act_2 = probe.dot(act_2.data)

        print("end of eps %d with total reward (1) %0.2f, the Q is %0.2f | %0.2f; loss: %0.4f; total_nc: %0.2f; total_dist: %0.2f; beta: %0.2f; eps: %0.2f" % (
            num_eps,
            tot_reward,
            act_1,
            act_2,
            # q__max,
            loss / cnt,
            tot_reward_nc,
            tot_reward_dist,
            agent.beta,
            agent.epsilon))

        # print("t_obs: %0.2f; t_policy: %0.2f; t_step: %0.2f; t_mem: %0.2f; t_learn: %0.2f; t_reset: %0.2f" % (
        #     t_obs, t_policy, t_step, t_mem, t_learn, t_reset))

        monitor.update(num_eps,
                       tot_reward,
                       act_1,
                       act_2,
                       # q__max,
                       loss / cnt)

        # checkpoint every 10 episodes: latest model plus a periodic numbered copy
        if num_eps % 10 == 0:
            agent.save(args.save, "m.{}_e.{}_n.{}".format(args.model, args.env_name, args.name))
            agent.save(args.save, "m.{}_e.{}_n.{}.ep{}".format(args.model, args.env_name, args.name, num_eps // 100))
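# ---------------------------------------------------------------------------
# All three variants above read the same handful of attributes off `args`.
# The snippet below is only an invocation sketch: the attribute names come
# from the code above, but the values, and the way `env` and `agent` are
# built, are assumptions (those objects come from the repo's own
# environment/agent modules, which are not shown here).
from argparse import Namespace

example_args = Namespace(
    method="crl-envelope",   # "crl-naive" | "crl-envelope" | "crl-energy"
    env_name="dst",          # "dst", "ft", "ft5", "ft7", or "crp"
    model="linear",          # hypothetical model tag used in file names
    name="demo",             # hypothetical run name used in file names
    log="logs/",             # passed to Monitor.init_log and used as a flag
    save="checkpoints/",     # directory passed to agent.save
    episode_num=2000,
    ws=10,                   # number of corner weights (ROI variant only)
    gamma=0.99,
)

# train(env, agent, example_args)  # env/agent construction is repo-specific
# ---------------------------------------------------------------------------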