Example #1
	def __init__(self, params, param_set_id, status_dict, shared_state, remote_mem):
		self.params = params
		self.param_set_id = param_set_id
		self.status_dict = status_dict
		self.shared_state = shared_state
		self.remote_mem = remote_mem

		gpu = 0
		torch.cuda.set_device(gpu)

		ep = params['env']
		ap = params['actor']
		lp = params['learner']
		rmp = params["replay_memory"]

		model_formula = f'model.{lp["model"]}(self.state_shape, self.action_dim).to(self.device)'
		optimizer_formula = lp["optimizer"].format('self.Q.parameters()')

		self.conn = psycopg2.connect(params["db"]["connection_string"])
		self.conn.autocommit = True
		self.cur = self.conn.cursor()

		self.device = torch.device("cuda:{}".format(gpu) if 0 <= gpu and torch.cuda.is_available() else "cpu")
		self.state_shape = ep['state_shape']
		self.batch_size = lp['replay_sample_size']
		self.action_dim = ep['action_dim']
		self.q_target_sync_freq = lp['q_target_sync_freq']
		self.num_q_updates = 0
		self.take_offsets = (torch.arange(self.batch_size) * self.action_dim).to(self.device)
		self.Q = eval(model_formula)
		self.Q_target = eval(model_formula) # Target Q network which is slow moving replica of self.Q
		self.optimizer = eval(optimizer_formula)
		self.replay_memory = ReplayMemory(rmp)

		self.train_num = 0
		self.model_file_name = lp['load_saved_state']
		if self.model_file_name and os.path.isfile(self.model_file_name):
			print(f'Loading {self.model_file_name}')
			saved_state = torch.load(self.model_file_name)
			self.Q.load_state_dict(saved_state['module'])
			self.optimizer.load_state_dict(saved_state['optimizer'])
			self.train_num = saved_state['train_num']

		self.shared_state['Q_state_dict'] = self.state_dict_to_cpu(self.Q.state_dict()), self.state_dict_to_cpu(
		    self.Q_target.state_dict())
		self.status_dict['Q_state_dict_stored'] = True

		self.last_Q_state_dict_id = 1
		self.status_dict['Q_state_dict_id'] = self.last_Q_state_dict_id
		self.status_dict['train_num'] = self.train_num

		self.gamma_n = params['actor']['gamma']**params['actor']['num_steps']
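
# Note: the model_formula / optimizer_formula strings above are eval()'d, so the
# config supplies executable Python expressions. Below is a minimal standalone
# sketch of that pattern; it is not from the original repository, and TinyQ plus
# the Adam config entry are assumptions made for illustration only.
import torch
import torch.nn as nn

class TinyQ(nn.Module):  # hypothetical stand-in for model.<lp["model"]>
    def __init__(self, state_shape, action_dim):
        super().__init__()
        self.fc = nn.Linear(state_shape[0], action_dim)

    def forward(self, x):
        return self.fc(x)

lp = {"optimizer": "torch.optim.Adam({}, lr=1e-4)"}  # assumed config entry
Q = TinyQ((4,), 2)
optimizer = eval(lp["optimizer"].format("Q.parameters()"))
assert isinstance(optimizer, torch.optim.Adam)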
Example #2
    def __init__(self, env, env_w, device, config: Config):
        self.env = env
        self.env_w = env_w
        self.device = device
        self.cfg = config
        self.n_actions = config.n_actions
        self.policy_net = config.policy_net
        self.target_net = config.target_net
        self.target_net.load_state_dict(self.policy_net.state_dict())
        self.target_net.eval()

        self.optimizer = optim.RMSprop(self.policy_net.parameters())
        self.memory = ReplayMemory(10000)
        self.steps_done = 0
        self.episode_durations = []
    def __init__(self, env, hparams):
        self.hparams = hparams
        self.env = env
        self.n = env.action_space.n
        self.Q = DCNN(4, self.n)
        self.T = DCNN(4, self.n)
        self.T.load_state_dict(self.Q.state_dict())
        self.T.eval()
        self.memory = ReplayMemory(hparams.memory_size)
        self.steps = 0
        self.state = env.reset()
        self.optimizer = torch.optim.RMSprop(self.Q.parameters(),
                                             lr=hparams.lr,
                                             momentum=hparams.momentum)
        self.n_episodes = 0
def make_subset_buffer(buffer_path,
                       max_examples=100000,
                       frame_height=40,
                       frame_width=40):
    # keep max_examples < 100000 to enable knn search
    # states [top of image:bottom of image,:]
    # in breakout - can safely reduce size to be 80x80 of the given image
    # try to get an even number of each type of reward

    small_path = buffer_path.replace('.npz', '_%06d.npz' % max_examples)
    if os.path.exists(small_path):
        print('loading small buffer path')
        print(small_path)
        load_buffer = ReplayMemory(load_file=small_path)
    else:
        load_buffer = ReplayMemory(load_file=buffer_path)
        print('loading prescribed buffer path')
        print(buffer_path)

    # TODO: if the frame size is wrong, we aren't handling it
    if load_buffer.count > max_examples:
        print('creating small buffer')
        # actions for breakout:
        # ['NOOP', 'FIRE', 'RIGHT', 'LEFT']
        sbuffer = ReplayMemory(
            max_examples,
            frame_height=frame_height,
            frame_width=frame_width,
            agent_history_length=load_buffer.agent_history_length)

        # remove ends because they are scary
        ends = np.where(load_buffer.terminal_flags == 1)[0][1:-1]
        random_state.shuffle(ends)
        for tidx in ends:
            if sbuffer.count >= max_examples:
                print('stopping after %s examples' % sbuffer.count)
                break
            else:
                # start after the last terminal
                i = tidx + 1
                # while there isn't a new terminal flag
                while not load_buffer.terminal_flags[i + 1]:
                    frame = cv2.resize(load_buffer.frames[i][:, :, None],
                                       (frame_height, frame_width))
                    sbuffer.add_experience(
                        action=load_buffer.actions[i],
                        frame=frame,
                        reward=load_buffer.rewards[i],
                        terminal=load_buffer.terminal_flags[i])
                    i += 1
                    if not i % 100:
                        print(sbuffer.count)

        sbuffer.save_buffer(small_path)
        load_buffer = sbuffer
    assert load_buffer.count > 10
    return load_buffer, small_path
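
# Hypothetical usage of make_subset_buffer() above; the .npz path is made up.
# The function returns both the (possibly down-sampled) buffer and the path the
# subset was saved to.
train_buffer, subset_path = make_subset_buffer('breakout_train.npz',
                                               max_examples=50000,
                                               frame_height=40,
                                               frame_width=40)
print('loaded %d examples, subset file: %s' % (train_buffer.count, subset_path))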
Example #5
    def __init__(self, epsilon=0.3, memory_size=300, batch_size=16, model=navigation_model,
                 target_update_interval=1,
                 tau=0.005):
        super(DDQN_separated_net, self).__init__(epsilon=epsilon,
                                                 random_can_stop=False)

        # Memory
        self.memory = ReplayMemory(memory_size)

        # Batch size when learning
        self.batch_size = batch_size

        # number of time steps before an update of the delayed target Q network
        self.target_update_interval = target_update_interval

        # soft update weight of the delayed Q network
        self.tau = tau
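
# The class body that actually uses self.tau is not shown here. Below is a
# minimal sketch, assuming the usual Polyak ("soft") target update that a tau
# of 0.005 implies; the soft_update helper is hypothetical, not part of the
# original DDQN_separated_net class.
import torch

def soft_update(online_net, target_net, tau):
    """Blend target parameters toward the online ones: theta' <- tau*theta + (1-tau)*theta'."""
    with torch.no_grad():
        for p, tp in zip(online_net.parameters(), target_net.parameters()):
            tp.data.mul_(1.0 - tau).add_(tau * p.data)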
    def load_checkpoint(self, filepath, config_handler=''):
        # load previously saved state file
        fh = open(filepath, 'rb')
        fdict = pickle.load(fh)
        fh.close()
        if config_handler != '':
            # use given config handler
            del fdict['ch']
            self.ch = config_handler

        self.__dict__.update(fdict)

        self.heads = np.arange(self.ch.cfg['DQN']['n_ensemble'])
        self.random_state = np.random.RandomState()
        self.random_state.set_state(fdict['state_random_state'])
        # TODO NOTE this does not restart at same env state
        self.seed = self.ch.cfg['RUN']['%s_seed' % self.phase]
        self.env = self.ch.create_environment(self.seed)
        buffer_path = filepath.replace('.pkl', '.npz')
        self.memory_buffer = ReplayMemory(load_file=buffer_path)
        # TODO should you load the count from the memory buffer - ?
        # TODO what about episode number - it will be off now
        self.step_number = self.memory_buffer.count
        self.setup_eps()
    def create_empty_memory_buffer(self, seed, buffer_size):
        return ReplayMemory(
            size=buffer_size,
            frame_height=self.frame_height,
            frame_width=self.frame_width,
            agent_history_length=self.history_length,
            batch_size=self.cfg['DQN']['batch_size'],
            num_heads=self.cfg['DQN']['n_ensemble'],
            bernoulli_probability=self.cfg['DQN']['bernoulli_probability'],
            seed=seed,
            use_pred_frames=self.cfg['DQN']['use_pred_frames'],
            # details needed for online max pooling
            maxpool=self.maxpool,
            trim_before=trim_before,
            trim_after=trim_after,
            kernel_size=kernel_size,
            reduction_function=reduction_fn,
        )
Example #8
    def load_memory_buffer(self, phase, load_previously_saved=True):
        """
         phase: string should be "train" or "eval" to indicate which memory buffer to load

         function will load latest experience in the model_savedir/name or create a random replay buffer of specified size to start from
        """
        assert phase in ['train', 'eval']
        buffer_size = self.cfg['RUN']['%s_buffer_size' % phase]
        seed = self.cfg['RUN']['%s_seed' % phase]
        init_empty_with_random = self.cfg['DQN']['load_random_%s_buffer' %
                                                 phase]
        self.num_random_steps = self.cfg['DQN']['num_pure_random_steps_%s' %
                                                phase]
        if load_previously_saved:
            buffer_path = self.search_for_latest_replay_buffer(phase)
            if buffer_path != "":
                print("loading buffer from past experience:%s" % buffer_path)
                return ReplayMemory(load_file=buffer_path)
        if not init_empty_with_random:
            # no buffer file was found, and we want an empty buffer
            print("creating empty replay buffer")
            return self.create_empty_memory_buffer(seed, buffer_size)

        #####################################################
        # from here on - we assume we need random values
        # load a presaved random buffer if it is available
        #random_buffer_path = self.get_random_buffer_path(phase, seed)
        #if os.path.exists(random_buffer_path):
        #    print("loading random replay buffer:%s"%random_buffer_path)
        #    return ReplayMemory(load_file=random_buffer_path)
        #else:
        # no buffer file was found, and we want an empty buffer
        #print('did not find saved replay buffers')
        #print('cannot find a suitable random replay buffers... creating one - this will take some time')
        # did not find any checkpoints - load random buffer
        empty_memory_buffer = self.create_empty_memory_buffer(
            seed, buffer_size)
        #env = self.create_environment(seed)
        # save the random buffer
        #random_memory_buffer.save_buffer(random_buffer_path)
        return empty_memory_buffer
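
# For reference, a hedged sketch of the config keys the method above reads,
# assuming a plain nested-dict config; the concrete values below are made up.
example_cfg = {
    'RUN': {'train_buffer_size': 1000000, 'eval_buffer_size': 500000,
            'train_seed': 14, 'eval_seed': 100},
    'DQN': {'load_random_train_buffer': False, 'load_random_eval_buffer': False,
            'num_pure_random_steps_train': 50000, 'num_pure_random_steps_eval': 100},
}
phase = 'train'
buffer_size = example_cfg['RUN']['%s_buffer_size' % phase]  # -> 1000000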
Example #9
def gen_fake(generator, agent, trainSample, batch_size, embed_dim, device, write_item, write_target, write_reward, write_action, action_num, max_length=5, recom_length=None):
    for stidx in range(0, trainSample.length(), batch_size):

        click_batch, length, _, reward_batch, action_batch = getBatch_dis(stidx, stidx + batch_size, trainSample, embed_dim, recom_length) 
        click_batch = click_batch.to(device)
        reward_batch = reward_batch.to(device)
        action_batch = action_batch.to(device)
        if recom_length is None:
            recom_length = action_batch.size(1)
        replay = ReplayMemory(generator, agent, len(length), max_length, action_num, recom_length)
        with torch.no_grad():
            replay.init_click_sample((click_batch, length), reward_batch, action_batch)
            replay.gen_sample(batch_size, False)
            seq_samples, lengths, seq_rewards, seq_actions = replay.clicks, replay.lengths, replay.tgt_rewards, replay.actions
            seq_rewards = torch.round(seq_rewards)
        write_tensor(seq_samples, lengths, write_item, write_target, 'dis', real=False)
        write_tensor_reward(seq_rewards, lengths, write_reward)
        write_tensor_action(seq_actions, lengths, write_action)
    return seq_samples, lengths, seq_rewards, seq_actions
Example #10
class Learner(object):

	def __init__(self, params, param_set_id, status_dict, shared_state, remote_mem):
		self.params = params
		self.param_set_id = param_set_id
		self.status_dict = status_dict
		self.shared_state = shared_state
		self.remote_mem = remote_mem

		gpu = 0
		torch.cuda.set_device(gpu)

		ep = params['env']
		ap = params['actor']
		lp = params['learner']
		rmp = params["replay_memory"]

		model_formula = f'model.{lp["model"]}(self.state_shape, self.action_dim).to(self.device)'
		optimizer_formula = lp["optimizer"].format('self.Q.parameters()')

		self.conn = psycopg2.connect(params["db"]["connection_string"])
		self.conn.autocommit = True
		self.cur = self.conn.cursor()

		self.device = torch.device("cuda:{}".format(gpu) if 0 <= gpu and torch.cuda.is_available() else "cpu")
		self.state_shape = ep['state_shape']
		self.batch_size = lp['replay_sample_size']
		self.action_dim = ep['action_dim']
		self.q_target_sync_freq = lp['q_target_sync_freq']
		self.num_q_updates = 0
		self.take_offsets = (torch.arange(self.batch_size) * self.action_dim).to(self.device)
		self.Q = eval(model_formula)
		self.Q_target = eval(model_formula) # Target Q network which is slow moving replica of self.Q
		self.optimizer = eval(optimizer_formula)
		self.replay_memory = ReplayMemory(rmp)

		self.train_num = 0
		self.model_file_name = lp['load_saved_state']
		if self.model_file_name and os.path.isfile(self.model_file_name):
			print(f'Loading {self.model_file_name}')
			saved_state = torch.load(self.model_file_name)
			self.Q.load_state_dict(saved_state['module'])
			self.optimizer.load_state_dict(saved_state['optimizer'])
			self.train_num = saved_state['train_num']

		self.shared_state['Q_state_dict'] = self.state_dict_to_cpu(self.Q.state_dict()), self.state_dict_to_cpu(
		    self.Q_target.state_dict())
		self.status_dict['Q_state_dict_stored'] = True

		self.last_Q_state_dict_id = 1
		self.status_dict['Q_state_dict_id'] = self.last_Q_state_dict_id
		self.status_dict['train_num'] = self.train_num

		self.gamma_n = params['actor']['gamma']**params['actor']['num_steps']

	def state_dict_to_cpu(self, state_dict):
		d = OrderedDict()
		for k, v in state_dict.items():
			d[k] = v.cpu()
		return d

	def add_experience_to_replay_mem(self):
		while self.remote_mem.qsize():
			priorities, batch = self.remote_mem.get()
			self.replay_memory.add(priorities, batch)

	def compute_loss_and_priorities(self, batch_size):
		indices, n_step_transition_batch, before_priorities = self.replay_memory.sample(batch_size)

		s = n_step_transition_batch[0].to(self.device)
		a = n_step_transition_batch[1].to(self.device)
		r = n_step_transition_batch[2].to(self.device)
		a_latest = n_step_transition_batch[3].to(self.device)
		s_latest = n_step_transition_batch[4].to(self.device)
		terminal = n_step_transition_batch[5].to(self.device)

		q = self.Q(s)
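		# take_offsets = arange(batch_size) * action_dim (set in __init__): adding the
		# action indices gives flat indices into the (batch, action_dim) Q output, so
		# q.take(...) below picks out Q(s_i, a_i) for every sample in the batch.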
		q_a = q.take(self.take_offsets + a).squeeze()

		with torch.no_grad():
			self.Q_target.eval()
			Gt = r + (1.0 - terminal) * self.gamma_n * self.Q_target(s_latest).take(self.take_offsets + a_latest).squeeze()
			td_error = Gt - q_a

		loss = F.smooth_l1_loss(q_a, Gt)
		# loss = td_error**2 / 2

		# Compute the new priorities of the experience
		after_priorities = td_error.data.abs().cpu().numpy()
		self.replay_memory.set_priorities(indices, after_priorities)

		return loss, q, before_priorities, after_priorities, indices

	def update_Q(self, loss):
		self.optimizer.zero_grad()
		loss.backward()
		self.optimizer.step()
		self.num_q_updates += 1

		if self.num_q_updates % self.q_target_sync_freq == 0:
			self.Q_target.load_state_dict(self.Q.state_dict())
			print(f'Target Q synchronized.')
			return True
		else:
			return False

	def learn(self):
		t = tables.LearnerData()
		record_type = t.get_record_type()
		record_insert = t.get_insert()
		cur = self.cur
		param_set_id = self.param_set_id
		now = datetime.datetime.now
		step_num = 0
		target_sync_num = 0
		send_param_num = 0
		min_replay_mem_size = self.params['learner']["min_replay_mem_size"]

		print('learner waiting for replay memory.')
		while self.replay_memory.size() <= min_replay_mem_size:
			self.add_experience_to_replay_mem()
			time.sleep(0.01)
		step_num = 0
		print('learner start')
		while not self.status_dict['quit']:
			self.add_experience_to_replay_mem()
			# 4. Sample a prioritized batch of transitions
			# 5. & 7. Apply double-Q learning rule, compute loss and experience priorities
			# 8. Update priorities
			loss, q, before_priorities, after_priorities, indices = self.compute_loss_and_priorities(self.batch_size)
			if step_num % 10 == 0:
				print(f'loss : {loss}')
			#print("\nLearner: step_num=", step_num, "loss:", loss, "RPM.size:", self.replay_memory.size(), end='\r')
			# 6. Update parameters of the Q network(s)
			if self.update_Q(loss):
				target_sync_num += 1
			if step_num % 5 == 0:
				self.shared_state['Q_state_dict'] = self.state_dict_to_cpu(self.Q.state_dict()), self.state_dict_to_cpu(
				    self.Q_target.state_dict())
				self.last_Q_state_dict_id += 1
				self.status_dict['Q_state_dict_id'] = self.last_Q_state_dict_id
				print('Send params to actors.')
				send_param_num += 1

			# 9. Periodically remove old experience from replay memory
			step_num += 1
			self.train_num += 1
			self.status_dict['train_num'] = self.train_num

			# Register the record in the DB
			r = record_type(param_set_id, now(), self.train_num,
			                step_num, loss.item(), q[0].tolist(), before_priorities.tolist(), after_priorities.tolist(),
			                indices.tolist(), target_sync_num, send_param_num)
			record_insert(cur, r)

		print('learner end')

		state_dict = {'module': self.Q.state_dict(), 'optimizer': self.optimizer.state_dict(), 'train_num': self.train_num}
		torch.save(state_dict, self.model_file_name)
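
# Hedged sketch of the actor side of the shared_state / status_dict protocol the
# learner above publishes to. Nothing below is from the original repository: the
# actor_Q / actor_Q_target models and the polling loop are assumptions for
# illustration only.
def actor_param_poll(shared_state, status_dict, actor_Q, actor_Q_target):
    last_seen_id = 0
    while not status_dict['quit']:
        if status_dict.get('Q_state_dict_stored') and status_dict['Q_state_dict_id'] > last_seen_id:
            q_state, q_target_state = shared_state['Q_state_dict']
            actor_Q.load_state_dict(q_state)
            actor_Q_target.load_state_dict(q_target_state)
            last_seen_id = status_dict['Q_state_dict_id']
        # ... act in the environment and push (priorities, batch) onto remote_mem ...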
Example #11
class StateManager():
    def __init__(self):
        self.reward_space = [-1, 0, 1]
        self.latent_representation_function = None
        pass

    def create_new_state_instance(self, config_handler, phase):
        self.ch = config_handler
        self.save_time = time.time() - 100000
        self.phase = phase
        self.step_number = 0
        self.end_step_number = -1
        self.episode_number = 0
        self.seed = self.ch.cfg['RUN']['%s_seed' % self.phase]
        self.random_state = np.random.RandomState(self.seed)
        self.heads = np.arange(self.ch.cfg['DQN']['n_ensemble'])
        self.episodic_reward = []
        self.episodic_reward_avg = []
        self.episodic_step_count = []
        self.episodic_step_ends = []
        self.episodic_loss = []
        self.episodic_times = []
        self.episodic_eps = []

        self.env = self.ch.create_environment(self.seed)
        self.memory_buffer = self.ch.load_memory_buffer(self.phase)
        # TODO should you load the count from the memory buffer - ?
        self.step_number = self.memory_buffer.count
        self.setup_eps()

    def setup_eps(self):
        if self.phase == 'train':
            self.eps_init = self.ch.cfg['DQN']['eps_init']
            self.eps_final = self.ch.cfg['DQN']['eps_final']
            self.eps_annealing_steps = self.ch.cfg['DQN'][
                'eps_annealing_steps']
            self.last_annealing_step = self.eps_annealing_steps + self.ch.cfg[
                'DQN']['num_pure_random_steps_train']
            if self.eps_annealing_steps > 0:
                self.slope = -(self.eps_init -
                               self.eps_final) / self.eps_annealing_steps
                self.intercept = self.eps_init - self.slope * self.ch.cfg[
                    'DQN']['num_pure_random_steps_train']

    def load_checkpoint(self, filepath, config_handler=''):
        # load previously saved state file
        fh = open(filepath, 'rb')
        fdict = pickle.load(fh)
        fh.close()
        if config_handler != '':
            # use given config handler
            del fdict['ch']
            self.ch = config_handler

        self.__dict__.update(fdict)

        self.heads = np.arange(self.ch.cfg['DQN']['n_ensemble'])
        self.random_state = np.random.RandomState()
        self.random_state.set_state(fdict['state_random_state'])
        # TODO NOTE this does not restart at same env state
        self.seed = self.ch.cfg['RUN']['%s_seed' % self.phase]
        self.env = self.ch.create_environment(self.seed)
        buffer_path = filepath.replace('.pkl', '.npz')
        self.memory_buffer = ReplayMemory(load_file=buffer_path)
        # TODO should you load the count from the memory buffer - ?
        # TODO what about episode number - it will be off now
        self.step_number = self.memory_buffer.count
        self.setup_eps()

    def save_checkpoint(self, checkpoint_basepath):
        # pass in step number because we always want to use training step number as reference
        self.save_time = time.time()
        self.plot_progress(checkpoint_basepath)
        # TODO save this class - except for random state i assume
        self.memory_buffer.save_buffer(checkpoint_basepath + '.npz')
        # TOO big - prob need to save specifics
        ## preserve random state -
        self.state_random_state = self.random_state.get_state()
        save_dict = {
            'episodic_reward': self.episodic_reward,
            'episodic_reward_avg': self.episodic_reward_avg,
            'episodic_step_count': self.episodic_step_count,
            'episodic_step_ends': self.episodic_step_ends,
            'episodic_loss': self.episodic_loss,
            'episodic_times': self.episodic_times,
            'state_random_state': self.state_random_state,
            'episode_number': self.episode_number,
            'step_number': self.step_number,
            'phase': self.phase,
            'save_time': self.save_time,
            'ch': self.ch,
            'episodic_eps': self.episodic_eps,
        }
        fh = open(checkpoint_basepath + '.pkl', 'wb')
        pickle.dump(save_dict, fh)
        fh.close()
        print('finished pickle in', time.time() - self.save_time)

    def end_episode(self):
        # catalog
        self.end_time = time.time()
        self.end_step_number = deepcopy(self.step_number)
        # add to lists
        self.episodic_reward.append(np.sum(self.episode_rewards))
        self.episodic_step_count.append(self.end_step_number -
                                        self.start_step_number)
        self.episodic_step_ends.append(self.end_step_number)
        self.episodic_loss.append(np.mean(self.episode_losses))
        self.episodic_times.append(self.end_time - self.start_time)
        try:
            self.episodic_eps.append(self.eps)
        except:
            self.episodic_eps = [1.0 for x in range(len(self.episodic_times))]
        # smoothed reward over last 100 episodes
        self.episodic_reward_avg.append(
            np.mean(self.episodic_reward[-self.ch.cfg['PLOT']['num_prev_steps_avg']:]))
        num_steps = self.episodic_step_count[-1]
        print("*** %s E%05d S%010d AH%s-R%s num random/total steps:%s/%s***" %
              (self.phase, self.episode_number, self.step_number,
               self.active_head, self.episodic_reward[-1],
               self.num_random_steps, num_steps))
        self.episode_active = False
        self.episode_number += 1

    def start_episode(self):
        self.start_time = time.time()
        self.random_state.shuffle(self.heads)
        self.active_head = self.heads[0]
        self.end_step_number = -1

        self.episode_losses = []
        self.episode_actions = []
        self.episode_rewards = []
        self.start_step_number = deepcopy(self.step_number)
        self.num_random_steps = 0

        # restart counters
        self.terminal = False
        self.life_lost = True
        self.episode_reward = 0

        state = self.env.reset()
        self.prev_action = 0
        self.prev_reward = 0
        for i in range(state.shape[0] + 1):
            # add enough memories to use the memory buffer
            # not sure if this is correct
            self.memory_buffer.add_experience(
                action=0,
                frame=state[-1],  # use the last frame in state because it is the only nonzero one
                reward=0,
                terminal=0,
                end=0,
            )

        # get correctly formatted last state
        batch = self.memory_buffer.get_history_minibatch(indices='last')
        # get state
        self.state = batch[0][0]
        if self.state.shape != (self.ch.num_prev_steps,
                                self.memory_buffer.agent_history_length,
                                self.memory_buffer.frame_height,
                                self.memory_buffer.frame_width):
            print("start shape wrong")
            embed()
        self.episode_active = True
        return self.state

    def plot_current_episode(self, plot_basepath=''):
        if plot_basepath == '':
            plot_basepath = self.get_plot_basepath()
        plot_dict = {
            'mean loss': self.episode_losses,
            'actions': self.episode_actions,
            'rewards': self.episode_rewards,
        }
        suptitle = 'E%s S%s-%s R%s' % (
            self.episode_number, self.start_step_number, self.end_step_number,
            self.episodic_reward[-1])
        plot_path = plot_basepath + '_ep%06d.png' % self.episode_number
        #step_range = np.arange(self.start_step_number, self.end_step_number)
        #self.plot_data(plot_path, plot_dict, suptitle, xname='episode steps', xdata=step_range)
        self.plot_data(plot_path, plot_dict, suptitle,
                       xname='episode steps')  #, xdata=step_range)
        ep_steps = self.end_step_number - self.start_step_number
        self.plot_histogram(plot_basepath +
                            '_ep_histrewards_%06d.png' % self.episode_number,
                            data=self.episode_rewards,
                            bins=self.reward_space,
                            title='rewards TR%s' % self.episode_reward)
        self.plot_histogram(
            plot_basepath + '_ep_histactions_%06d.png' % self.episode_number,
            data=self.episode_actions,
            bins=self.env.action_space,
            title='actions acthead:%s nrand:%s/%s' %
            (self.active_head, self.num_random_steps, ep_steps))

    def plot_last_episode(self):
        ep_steps = self.end_step_number - self.start_step_number
        ep_states, ep_actions, ep_rewards, ep_next_states, ep_terminals, ep_masks, indexes = self.memory_buffer.get_last_n_states(
            ep_steps)
        plot_basepath = self.get_plot_basepath() + '_episode_states_frames'
        self.plot_episode_movie(plot_basepath, ep_states, ep_actions,
                                ep_rewards, ep_next_states, ep_terminals,
                                ep_masks, indexes)

    def plot_episode_movie(self, plot_basepath, states, actions, rewards,
                           next_states, terminals, masks, indexes):
        if not os.path.exists(plot_basepath):
            os.makedirs(plot_basepath)
        n_steps = states.shape[0]
        print('plotting episode of length %s' % n_steps)
        if self.latent_representation_function is None:
            n_cols = 2
        else:
            pred_next_states, zs, latents = self.latent_representation_function(
                states, actions, rewards, self.ch)
            n_cols = 4
        latent_image_path = os.path.join(plot_basepath, 'latent_step_%05d.png')
        ep_reward = sum(rewards)
        movie_path = plot_basepath + '_movie_R%04d.mp4' % ep_reward

        print('starting to make movie', movie_path)
        # write frame by frame then use ffmpeg to generate movie
        #image_path = os.path.join(plot_basepath, 'step_%05d.png')
        #w_path = plot_basepath+'_write_movie_R%04d.sh'%ep_reward
        #a = open(w_path, 'w')
        #cmd = "ffmpeg -n -r 30 -i %s -c:v libx264 -pix_fmt yuv420p %s"%(os.path.abspath(image_path),os.path.abspath(movie_path))
        #a.write(cmd)
        #a.close()
        #w,h = states[0,3].shape
        #treward = 0
        #for step in range(min(n_steps, 100)):
        #    f, ax = plt.subplots(1, n_cols)
        #    if not step%20:
        #        print('plotting step', step)
        #    ax[0].imshow(states[step, 3], cmap=plt.cm.gray)
        #    #ax[0].set_title('OS-A%s' %(actions[step]))
        #    ax[1].imshow(next_states[step, 3], cmap=plt.cm.gray)
        #    treward+=rewards[step]
        #    if self.latent_representation_function != None:
        #        ax[2].imshow(pred_next_states[step], cmap=plt.cm.gray)
        #        z = np.hstack((zs[step,0], zs[step,1], zs[step,2]))
        #        ax[3].imshow(z)
        #    for aa in range(n_cols):
        #        ax[aa].set_xticks([])
        #        ax[aa].set_yticks([])
        #    f.suptitle('%sA%sR%sT%sD%s'%(step, actions[step], rewards[step], treward, int(terminals[step])))
        #    plt.savefig(image_path%step)
        #    plt.close()

        # generate movie directly
        max_frames = 5000
        n = min(n_steps, max_frames)
        for step in range(n):
            if self.latent_representation_function is not None:
                z = np.hstack((zs[step, 0], zs[step, 1], zs[step, 2]))
                zo = resize(z, (84, 84), cval=0, order=0)
                # TODO - is imwrite clipping zo since it is not a uint8?
                img = np.hstack(
                    (states[step,
                            3], next_states[step,
                                            3], pred_next_states[step], zo))
            else:
                img = np.hstack((states[step, 3], next_states[step, 3]))

            if not step:
                movie = np.zeros((n, img.shape[0], img.shape[1]))
                if self.latent_representation_function is not None:
                    latent_movie = np.zeros((n, z.shape[0], z.shape[1]))
            movie[step] = img
            if self.latent_representation_function is not None:
                # only track the latent movie when latents were actually computed
                latent_movie[step] = z
        vwrite(movie_path, movie)

    def plot_histogram(self, plot_path, data, bins, title=''):
        n, bins, _ = plt.hist(data, bins=bins)
        plt.xticks(bins, bins)
        plt.yticks(n, n)
        plt.xlim(min(bins), max(bins) + 1)
        plt.title(title)
        plt.savefig(plot_path)
        plt.close()

    def plot_progress(self, plot_basepath=''):
        if plot_basepath == '':
            plot_basepath = self.get_plot_basepath()
        det_plot_dict = {
            'episodic step count': self.episodic_step_count,
            'episodic time': self.episodic_times,
            'mean episodic loss': self.episodic_loss,
            'eps': self.episodic_eps,
        }

        suptitle = 'Details E%s S%s' % (self.episode_number,
                                        self.end_step_number)
        edet_plot_path = plot_basepath + '_details_episodes.png'
        sdet_plot_path = plot_basepath + '_details_steps.png'
        if self.end_step_number > 1:
            #exdata = np.arange(self.episode_number)
            #self.plot_data(edet_plot_path, det_plot_dict, suptitle, xname='episode', xdata=exdata)
            #self.plot_data(sdet_plot_path, det_plot_dict, suptitle, xname='steps', xdata=self.episodic_step_ends)
            self.plot_data(edet_plot_path,
                           det_plot_dict,
                           suptitle,
                           xname='episode')  #, xdata=exdata)
            self.plot_data(sdet_plot_path,
                           det_plot_dict,
                           suptitle,
                           xname='steps',
                           xdata=self.episodic_step_ends)

            rew_plot_dict = {
                'episodic reward': self.episodic_reward,
                'smooth episodic reward': self.episodic_reward_avg,
            }

            suptitle = 'Reward E%s S%s R%s' % (self.episode_number,
                                               self.end_step_number,
                                               self.episodic_reward[-1])
            erew_plot_path = plot_basepath + '_reward_episodes.png'
            srew_plot_path = plot_basepath + '_reward_steps.png'
            #self.plot_data(erew_plot_path, rew_plot_dict, suptitle, xname='episode', xdata=np.arange(self.episode_number))
            #self.plot_data(srew_plot_path, rew_plot_dict, suptitle, xname='steps', xdata=self.episodic_step_ends)
            self.plot_data(
                erew_plot_path, rew_plot_dict, suptitle,
                xname='episode')  #, xdata=np.arange(self.episode_number))
            self.plot_data(srew_plot_path,
                           rew_plot_dict,
                           suptitle,
                           xname='steps',
                           xdata=self.episodic_step_ends)

    def plot_data(self, savepath, plot_dict, suptitle, xname, xdata=None):
        st = time.time()
        print('starting plot data')
        n = len(plot_dict.keys())
        f, ax = plt.subplots(n, 1, figsize=(6, 3 * n))
        #f,ax = plt.subplots(n,1)
        try:
            for xx, name in enumerate(sorted(plot_dict.keys())):
                if xdata is not None:
                    ax[xx].plot(xdata, plot_dict[name])
                else:
                    ax[xx].plot(plot_dict[name])
                ax[xx].set_title('%s' % (name))
                ax[xx].set_ylabel(name)
                print(name, xname, st - time.time())
            ax[xx].set_xlabel(xname)
            f.suptitle('%s %s' % (self.phase, suptitle))
            print('end sup', st - time.time())
            f.savefig(savepath)
            print("saved: %s" % savepath)
            plt.close()
            print('finished')
        except Exception:
            print("plot")
            embed()

    def get_plot_basepath(self):
        return self.ch.get_checkpoint_basepath(
            self.step_number) + '_%s' % self.phase

    def handle_plotting(self, plot_basepath='', force_plot=False):
        # will plot at beginning of episode
        #if not self.episode_number % self.ch.cfg['PLOT']['plot_episode_every_%s_episodes'%self.phase]:
        # dont plot first episode
        plot_basepath = self.get_plot_basepath()
        if self.episode_number:
            if force_plot:
                self.plot_current_episode(plot_basepath)
                self.plot_progress(plot_basepath)
            if self.episode_number == 1 or not self.episode_number % self.ch.cfg[
                    'PLOT']['plot_episode_every_%s_episodes' % self.phase]:
                self.plot_current_episode(plot_basepath)
            if self.episode_number == 1 or not self.episode_number % self.ch.cfg[
                    'PLOT']['plot_every_%s_episodes' % self.phase]:
                self.plot_progress(plot_basepath)

    def step(self, action):
        next_state, reward, self.life_lost, self.terminal = self.env.step(
            action)
        self.prev_action = action
        self.prev_reward = np.sign(reward)
        # the replay buffer will convert the observed state as needed
        self.memory_buffer.add_experience(
            action=action,
            frame=next_state[-1],
            reward=self.prev_reward,
            terminal=self.life_lost,
            end=self.terminal,
        )
        self.episode_actions.append(self.prev_action)
        self.episode_rewards.append(self.prev_reward)
        self.step_number += 1
        batch = self.memory_buffer.get_history_minibatch(indices='last')
        # get state
        self.state = batch[0][0]
        #self.state = self.memory_buffer.get_last_state()
        if self.state.shape[1] == 0:
            print('handler state chan 0')
            embed()

    def set_eps(self):
        # TODO function to find eps - for now use constant
        if self.step_number <= self.ch.cfg['DQN']['num_pure_random_steps_%s' %
                                                  self.phase]:
            self.eps = 1.0
        if self.phase == 'train':
            self.eps = self.eps_final
            if self.step_number < self.last_annealing_step:
                self.eps = self.slope * self.step_number + self.intercept
        else:
            self.eps = self.ch.cfg['EVAL']['eps_eval']

    def random_action(self):
        self.num_random_steps += 1
        # pass action_idx to env.action_space
        return self.random_state.choice(range(self.env.num_actions))

    def is_random_action(self):
        self.set_eps()
        r = self.random_state.rand()
        if r < self.eps:
            return True
        else:
            return False
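
# Standalone sketch of the linear epsilon annealing implied by setup_eps()/set_eps()
# above: eps stays at 1.0 for the pure-random steps, then decays linearly from
# eps_init to eps_final over eps_annealing_steps. The default values below are
# assumptions for illustration, not taken from the original config.
def linear_eps(step, eps_init=1.0, eps_final=0.01,
               eps_annealing_steps=1000000, num_pure_random_steps=50000):
    if step <= num_pure_random_steps:
        return 1.0
    slope = -(eps_init - eps_final) / eps_annealing_steps
    intercept = eps_init - slope * num_pure_random_steps
    last_annealing_step = eps_annealing_steps + num_pure_random_steps
    if step < last_annealing_step:
        return slope * step + intercept
    return eps_final

print(linear_eps(50000), round(linear_eps(525000), 3), linear_eps(2000000))
# 1.0 0.53 0.01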
Example #12
class DQN:
    def __init__(self, env, hparams):
        self.hparams = hparams
        self.env = env
        self.n = env.action_space.n
        self.Q = DCNN(4, self.n)
        self.T = DCNN(4, self.n)
        self.T.load_state_dict(self.Q.state_dict())
        self.T.eval()
        self.memory = ReplayMemory(hparams.memory_size)
        self.steps = 0
        self.state = env.reset()
        self.optimizer = torch.optim.RMSprop(self.Q.parameters(),
                                             lr=hparams.lr,
                                             momentum=hparams.momentum)
        self.n_episodes = 0

    @torch.no_grad()
    def select_action(self):
        hparams = self.hparams
        start = hparams.eps_start
        end = hparams.eps_end
        time = hparams.eps_time
        steps = self.steps
        self.steps += 1
        if steps < time:
            epsilon = start - (start - end) * steps / time
        else:
            epsilon = end

        sample = random.random()

        if sample > epsilon:
            return self.Q(s2t(self.state).to(device)).max(1)[1].item()
        else:
            return self.env.action_space.sample()

    def sample_step(self, fs_min=2, fs_max=6):
        """repeats a single action between fs_min and fs_max (inclusive) times"""
        fs = random.randint(fs_min, fs_max)
        action = self.select_action()
        r = 0
        for _ in range(fs):
            new_state, reward, done, _ = self.env.step(action)
            self.memory.push(self.state, action,
                             new_state if not done else None, reward)
            r += reward
            self.state = self.env.reset() if done else new_state
            if done:
                self.n_episodes += 1
        return r

    def optimize(self):
        hparams = self.hparams
        transitions = self.memory.sample(hparams.batch_size)
        batch = Transition(*zip(*transitions))
        states = torch.cat([s2t(state) for state in batch.state]).to(device)
        actions = torch.tensor(batch.action).unsqueeze(1).to(device)
        target_values = torch.tensor(
            batch.reward).unsqueeze(1).to(device).float()
        non_terminal_next_states = torch.cat([
            s2t(state) for state in batch.next_state if state is not None
        ]).to(device)
        non_terminal_mask = torch.tensor([
            state is not None for state in batch.next_state
        ]).to(device).unsqueeze(1)

        values = self.Q(states).gather(1, actions).float()
        target_values[non_terminal_mask] += hparams.gamma * self.T(
            non_terminal_next_states).detach().max(1)[0].float()

        #print(values.dtype,target_values.dtype)
        loss = F.smooth_l1_loss(values, target_values)
        self.optimizer.zero_grad()
        loss.backward()

        for param in self.Q.parameters():
            param.grad.data.clamp_(-1, 1)  # maybe try sign_?

        self.optimizer.step()
        return loss
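
# Hypothetical driver loop for the DQN class above; env, hparams and the field
# names train_steps / target_update are assumptions, not part of the original code.
dqn = DQN(env, hparams)
for step in range(hparams.train_steps):
    dqn.sample_step()
    if dqn.steps >= hparams.batch_size:  # wait until some experience is collected
        loss = dqn.optimize()
    if step % hparams.target_update == 0:
        dqn.T.load_state_dict(dqn.Q.state_dict())  # hard sync of the target network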
Example #13
            </Mission>'''

# Create default Malmo objects:
agent_host = MalmoPython.AgentHost()
try:
    agent_host.parse(sys.argv)
except RuntimeError as e:
    print('ERROR:', e)
    print(agent_host.getUsage())
    exit(1)
if agent_host.receivedArgument("help"):
    print(agent_host.getUsage())
    exit(0)

net = cnn.Net()
memory = ReplayMemory(1000)

for episode in range(5):
    my_mission = MalmoPython.MissionSpec(missionXML, True)
    my_mission_record = MalmoPython.MissionRecordSpec()

    # Attempt to start a mission:
    max_retries = 3
    for retry in range(max_retries):
        try:
            agent_host.startMission(my_mission, my_mission_record)
            break
        except RuntimeError as e:
            if retry == max_retries - 1:
                print("Error starting mission:", e)
                exit(1)
Example #14
def train(sess, environment, actor, critic, embeddings, history_length, ra_length, buffer_size, batch_size,
	  discount_factor, nb_episodes, filename_summary, nb_rounds, **env_args):
	''' Algorithm 3 in article. '''

	# Set up summary operators
	def build_summaries():
		episode_reward = tf.Variable (0.)
		tf.summary.scalar ('reward', episode_reward)
		episode_max_Q = tf.Variable (0.)
		tf.summary.scalar ('max_Q_value', episode_max_Q)
		critic_loss = tf.Variable (0.)
		tf.summary.scalar ('critic_loss', critic_loss)

		summary_vars = [episode_reward, episode_max_Q, critic_loss]
		summary_ops = tf.summary.merge_all ()
		return summary_ops, summary_vars

	summary_ops, summary_vars = build_summaries ()
	sess.run (tf.global_variables_initializer ())
	writer = tf.summary.FileWriter (filename_summary, sess.graph)

	# '2: Initialize target network f′ and Q′'
	actor.init_target_network ()
	critic.init_target_network ()

	# '3: Initialize the capacity of replay memory D'
	replay_memory = ReplayMemory(buffer_size)  # Memory D in article
	replay = False

	start_time = time.time ()
	for i_session in range (nb_episodes):  # '4: for session = 1, M do'
		session_reward = 0
		session_Q_value = 0
		session_critic_loss = 0

		# '5: Reset the item space I' is useless because unchanged.
		nb_env = 10
		envs = np.asarray([Environment(**env_args) for i in range(nb_env)])
# 		u = [e.current_user for e in envs]
# 		print(u)
# 		input()
		states = np.array([env.current_state for env in envs])  # '6: Initialize state s_0 from previous sessions'
		
	#         if (i_session + 1) % 10 == 0:  # Update average parameters every 10 episodes
	#             environment.groups = environment.get_groups ()

		exploration_noise = OrnsteinUhlenbeckNoise (history_length * embeddings.size ())

		for t in range (nb_rounds):  # '7: for t = 1, T do'
			# '8: Stage 1: Transition Generating Stage'

			# '9: Select an action a_t = {a_t^1, ..., a_t^K} according to Algorithm 2'
			actions, item_idxes = actor.get_recommendation_list (
				ra_length,
				states.reshape (nb_env, -1),  # TODO + exploration_noise.get().reshape(1, -1),
				embeddings)

			# '10: Execute action a_t and observe the reward list {r_t^1, ..., r_t^K} for each item in a_t'
			for env, state, action, items in zip(envs, states, actions, item_idxes):
				sim_results, rewards, next_state = env.step (action, items)

				# '19: Store transition (s_t, a_t, r_t, s_t+1) in D'
				replay_memory.add (state.reshape (history_length * embeddings.size ()),
								   action.reshape (ra_length * embeddings.size ()),
								   [rewards],
								   next_state.reshape (history_length * embeddings.size ()))

				state = next_state  # '20: Set s_t = s_t+1'

				session_reward += rewards

			# '21: Stage 2: Parameter Updating Stage'
			if replay_memory.size () >= batch_size * nb_env:  # Experience replay
				replay = True
				replay_Q_value, critic_loss = experience_replay (replay_memory, batch_size,
																 actor, critic, embeddings, ra_length,
																 history_length * embeddings.size (),
																 ra_length * embeddings.size (), discount_factor)
				session_Q_value += replay_Q_value
				session_critic_loss += critic_loss

			summary_str = sess.run (summary_ops,
									feed_dict = {summary_vars[0]: session_reward,
												 summary_vars[1]: session_Q_value,
												 summary_vars[2]: session_critic_loss})

			writer.add_summary (summary_str, i_session)

			'''
			print(state_to_items(embeddings.embed(data['state'][0]), actor, ra_length, embeddings),
				  state_to_items(embeddings.embed(data['state'][0]), actor, ra_length, embeddings, True))
			'''

		str_loss = str ('Loss=%0.4f' % session_critic_loss)
		print (('Episode %d/%d Reward=%d Time=%ds ' + (str_loss if replay else 'No replay')) % (i_session + 1, nb_episodes, session_reward, time.time () - start_time))
		start_time = time.time ()

	writer.close ()
	tf.train.Saver ().save (sess, 'models.h5', write_meta_graph = False)
Example #15
def init_train():
    """ use args to setup inplace training """
    train_data_path = args.train_buffer
    valid_data_path = args.valid_buffer

    data_dir = os.path.split(train_data_path)[0]

    # we are starting from scratch training this model
    if args.model_loadpath == "":
        run_num = 0
        model_base_filedir = os.path.join(data_dir,
                                          args.savename + '%02d' % run_num)
        while os.path.exists(model_base_filedir):
            run_num += 1
            model_base_filedir = os.path.join(data_dir,
                                              args.savename + '%02d' % run_num)
        os.makedirs(model_base_filedir)
        model_base_filepath = os.path.join(model_base_filedir, args.savename)
        print("MODEL BASE FILEPATH", model_base_filepath)

        info = {
            'model_train_cnts': [],
            'model_train_losses': {},
            'model_valid_cnts': [],
            'model_valid_losses': {},
            'model_save_times': [],
            'model_last_save': 0,
            'model_last_plot': 0,
            'NORM_BY': 255.0,
            'MODEL_BASE_FILEDIR': model_base_filedir,
            'model_base_filepath': model_base_filepath,
            'model_train_data_file': train_data_path,
            'model_valid_data_file': valid_data_path,
            'NUM_TRAINING_EXAMPLES': args.num_training_examples,
            'NUM_K': args.num_k,
            'NR_LOGISTIC_MIX': args.nr_logistic_mix,
            'NUM_PCNN_FILTERS': args.num_pcnn_filters,
            'NUM_PCNN_LAYERS': args.num_pcnn_layers,
            'ALPHA_REC': args.alpha_rec,
            'ALPHA_ACT': args.alpha_act,
            'ALPHA_REW': args.alpha_rew,
            'MODEL_BATCH_SIZE': args.batch_size,
            'NUMBER_CONDITION': args.num_condition,
            'CODE_LENGTH': args.code_length,
            'NUM_MIXTURES': args.num_mixtures,
            'REQUIRE_UNIQUE_CODES': args.require_unique_codes,
        }

        ## size of latents flattened - dependent on architecture
        #info['float_condition_size'] = 100*args.num_z
        ## 3x logistic needed for loss
        ## TODO - change loss
    else:
        print('loading model from: %s' % args.model_loadpath)
        model_dict = torch.load(args.model_loadpath,
                                map_location=lambda storage, loc: storage)
        info = model_dict['model_info']
        model_base_filedir = os.path.split(args.model_loadpath)[0]
        model_base_filepath = os.path.join(model_base_filedir, args.savename)
        info['loaded_from'] = args.model_loadpath
        info['MODEL_BATCH_SIZE'] = args.batch_size
    info['DEVICE'] = DEVICE
    info['MODEL_SAVE_EVERY'] = args.save_every
    info['MODEL_LOG_EVERY_BATCHES'] = args.log_every_batches
    info['model_loadpath'] = args.model_loadpath
    info['MODEL_SAVENAME'] = args.savename
    info['MODEL_LEARNING_RATE'] = args.learning_rate
    # create replay buffer
    train_buffer = make_subset_buffer(
        train_data_path, max_examples=info['NUM_TRAINING_EXAMPLES'])
    valid_buffer = make_subset_buffer(valid_data_path,
                                      max_examples=int(
                                          info['NUM_TRAINING_EXAMPLES'] * .1))
    valid_buffer = ReplayMemory(load_file=valid_data_path)
    # if train buffer is too large - make random subset
    # 27588 places in 1e6 buffer where reward is nonzero

    info['num_actions'] = train_buffer.num_actions()
    info['size_training_set'] = train_buffer.num_examples()
    info['hsize'] = train_buffer.frame_height
    info['wsize'] = train_buffer.frame_width
    info['num_rewards'] = train_buffer.num_rewards()
    info['HISTORY_SIZE'] = 4

    rewards_weight = 1 - np.array(train_buffer.percentages_rewards())
    actions_weight = 1 - np.array(train_buffer.percentages_actions())
    actions_weight = torch.FloatTensor(actions_weight).to(DEVICE)
    rewards_weight = torch.FloatTensor(rewards_weight).to(DEVICE)
    info['actions_weight'] = actions_weight
    info['rewards_weight'] = rewards_weight

    # output mixtures should be 2*nr_logistic_mix + nr_logistic mix for each
    # decorelated channel
    info['num_output_mixtures'] = (2 * args.nr_logistic_mix +
                                   args.nr_logistic_mix) * info['HISTORY_SIZE']
    nmix = int(info['num_output_mixtures'] / info['HISTORY_SIZE'])
    info['nmix'] = nmix
    #encoder_model = ConvVAE(info['CODE_LENGTH'], input_size=args.num_condition,
    #                        encoder_output_size=args.encoder_output_size,
    #                        num_output_channels=nmix,
    #                         ).to(DEVICE)
    encoder_model = ConvVAE(info['CODE_LENGTH'],
                            input_size=args.num_condition,
                            encoder_output_size=args.encoder_output_size,
                            num_output_channels=1).to(DEVICE)
    prior_model = PriorNetwork(
        size_training_set=info['NUM_TRAINING_EXAMPLES'],
        code_length=info['CODE_LENGTH'],
        n_mixtures=info['NUM_MIXTURES'],
        k=info['NUM_K'],
        require_unique_codes=info['REQUIRE_UNIQUE_CODES'],
    ).to(DEVICE)
    pcnn_decoder = GatedPixelCNN(input_dim=1,
                                 dim=info['NUM_PCNN_FILTERS'],
                                 n_layers=info['NUM_PCNN_LAYERS'],
                                 n_classes=info['num_actions'],
                                 float_condition_size=info['CODE_LENGTH'],
                                 last_layer_bias=0.5,
                                 hsize=info['hsize'],
                                 wsize=info['wsize']).to(DEVICE)

    parameters = list(encoder_model.parameters()) + list(
        prior_model.parameters()) + list(pcnn_decoder.parameters())
    parameters = list(encoder_model.parameters()) + list(
        prior_model.parameters())
    opt = optim.Adam(parameters, lr=info['MODEL_LEARNING_RATE'])

    if args.model_loadpath != '':
        print("loading weights from:%s" % args.model_loadpath)
        encoder_model.load_state_dict(model_dict['encoder_model_state_dict'])
        prior_model.load_state_dict(model_dict['prior_model_state_dict'])
        pcnn_decoder.load_state_dict(model_dict['pcnn_decoder_state_dict'])
        #encoder_model.embedding = model_dict['model_embedding']
        opt.load_state_dict(model_dict['opt_state_dict'])

    model_dict = {
        'encoder_model': encoder_model,
        'prior_model': prior_model,
        'pcnn_decoder': pcnn_decoder,
        'opt': opt
    }
    data_buffers = {'train': train_buffer, 'valid': valid_buffer}
    if args.sample:
        sample_acn(info,
                   model_dict,
                   data_buffers,
                   num_samples=args.num_samples,
                   teacher_force=args.teacher_force)
    else:
        train_acn(info, model_dict, data_buffers)
Example #16
class Agent:
    def __init__(self, env, env_w, device, config: Config):
        self.env = env
        self.env_w = env_w
        self.device = device
        self.cfg = config
        self.n_actions = config.n_actions
        self.policy_net = config.policy_net
        self.target_net = config.target_net
        self.target_net.load_state_dict(self.policy_net.state_dict())
        self.target_net.eval()

        self.optimizer = optim.RMSprop(self.policy_net.parameters())
        self.memory = ReplayMemory(10000)
        self.steps_done = 0
        self.episode_durations = []

    def select_action(self, state):
        self.steps_done += 1
        sample = random.random()
        eps_threshold = self.cfg.EPS_END + (self.cfg.EPS_START - self.cfg.EPS_END) * \
            math.exp(-1. * self.steps_done / self.cfg.EPS_DECAY)
        if sample < eps_threshold:
            with torch.no_grad():
                # t.max(1) will return largest column value of each row.
                # second column on max result is index of where max element was
                # found, so we pick action with the larger expected reward.
                # action = self.policy_net(state).max(1)[1]
                action = self.policy_net(state).argmax() % self.n_actions
        else:
            action = random.randrange(self.n_actions)
        return torch.tensor([[action]], device=self.device, dtype=torch.long)

    def optimize_model(self):
        if len(self.memory) < self.cfg.BATCH_SIZE:
            return
        transitions = self.memory.sample(self.cfg.BATCH_SIZE)
        # Transpose the batch (see https://stackoverflow.com/a/19343/3343043 for
        # detailed explanation). This converts batch-array of Transitions
        # to Transition of batch-arrays.
        batch = Transition(*zip(*transitions))

        # Compute a mask of non-final states and concatenate the batch elements
        # (a final state would've been the one after which simulation ended)
        non_final_mask = torch.tensor(tuple(map(lambda s: s is not None, batch.next_state)),
                                      device=self.device,
                                      dtype=torch.bool)
        non_final_next_states = torch.cat([s for s in batch.next_state if s is not None])
        state_batch = torch.cat(batch.state)
        action_batch = torch.cat(batch.action)
        reward_batch = torch.cat(batch.reward)

        # Compute Q(s_t, a) - the model computes Q(s_t), then we select the
        # columns of actions taken. These are the actions which would've been taken
        # for each batch state according to policy_net
        state_action_values = self.policy_net(state_batch).gather(1, action_batch)

        # Compute V(s_{t+1}) for all next states.
        # Expected values of actions for non_final_next_states are computed based
        # on the "older" target_net; selecting their best reward with max(1)[0].
        # This is merged based on the mask, such that we'll have either the expected
        # state value or 0 in case the state was final.
        next_state_values = torch.zeros(self.cfg.BATCH_SIZE, device=self.device)
        next_state_values[non_final_mask] = self.target_net(non_final_next_states).max(1)[0].detach()
        # Compute the expected Q values
        expected_state_action_values = (next_state_values * self.cfg.GAMMA) + reward_batch

        # Compute Huber loss
        loss = F.smooth_l1_loss(state_action_values, expected_state_action_values.unsqueeze(1))

        # Optimize the model
        self.optimizer.zero_grad()
        loss.backward()
        for param in self.policy_net.parameters():
            param.grad.data.clamp_(-1, 1)
        self.optimizer.step()

    def step(self, i_episode):
        # Initialize the environment and state
        self.env.reset()
        last_screen = self.env_w.get_screen()
        current_screen = self.env_w.get_screen()
        state = current_screen - last_screen
        for t in count():
            # Select and perform an action
            action = self.select_action(state)
            obs, reward, done, obs_ = self.env.step(action.item())
            # reward = torch.tensor([reward], device=self.device)
            reward = torch.tensor([-abs(obs[2])], device=self.device, dtype=torch.float)

            # Observe new state
            last_screen = current_screen
            current_screen = self.env_w.get_screen()
            if not done:
                next_state = current_screen - last_screen
            else:
                next_state = None

            # Store the transition in memory
            self.memory.push(state, action, next_state, reward)

            # Move to the next state
            state = next_state

            # Perform one step of the optimization (on the target network)
            self.optimize_model()
            if done:
                self.episode_durations.append(t + 1)
                self.env_w.plot_durations(self.episode_durations)
                break
        # Update the target network, copying all weights and biases in DQN
        if i_episode % self.cfg.TARGET_UPDATE == 0:
            self.target_net.load_state_dict(self.policy_net.state_dict())
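
# Hypothetical driver for the Agent above: run episodes and let Agent.step()
# handle acting, replay storage, optimization and the periodic target sync.
# env, env_w, device and config are assumed to be constructed elsewhere.
agent = Agent(env, env_w, device, config)
num_episodes = 300  # assumed value
for i_episode in range(num_episodes):
    agent.step(i_episode)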
Example #17
memoryCapacity = 10000
numEpisodes = 10000
maxStepPerEpisode = 2000
learningRate = 0.03

# rendering
render = True
renderStepDuration = 50
renderEpisodeDuration = 20

# initialization
policyNet = TetrisDQN()
targetNet = TetrisDQN()
policyNet.to(device)
targetNet.to(device)
memory = ReplayMemory(memoryCapacity)
tetris = Tetris()
optimizer = optim.RMSprop(policyNet.parameters(), lr=learningRate)

numSteps = 0
bestEpisodeReward = -1000
done = True

# save results
currentTime = datetime.datetime.now()
timeString = currentTime.strftime("%Y-%m-%d-%H-%M-%S")
if not os.path.exists("results"):
    os.mkdir("results")
directory = "results/" + timeString + "/"
os.mkdir(directory)
configFile = open(directory + "config.txt", "w")
    # create environment
    env = Environment(rom_file=info['GAME'],
                      frame_skip=info['FRAME_SKIP'],
                      num_frames=info['HISTORY_SIZE'],
                      no_op_start=info['MAX_NO_OP_FRAMES'],
                      rand_seed=info['SEED'],
                      dead_as_end=info['DEAD_AS_END'],
                      max_episode_steps=info['MAX_EPISODE_STEPS'])

    # create replay buffer
    replay_memory = ReplayMemory(
        action_space=env.action_space,
        size=info['BUFFER_SIZE'],
        frame_height=info['OBS_SIZE'][0],
        frame_width=info['OBS_SIZE'][1],
        agent_history_length=info['HISTORY_SIZE'],
        batch_size=info['BATCH_SIZE'],
        num_heads=info['N_ENSEMBLE'],
        bernoulli_probability=info['BERNOULLI_PROBABILITY'],
        latent_frame_height=info['LATENT_SIZE'],
        latent_frame_width=info['LATENT_SIZE'])
    # latent_replay_memory = ReplayMemory(action_space=env.action_space,
    #                              size=info['BUFFER_SIZE'],
    #                              frame_height=info['LATENT_SIZE'],
    #                              frame_width=info['LATENT_SIZE'],
    #                              agent_history_length=info['HISTORY_SIZE'],
    #                              batch_size=info['BATCH_SIZE'],
    #                              num_heads=info['N_ENSEMBLE'],
    #                              bernoulli_probability=info['BERNOULLI_PROBABILITY'])

    random_state = np.random.RandomState(info["SEED"])
def train_dqn(env,
              num_steps,
              *,
              replay_size,
              batch_size,
              exploration,
              gamma,
              train_freq=1,
              print_freq=100,
              target_network_update_freq=500,
              t_learning_start=1000):
    """
    DQN algorithm.

    Compared to previous training procedures, we will train for a given number
    of time-steps rather than a given number of episodes.  The number of
    time-steps will be in the range of millions, which still results in many
    episodes being executed.

    Args:
        - env: The OpenAI Gym environment
        - num_steps: Total number of steps to be used for training
        - replay_size: Maximum size of the ReplayMemory
        - batch_size: Number of experiences in a batch
        - exploration: an ExponentialSchedule used to anneal the epsilon-greedy exploration rate
        - gamma: The discount factor

    Returns: (dqn_model, returns, lengths, losses)
        - dqn_model: The trained DQN model
        - returns: Numpy array containing the return of each training episode
        - lengths: Numpy array containing the length of each training episode
        - losses: Numpy array containing the loss of each training batch
    """
    # check that environment states are compatible with our DQN representation
    assert (isinstance(env.observation_space, gym.spaces.Box)
            and len(env.observation_space.shape) == 1)

    # get the state_size from the environment
    state_size = env.observation_space.shape[0]

    # initialize the DQN and DQN-target models
    dqn_model = DQN(state_size, env.action_space.n)
    dqn_target = DQN.custom_load(dqn_model.custom_dump())

    # initialize the optimizer
    optimizer = torch.optim.Adam(dqn_model.parameters(), lr=5e-4)

    # initialize the replay memory
    memory = ReplayMemory(replay_size, state_size)

    # initiate lists to store returns, lengths and losses
    rewards = []
    returns = []
    lengths = []
    losses = []

    last_100_returns = deque(maxlen=100)
    last_100_lengths = deque(maxlen=100)

    # initiate a structure to store the models at different stages of training
    # (left unused in this version; the trained model is returned directly)
    saved_models = {}

    i_episode = 0
    t_episode = 0

    state = env.reset()

    # iterate for a total of `num_steps` steps
    for t_total in range(num_steps):
        # use t_total to indicate the time-step from the beginning of training

        if t_total >= t_learning_start:
            eps = exploration.value(t_total - t_learning_start)
        else:
            eps = 1.0
        action = select_action_epsilon_greedy(dqn_model, state, eps, env)
        next_state, reward, done, _ = env.step(action)
        memory.add(state, action, reward, next_state, done)

        rewards.append(reward)
        state = next_state

        if t_total >= t_learning_start and t_total % train_freq == 0:
            batch = memory.sample(batch_size)
            loss = train_dqn_batch(optimizer, batch, dqn_model, dqn_target,
                                   gamma)
            losses.append(loss)

        # update target network
        if t_total >= t_learning_start and t_total % target_network_update_freq == 0:
            dqn_target.load_state_dict(dqn_model.state_dict())

        if done:

            # Calculate episode returns
            G = 0
            for i in range(len(rewards)):
                G += rewards[i] * pow(gamma, i)

            # Collect results
            lengths.append(t_episode + 1)
            returns.append(G)

            last_100_returns.append(G)
            last_100_lengths.append(t_episode + 1)

            if i_episode % print_freq == 0:
                logger.record_tabular("time step", t_total)

                logger.record_tabular("episodes", i_episode)
                logger.record_tabular("step", t_episode + 1)
                logger.record_tabular("return", G)
                logger.record_tabular("mean reward", np.mean(last_100_returns))
                logger.record_tabular("mean length", np.mean(last_100_lengths))

                logger.record_tabular("% time spent exploring", int(100 * eps))
                logger.dump_tabular()

            # End of episode so reset time, reset rewards list
            t_episode = 0
            rewards = []

            # Environment terminated so reset it
            state = env.reset()

            # Increment the episode index
            i_episode += 1

        else:
            t_episode += 1

    return (
        dqn_model,
        np.array(returns),
        np.array(lengths),
        np.array(losses),
    )
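# A hedged usage sketch for train_dqn: the CartPole environment and the
# ExponentialSchedule constructor arguments are assumptions for illustration;
# only the keyword names come from the signature above.
import gym

env = gym.make('CartPole-v1')
exploration = ExponentialSchedule(1.0, 0.05, 1_000_000)   # assumed (value_from, value_to, num_steps)
dqn_model, returns, lengths, losses = train_dqn(
    env,
    num_steps=1_500_000,
    replay_size=200_000,
    batch_size=64,
    exploration=exploration,
    gamma=0.99,
    train_freq=1,
    print_freq=100,
    target_network_update_freq=500,
    t_learning_start=1000,
)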
Example #20
0
class DDQN_separated_net(Agent_segment):
    def __init__(self, epsilon=0.3, memory_size=300, batch_size=16, model=navigation_model,
                 target_update_interval=1,
                 tau=0.005):
        super(DDQN_separated_net, self).__init__(epsilon=epsilon,
                                                 random_can_stop=False)

        # Memory
        self.memory = ReplayMemory(memory_size)

        # Batch size when learning
        self.batch_size = batch_size

        # number of time steps before an update of the delayed target Q network
        self.target_update_interval = target_update_interval

        # soft update weight of the delayed Q network
        self.tau = tau

    def learned_act(self, s, pred_oracle=True, online=False):
        if online:
            if pred_oracle:
                return torch.cat([self.model(s), oracle(s).unsqueeze(1)], 1)
        with torch.no_grad():
            if pred_oracle:
                return torch.cat([self.target_model(s), oracle(s).unsqueeze(1)], 1)
                # to do without oracle

    def reinforce(self, s_, a_, n_s_, r_, game_over_, env_steps_):
        # Two steps: first memorize the states, second learn from the pool

        self.memory.remember(s_, a_, n_s_, r_, game_over_)

        transitions = self.memory.sample(self.batch_size)
        batch = Transition(*zip(*transitions))

        # Compute a mask of non-final states and concatenate the batch elements
        # (a final state would've been the one after which simulation ended)


        # non_final_mask = torch.tensor(torch.cat(batch.game_over), device=device)==False
        non_final_mask = torch.cat(batch.game_over) == False

        non_final_next_states = torch.cat(batch.next_state)[non_final_mask]
        state_batch = torch.cat(batch.state)
        action_batch = torch.cat(batch.action).view(-1, 2)
        reward_batch = torch.cat(batch.reward)
        # non_final_next_states = torch.cat(batch.next_state)[non_final_index]

        # print(state_batch.shape)
        state_values = self.learned_act(state_batch, online=True)
        state_action_values = torch.cat(
            [s[a[0].item(), a[1].item()].unsqueeze(0) for s, a in zip(state_values, batch.action)])

        next_state_values = torch.zeros(self.batch_size, device=device)

        if len(non_final_next_states) > 0:
            with torch.no_grad():
                argmax_online = (self.learned_act(non_final_next_states, online=True)).view(non_final_next_states.shape[0],-1).argmax(1)
                # print(torch.tensor(range(self.batch_size), device=device)[non_final_mask])
                # print(self.learned_act(non_final_next_states, online=False).view(-1, 2*SEGMENT_LENGTH).shape)
                next_state_values[non_final_mask] = \
                self.learned_act(non_final_next_states, online=False).view(non_final_next_states.shape[0], -1)[
                    range(len(non_final_next_states)), argmax_online]

        expected_state_action_values = next_state_values + reward_batch

        loss = F.smooth_l1_loss(state_action_values[non_final_mask],
                                expected_state_action_values[non_final_mask])  # .unsqueeze(1))
        # loss = F.mse_loss(state_action_values[non_final_mask], expected_state_action_values[non_final_mask])

        # Optimize the model
        self.optimizer.zero_grad()
        loss.backward()
        for param in self.model.parameters():
            # HINT: clip the gradients to avoid exploding gradients -- this
            # clipping range is much tighter than the usual (-1, 1)
            param.grad.data.clamp_(-1e-6, 1e-6)
        self.optimizer.step()

        if env_steps_ % self.target_update_interval == 0:
            soft_update(self.target_model, self.model, self.tau)

        return float(loss)

    def save_model(self, model_path='model.pickle'):
        try:
            torch.save(self.model, model_path)
        except Exception:
            # ignore save failures (e.g. a read-only filesystem) instead of crashing training
            pass

    def load_model(self, model_path='model.pickle', local=True):
        if local:
            self.model = navigation_model()
            self.target_model = navigation_model()
            hard_update(self.target_model, self.model)
        else:
            self.model = torch.load(model_path)
            self.target_model = torch.load(model_path)
        if torch.cuda.is_available():
            print('Using GPU')
            self.model.cuda()
            self.target_model.cuda()
        else:
            print('Using CPU')
        self.optimizer = optim.RMSprop(self.model.parameters(), lr=1e-5)
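# The class above relies on soft_update/hard_update helpers that are not shown
# in this example. A sketch of the usual Polyak-averaging implementation (not
# necessarily the authors' exact code):
import torch

def soft_update(target, source, tau):
    # target <- tau * source + (1 - tau) * target, parameter by parameter
    with torch.no_grad():
        for t_param, s_param in zip(target.parameters(), source.parameters()):
            t_param.data.copy_(tau * s_param.data + (1.0 - tau) * t_param.data)

def hard_update(target, source):
    # copy the source weights verbatim into the target network
    target.load_state_dict(source.state_dict())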
Example #21
0
def init_train():
    train_data_file = args.train_buffer
    data_dir = os.path.split(train_data_file)[0]
    valid_data_file = args.valid_buffer
    #valid_data_file = '/usr/local/data/jhansen/planning/model_savedir/FRANKbootstrap_priorfreeway00/valid_set_small.npz'
    if args.model_loadpath == '':
        train_cnt = 0
        run_num = 0
        model_base_filedir = os.path.join(data_dir,
                                          args.savename + '%02d' % run_num)
        while os.path.exists(model_base_filedir):
            run_num += 1
            model_base_filedir = os.path.join(data_dir,
                                              args.savename + '%02d' % run_num)
        os.makedirs(model_base_filedir)
        model_base_filepath = os.path.join(model_base_filedir, args.savename)
        print("MODEL BASE FILEPATH", model_base_filepath)

        info = {
            'model_train_cnts': [],
            'model_train_losses_list': [],
            'model_valid_cnts': [],
            'model_valid_losses_list': [],
            'model_save_times': [],
            'model_last_save': 0,
            'model_last_plot': 0,
            'NORM_BY': 255.0,
            'model_model_loadpath': args.model_loadpath,
            'MODEL_MODEL_BASE_FILEDIR': model_base_filedir,
            'model_model_base_filepath': model_base_filepath,
            'model_train_data_file': train_data_file,
            'model_SAVENAME': args.savename,
            'DEVICE': DEVICE,
            'NUM_Z': args.num_z,
            'NUM_K': args.num_k,
            'NR_LOGISTIC_MIX': args.nr_logistic_mix,
            'BETA': args.beta,
            'ALPHA_REC': args.alpha_rec,
            'ALPHA_ACT': args.alpha_act,
            'ALPHA_REW': args.alpha_rew,
            'MODEL_BATCH_SIZE': args.batch_size,
            'NUMBER_CONDITION': args.number_condition,
            'MODEL_LEARNING_RATE': args.learning_rate,
            'MODEL_SAVE_EVERY': args.save_every,
        }

        ## size of latents flattened - dependent on architecture of vqvae
        #info['float_condition_size'] = 100*args.num_z
        ## 3x logistic needed for loss
        ## TODO - change loss
    else:
        print('loading model from: %s' % args.model_loadpath)
        model_dict = torch.load(args.model_loadpath)
        info = model_dict['model_info']
        model_base_filedir = os.path.split(args.model_loadpath)[0]
        model_base_filepath = os.path.join(model_base_filedir, args.savename)
        train_cnt = info['model_train_cnts'][-1]
        info['loaded_from'] = args.model_loadpath
        info['MODEL_BATCH_SIZE'] = args.batch_size
        #if 'reward_weights' not in info.keys():
        #    info['reward_weights'] = [1,100]
    # create replay buffer
    train_buffer = ReplayMemory(load_file=train_data_file)
    valid_buffer = ReplayMemory(load_file=valid_data_file)

    info['num_actions'] = train_buffer.num_actions()
    info['size_training_set'] = train_buffer.num_examples()
    info['hsize'] = train_buffer.frame_height
    info['wsize'] = train_buffer.frame_width
    info['num_rewards'] = train_buffer.num_rewards()

    rewards_weight = 1 - np.array(train_buffer.percentages_rewards())
    actions_weight = 1 - np.array(train_buffer.percentages_actions())
    actions_weight = torch.FloatTensor(actions_weight).to(DEVICE)
    rewards_weight = torch.FloatTensor(rewards_weight).to(DEVICE)
    info['actions_weight'] = actions_weight
    info['rewards_weight'] = rewards_weight

    # output mixtures should be 2*nr_logistic_mix + nr_logistic_mix for each
    # decorrelated channel
    info['HISTORY_SIZE'] = 4
    info['num_output_mixtures'] = (2 * args.nr_logistic_mix +
                                   args.nr_logistic_mix) * info['HISTORY_SIZE']
    nmix = int(info['num_output_mixtures'] / info['HISTORY_SIZE'])
    info['nmix'] = nmix
    vqvae_model = VQVAErl(
        num_clusters=info['NUM_K'],
        encoder_output_size=info['NUM_Z'],
        num_output_mixtures=info['num_output_mixtures'],
        in_channels_size=info['NUMBER_CONDITION'],
        n_actions=info['num_actions'],
        int_reward=info['num_rewards'],
    ).to(DEVICE)

    print('using args', args)
    parameters = list(vqvae_model.parameters())
    opt = optim.Adam(parameters, lr=info['MODEL_LEARNING_RATE'])
    if args.model_loadpath != '':
        print("loading weights from:%s" % args.model_loadpath)
        vqvae_model.load_state_dict(model_dict['modelvae_state_dict'])
        opt.load_state_dict(model_dict['model_optimizer'])
        vqvae_model.embedding = model_dict['model_embedding']

    #args.pred_output_size = 1*80*80
    ## 10 is result of structure of network
    #args.z_input_size = 10*10*args.num_z
    #train_cnt = train_vqvae(train_cnt, vqvae_model, opt, info, train_data_loader, valid_data_loader)
    run_vqvae(info,
              vqvae_model,
              opt,
              train_buffer,
              valid_buffer,
              num_samples_to_train=args.num_samples_to_train,
              save_every_samples=args.save_every)
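# Worked example of the mixture bookkeeping in init_train, with illustrative
# numbers (nr_logistic_mix = 10 is not a project default): each channel needs
# 2 * K + K parameters per pixel for K logistics (presumably means, log-scales
# and mixture weights), so with HISTORY_SIZE = 4:
nr_logistic_mix = 10
history_size = 4
num_output_mixtures = (2 * nr_logistic_mix + nr_logistic_mix) * history_size
nmix = num_output_mixtures // history_size
assert (num_output_mixtures, nmix) == (120, 30)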
Example #22
0

# Create default Malmo objects:
agent_host = MalmoPython.AgentHost()
try:
    agent_host.parse(sys.argv)
except RuntimeError as e:
    print('ERROR:', e)
    print(agent_host.getUsage())
    exit(1)
if agent_host.receivedArgument("help"):
    print(agent_host.getUsage())
    exit(0)

net = cnn.Net(farm_size, 4)
memory = ReplayMemory(1000)

for episode in range(5):
    my_mission = MalmoPython.MissionSpec(missionXML, True)
    my_mission_record = MalmoPython.MissionRecordSpec()

    # Attempt to start a mission:
    max_retries = 3
    for retry in range(max_retries):
        try:
            agent_host.startMission(my_mission, my_mission_record)
            break
        except RuntimeError as e:
            if retry == max_retries - 1:
                print("Error starting mission:", e)
                exit(1)
    env = Environment(rom_file=info['GAME'],
                      frame_skip=info['FRAME_SKIP'],
                      num_frames=info['HISTORY_SIZE'],
                      no_op_start=info['MAX_NO_OP_FRAMES'],
                      rand_seed=info['SEED'],
                      dead_as_end=info['DEAD_AS_END'],
                      max_episode_steps=info['MAX_EPISODE_STEPS'])

    # create replay buffer
    if info['REPLAY_MEMORY_LOADPATH'] == 0:
        replay_memory = ReplayMemory(
            action_space=env.action_space,
            size=info['BUFFER_SIZE'],
            frame_height=info['OBS_SIZE'][0],
            frame_width=info['OBS_SIZE'][1],
            agent_history_length=info['HISTORY_SIZE'],
            batch_size=info['BATCH_SIZE'],
            num_heads=info['N_ENSEMBLE'],
            bernoulli_probability=info['BERNOULLI_PROBABILITY'],
            #latent_frame_height=info['LATENT_SIZE'],
            #latent_frame_width=info['LATENT_SIZE'])
        )

    else:
        replay_memory = ReplayMemory(load_file=info['REPLAY_MEMORY_LOADPATH'])
        valid_replay_memory = ReplayMemory(
            load_file=info['REPLAY_MEMORY_VALID_LOADPATH'])
        start_step_number = replay_memory.count
    random_state = np.random.RandomState(info["SEED"])

    if args.model_loadpath != '':
        # load data from loadpath - save model load for later. we need some of
Example #24
0
        lambda: GCNBoard(env.n_resources + 2, 8, env.n_resources, env.n_nodes,
                         0.2), representation, env, 'board')
    trainer_node = []
    for i in range(env.adj.shape[0]):
        trainer_node.append(
            Trainer(
                lambda: GCNNode(env.n_resources + 2, 8, env.degree[i], env.
                                n_nodes, 0.2, 'node' + str(i)), representation,
                env, 'node'))
    trainer_score = ScorePredictionTrainer(
        lambda: ScorePredictionFunc(env.n_resources + 2, 8, env.n_nodes, 0.2),
        representation, env)

    mem_scores = ReplayMemory(500, {
        "sts": [env.adj.shape[0], env.n_resources],
        "features": [env.features.shape[0], env.features.shape[1]],
        "scores": []
    },
                              batch_size=20)
    mem_board = ReplayMemory(
        100, {
            "sts": [env.adj.shape[0], env.n_resources],
            "features": [env.features.shape[0], env.features.shape[1]],
            "pi": [env.n_resources + 1],
            "return": []
        })
    mem_node = []
    for i in range(env.adj.shape[0]):
        mem_node.append(
            ReplayMemory(
                100, {
                    "sts": [env.adj.shape[0], env.n_resources],
Example #25
0
def setup(data_dir, savename, train_data_file, model_loadpath=''):
    data_dir = os.path.split(train_data_file)[0]
    train_buffer = ReplayMemory(load_file=train_data_file)

    if args.model_loadpath == '':
        train_cnt = 0
        run_num = 0
        model_base_filedir = os.path.join(data_dir,
                                          savename + '%02d' % run_num)
        while os.path.exists(model_base_filedir):
            run_num += 1
            model_base_filedir = os.path.join(data_dir,
                                              savename + '%02d' % run_num)
        os.makedirs(model_base_filedir)
        model_base_filepath = os.path.join(model_base_filedir, savename)
        print("MODEL BASE FILEPATH", model_base_filepath)

        info = {
            'train_cnts': [],
            'train_losses_list': [],
            'valid_cnts': [],
            'valid_losses_list': [],
            'save_times': [],
            'savename': savename,
            'data_dir': data_dir,
            #'args':[args],
            'last_save': 0,
            'last_plot': 0,
            'reward_weights': [1, 100],  # should be same as num_rewards
        }
    else:
        print('loading model from: %s' % model_loadpath)
        model_dict = torch.load(model_loadpath)
        info = model_dict['info']
        model_base_filedir = os.path.split(model_loadpath)[0]
        model_base_filepath = os.path.join(model_base_filedir, args.savename)
        train_cnt = info['train_cnts'][-1]
        info['loaded_from'] = model_loadpath
        if 'reward_weights' not in info.keys():
            info['reward_weights'] = [1, 100]

    num_actions = info['num_actions'] = n_actions
    num_rewards = info['num_rewards'] = len(train_data_loader.unique_rewards)
    args.size_training_set = train_data_loader.num_examples
    hsize = train_data_loader.data_h
    wsize = train_data_loader.data_w
    info['num_rewards'] = len(train_data_loader.unique_rewards)

    info['hsize'] = hsize
    info['num_channels'] = num_actions + 1 + 1

    #  !!!! TODO save this in npz and pull out
    #num_k = info['num_k'] = 512
    ###########################################3
    # load vq model
    vq_model_dict = torch.load(args.vq_model_loadpath,
                               map_location=lambda storage, loc: storage)
    vq_info = vq_model_dict['info']
    vq_largs = vq_info['args'][-1]
    nmix = int(vq_info['num_output_mixtures'] / 2)
    ###########################################3
    num_k = vq_largs.num_k
    vqvae_model = VQVAErl(num_clusters=num_k,
                          encoder_output_size=vq_largs.num_z,
                          num_output_mixtures=vq_info['num_output_mixtures'],
                          in_channels_size=vq_largs.number_condition,
                          n_actions=vq_info['num_actions'],
                          int_reward=vq_info['num_rewards']).to(DEVICE)
    vqvae_model.load_state_dict(vq_model_dict['vqvae_state_dict'])
    vqvae_model.eval()
    #conv_forward_model = ForwardResNet(BasicBlock, data_width=info['hsize'],
    #                                   num_channels=info['num_channels'],
    #                                   num_actions=num_actions,
    #                                   num_output_channels=num_k,
    #                                   num_rewards=num_rewards,
    #                                   dropout_prob=args.dropout_prob).to(DEVICE)

    conv_forward_model = ForwardResNet(
        BasicBlock,
        data_width=info['hsize'],
        num_channels=info['num_channels'],
        num_output_channels=num_k,
        dropout_prob=args.dropout_prob).to(DEVICE)

    # reweight the data based on its frequency
    info['actions_weight'] = 1 - np.array(
        train_data_loader.percentages_actions)
    info['rewards_weight'] = 1 - np.array(
        train_data_loader.percentages_rewards)
    actions_weight = torch.FloatTensor(info['actions_weight']).to(DEVICE)
    rewards_weight = torch.FloatTensor(info['rewards_weight']).to(DEVICE)
    parameters = list(conv_forward_model.parameters())
    opt = optim.Adam(parameters, lr=args.learning_rate)
    if args.model_loadpath != '':
        conv_forward_model.load_state_dict(model_dict['conv_forward_model'])
        opt.load_state_dict(model_dict['optimizer'])
    #args.pred_output_size = 1*80*80
    ## 10 is result of structure of network
    #args.z_input_size = 10*10*args.num_z
    train_cnt = train_forward(train_cnt)
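# Sketch of the frequency-based reweighting used above, on made-up numbers:
# frequent classes get a small weight and rare classes a large one. How the
# weight tensors are consumed downstream is not shown in this snippet; passing
# them as the `weight` argument of a classification loss is one plausible use.
import numpy as np
import torch
import torch.nn as nn

percentages_actions = np.array([0.70, 0.20, 0.10])   # hypothetical class frequencies
actions_weight = torch.FloatTensor(1 - percentages_actions)
action_loss = nn.CrossEntropyLoss(weight=actions_weight)

logits = torch.randn(8, 3)                           # batch of 8 samples, 3 action classes
targets = torch.randint(3, (8,))
loss = action_loss(logits, targets)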
Example #26
0
def train_gen_pg_each(generator, agent, discriminator, epoch, trainSample, subnum, optimizer_agent, optimizer_usr, batch_size, embed_dim, recom_length, max_length, real_label_num, device, gen_ratio, pretrain = False, shuffle_index=None):
    generator.train()
    agent.train()
    print('\nTRAINING : Epoch ' + str(epoch))
    generator.train()
    all_costs   = []
    logs        = []
    decay = 0.95
    gamma = 0.9
    max_norm=5
    all_num=0
    last_time = time.time()
     
    #Adjust the learning rate
    if epoch>1:
        optimizer_agent.param_groups[0]['lr'] = optimizer_agent.param_groups[0]['lr'] * decay
        optimizer_usr.param_groups[0]['lr'] = optimizer_usr.param_groups[0]['lr'] * decay
    print('Learning rate_agent : {0}'.format(optimizer_agent.param_groups[0]['lr']))
    print('Learning rate_usr : {0}'.format(optimizer_usr.param_groups[0]['lr']))
    
    #Generate subsamples
    trainSample_sub = Sample()
    trainSample_sub.subSample_copy(subnum, trainSample, shuffle_index)
    for stidx in range(0, trainSample_sub.length(), batch_size):
        # prepare batch
        embed_batch, length, _, reward_batch, action_batch = getBatch_dis(stidx, stidx + batch_size, trainSample_sub, embed_dim, recom_length) 
        embed_batch, reward_batch, action_batch = Variable(embed_batch.to(device)), Variable(reward_batch.to(device)), Variable(action_batch.to(device)) 
        k = embed_batch.size(0) #Actual batch size
        replay = ReplayMemory(generator, agent, int((1+gen_ratio)*k), max_length, real_label_num, action_batch.size(1))
        replay.init_click((embed_batch, length), reward_batch, action_batch)
        replay.gen_sample(batch_size, True, discriminator)
        tgt_reward, gen_reward, usr_prob, agent_prob = replay.tgt_rewards.type(torch.FloatTensor).to(device), replay.gen_rewards.type(torch.FloatTensor).to(device), replay.usr_probs.to(device), replay.agent_probs.to(device)
         
        tgt_prob = torch.abs(1.0-torch.round(tgt_reward)-tgt_reward)
        tgt_reward = torch.round(tgt_reward)
        if not pretrain: 
            loss_usr = -((torch.log(usr_prob + 1e-12) + torch.log(tgt_prob + 1e-12)) * gen_reward).sum()/k
        #Calculate the cumulative reward
        tgt_reward = gen_reward * (1 + tgt_reward)
        tgt_value = generator.value(tgt_reward)
        #loss_agent = -(torch.log(agent_prob + 1e-12) * (gen_reward + tgt_value)).sum()/k #+ 1e-18
        loss_agent = -(torch.log(agent_prob + 1e-12) * (tgt_value)).sum()/k #+ 1e-18
        all_costs.append(loss_agent.data.cpu().numpy())
        # backward
        optimizer_agent.zero_grad()
        optimizer_usr.zero_grad()
        if not pretrain:
            loss_usr.backward(retain_graph=True) 
            #Print gradients for each layer
            '''
            print("Gradients for user behavior models:")
            print("Embedding:")
            generator.embedding.print_grad()
            print("Encoder:")
            generator.encoder.print_grad()
            print("MLPlayer:")
            print(generator.enc2out.weight.grad)
            '''
            #Gradient clipping
            clip_grad_value_(filter(lambda p: p.requires_grad, generator.parameters()), 1)
            #clip_grad_norm_(filter(lambda p: p.requires_grad, generator.parameters()), 5)
            optimizer_usr.step()
        loss_agent.backward()
        #Gradient clipping
        clip_grad_value_(filter(lambda p: p.requires_grad, agent.parameters()), 1)
        #clip_grad_norm_(filter(lambda p: p.requires_grad, agent.parameters()), 5)
        # optimizer step
        optimizer_agent.step()
        # Printing
        if len(all_costs) == 100:
            logs.append( '{0} ; loss {1} ; seq/s {2}'.format(stidx, round(np.mean(all_costs),2), int(len(all_costs) * batch_size / (time.time() - last_time))))
            print(logs[-1])
            last_time = time.time()
            all_costs = []
    return all_costs
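# Worked example of the per-epoch learning-rate decay above (decay = 0.95 is
# applied from epoch 2 onward, so the rate in epoch e is lr_0 * 0.95 ** (e - 1)).
# With an illustrative lr_0 = 0.01: epoch 1 -> 0.01, epoch 2 -> 0.0095,
# epoch 3 -> 0.009025, epoch 11 -> ~0.00599.
lr0, decay = 0.01, 0.95
print([round(lr0 * decay ** (e - 1), 6) for e in range(1, 12)])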
    info['num_rewards'] = len(info['REWARD_SPACE'])

    # create environment
    env = Environment(rom_file=info['GAME'],
                      frame_skip=info['FRAME_SKIP'],
                      num_frames=info['HISTORY_SIZE'],
                      no_op_start=info['MAX_NO_OP_FRAMES'],
                      rand_seed=info['SEED'],
                      dead_as_end=info['DEAD_AS_END'],
                      max_episode_steps=info['MAX_EPISODE_STEPS'])

    # create replay buffer
    replay_memory = ReplayMemory(
        size=info['BUFFER_SIZE'],
        frame_height=info['OBS_SIZE'][0],
        frame_width=info['OBS_SIZE'][1],
        agent_history_length=info['HISTORY_SIZE'],
        batch_size=info['BATCH_SIZE'],
        num_heads=info['N_ENSEMBLE'],
        bernoulli_probability=info['BERNOULLI_PROBABILITY'])
    latent_replay_memory = ReplayMemory(
        size=info['BUFFER_SIZE'],
        frame_height=info['LATENT_SIZE'],
        frame_width=info['LATENT_SIZE'],
        agent_history_length=info['HISTORY_SIZE'],
        batch_size=info['BATCH_SIZE'],
        num_heads=info['N_ENSEMBLE'],
        bernoulli_probability=info['BERNOULLI_PROBABILITY'])

    random_state = np.random.RandomState(info["SEED"])

    if args.model_loadpath != '':
Example #28
0
def make_random_subset_buffers(dataset_path,
                               buffer_path,
                               train_max_examples=100000,
                               kernel_size=(2, 2),
                               trim_before=0,
                               trim_after=0):
    sys.path.append('../agents')
    from replay import ReplayMemory
    # keep max_examples < 100000 to enable knn search
    # states [top of image:bottom of image,:]
    # in breakout - can safely reduce size to be 40x40 of the given image
    # try to get an even number of each type of reward

    if not os.path.exists(dataset_path):
        os.makedirs(dataset_path)
    buffer_name = os.path.split(buffer_path)[1]
    buffers = {}
    paths = {}
    for phase in ['valid', 'train']:
        if phase == 'valid':
            max_examples = int(0.15 * train_max_examples)
        else:
            max_examples = train_max_examples
        small_name = buffer_name.replace(
            '.npz', '_random_subset_%06d_%sx%stb%sta%s_%s.npz' %
            (max_examples, kernel_size[0], kernel_size[1], trim_before,
             trim_after, phase))
        small_path = os.path.join(dataset_path, small_name)
        paths[phase] = small_path
        if os.path.exists(small_path):
            print('loading small buffer path')
            print(small_path)
            sbuffer = ReplayMemory(load_file=small_path)
            sbuffer.init_unique()
            buffers[phase] = sbuffer

    # if we don't have both train and valid - make a completely new train/valid set
    if not len(buffers.keys()) == 2:
        print('creating new train/valid buffers')
        load_buffer = ReplayMemory(load_file=buffer_path)
        orig_states = []
        small_states = []
        for index in range(10, 400):
            if load_buffer.is_valid_index(index):
                s, _ = load_buffer._get_state(index)
                orig_states.append(s[-1])
                small_states.append(
                    load_buffer.online_shrink_frame_size(
                        s[-1], trim_before, kernel_size, trim_after))
        bdir = small_path.replace('.npz', '')
        if not os.path.exists(bdir):
            os.makedirs(bdir)
        image_path = os.path.join(bdir, 'step_%03d.png')
        movie_path = os.path.join(bdir, 'movie.mp4')
        for index in range(len(orig_states)):
            f, ax = plt.subplots(1, 2)
            ax[0].matshow(orig_states[index])
            ax[1].matshow(small_states[index])
            plt.savefig(image_path % index)
            plt.close()
        cmd = "ffmpeg -n -r 10 -i %s -c:v libx264 -pix_fmt yuv420p %s" % (
            os.path.abspath(image_path), os.path.abspath(movie_path))
        print(cmd)
        os.system(cmd)

        if max(list(kernel_size) + [trim_before, trim_after]) > 1:
            load_buffer.shrink_frame_size(kernel_size=kernel_size,
                                          reduction_function=np.max,
                                          trim_before=trim_before,
                                          trim_after=trim_after)

        #for r in range(states.shape[0]):
        #    imwrite('mp%s.png'%r, states[r,-1])

        load_buffer.reset_unique()
        # history_length + 1 for every random example
        frame_multiplier = (load_buffer.agent_history_length + 1)
        #frame_multiplier = 2
        total_frames_needed = int((max_examples * 1.15) * frame_multiplier) + 1
        # not sure why we weren't allowing overlapping frames
        #total_frames_needed = int((max_examples*1.15))
        if load_buffer.count < total_frames_needed % load_buffer.size:
            raise ValueError(
                'load buffer is not large enough (%s) to collect number of examples (%s)'
                % (load_buffer.count, total_frames_needed))
        print('loading prescribed buffer path.... this may take a while')
        print(buffer_path)
        for phase in ['valid', 'train']:
            if phase == 'valid':
                max_examples = int(0.15 * train_max_examples)
            else:
                max_examples = train_max_examples
            print('creating small %s buffer with %s examples' %
                  (phase, max_examples))
            # actions for breakout:
            # ['NOOP', 'FIRE', 'RIGHT', 'LEFT']
            frames_needed = max_examples * frame_multiplier
            sbuffer = ReplayMemory(
                frames_needed,
                frame_height=load_buffer.frame_height,
                frame_width=load_buffer.frame_width,
                agent_history_length=load_buffer.agent_history_length)

            num_examples = 0
            while num_examples < max_examples:
                batch = load_buffer.get_unique_minibatch(1)
                states, actions, rewards, next_states, real_terminal_flags, _, unique_indices, index_indices = batch
                bs, num_hist, h, w = states.shape
                # action is the action that was used to get from state to next state
                #    t-3, t-2, t-1, t-1, t
                #  s-4, s-3, s-2, s-1
                #     s-3, s-2, s-1, s

                past_indices = np.arange(unique_indices - (num_hist),
                                         unique_indices + 1)
                for batch_idx in range(bs):
                    # get t-4 thru t=0
                    # size is bs,5,h,w
                    all_states = np.hstack((states[:, 0:1], next_states))
                    for ss in range(num_hist + 1):
                        # only use batch size 1 in minibatch
                        # frame is "next state" in replay buffer
                        frame = all_states[batch_idx, ss]
                        action = load_buffer.actions[past_indices[ss]]
                        reward = load_buffer.rewards[past_indices[ss]]
                        if ss == num_hist:
                            # this is the observed state and the only one we will
                            # use a true action/reward for
                            #action = actions[batch_idx]
                            #reward = rewards[batch_idx]
                            terminal_flag = True
                            end_flag = True
                            num_examples += 1
                            if not num_examples % 5000:
                                print('added %s examples to %s buffer' %
                                      (num_examples, phase))
                        else:
                            # use this to debug and assert that all actions/rewards
                            # in sampled minibatch of sbuffer are < 99
                            terminal_flag = False
                            end_flag = False

                        sbuffer.add_experience(action, frame, reward,
                                               terminal_flag, end_flag)
            sbuffer.rewards = sbuffer.rewards.astype(np.int32)
            sbuffer.init_unique()
            sbuffer.save_buffer(paths[phase])
            buffers[phase] = sbuffer
    return buffers, paths
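# A hedged usage sketch for make_random_subset_buffers; the paths and keyword
# values below are placeholders, not real project files.
buffers, paths = make_random_subset_buffers(
    dataset_path='datasets/breakout_subsets',
    buffer_path='model_savedir/breakout_training_buffer.npz',
    train_max_examples=60000,   # keep well under 100000 so the knn search stays feasible
    kernel_size=(2, 2),
    trim_before=0,
    trim_after=0)
train_buffer = buffers['train']
valid_buffer = buffers['valid']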