class AgentPolicyGradient:
    def __init__(self, n_x, n_y, learning_rate=0.02, reward_decay=0.99,
                 load_path=None, save_path=None):
        self.PG = PolicyGradient(n_x, n_y,
                                 learning_rate=learning_rate,
                                 reward_decay=reward_decay,
                                 load_path=load_path,
                                 save_path=save_path)

    def choose_action(self, observation):
        return self.PG.choose_action(observation)

    def store_transition(self, s, a, r):
        return self.PG.store_transition(s, a, r)

    def learn(self):
        return self.PG.learn()

    def plot_cost(self):
        import matplotlib
        matplotlib.use('TkAgg')
        import matplotlib.pyplot as plt
        plt.plot(np.arange(len(self.PG.cost_history)), self.PG.cost_history)
        plt.ylabel('Cost Ex')
        plt.xlabel('Training Steps Ex')
        plt.show()

    def crashed(self):
        episode_rewards_sum = sum(self.PG.episode_rewards)
        return episode_rewards_sum < -250

    def episode_reward(self):
        episode_rewards_sum = sum(self.PG.episode_rewards)
        return episode_rewards_sum

    def costs(self):
        return self.PG.costs()
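# All of the examples in this collection eventually call PolicyGradient.learn().
# A minimal sketch of the discounted, normalized return computation that
# REINFORCE-style learn() implementations of this kind typically perform; the
# helper name discount_and_norm_rewards is an assumption, not the snippets' API.
import numpy as np

def discount_and_norm_rewards(episode_rewards, gamma=0.99):
    """Compute normalized discounted returns G_t for one episode."""
    discounted = np.zeros(len(episode_rewards), dtype=np.float64)
    running_add = 0.0
    for t in reversed(range(len(episode_rewards))):
        running_add = running_add * gamma + episode_rewards[t]
        discounted[t] = running_add
    # Normalize to zero mean / unit variance to reduce gradient variance.
    discounted -= discounted.mean()
    if discounted.std() > 0:
        discounted /= discounted.std()
    return discounted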
                   n_y=env.action_space.n,
                   learning_rate=0.01,
                   reward_decay=0.95,
                   load_path=load_path,
                   save_path=save_path)

for episode in range(EPISODES):  # start of training
    observation = env.reset()
    episode_reward = 0

    while True:
        if RENDER_ENV:
            env.render()

        # 1. Choose an action based on observation
        action = PG.choose_action(observation)

        # 2. Take action in the environment
        observation_, reward, done, info = env.step(action)

        # 3. Store transition for training
        PG.store_transition(observation, action, reward)

        if done:
            episode_rewards_sum = sum(PG.episode_rewards)
            rewards.append(episode_rewards_sum)
            max_reward_so_far = np.amax(rewards)

            print("==========================================")
            print("Episode: ", episode)
            print("Reward: ", episode_rewards_sum)
def simulation():
    users_num = 1
    action_rewards = [10, 9, 1, 1, 1, 1, 1, 1, 1, 1]
    actions = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
    observations = [[random.randint(0, i * 10) for i in range(1, 4)]
                    for j in range(1, 101)]

    # number of items to recommend
    K = 2

    load_version = 1
    save_version = load_version + 1
    load_path = "output/weights/topk{}.ckpt".format(load_version)
    save_path = "output/weights/topk{}.ckpt".format(save_version)

    EPISODES = 5000
    RENDER_ENV = True
    rewards = []

    PG = PolicyGradient(n_x=len(observations[0]),
                        n_y=len(actions),
                        s0=observations[random.randint(0, len(observations) - 1)],
                        learning_rate=0.005,
                        reward_decay=1,
                        load_path=None,
                        save_path=save_path,
                        weight_capping_c=2**3,
                        k=K,
                        b_distribution='uniform')

    for episode in range(EPISODES):
        episode_reward = 0
        tic = time.clock()
        done = False

        while True:
            '''
            TODO: initialize the env
            '''
            if RENDER_ENV:
                observation = observations[random.randint(0, len(observations) - 1)]

            # 1. Choose an action based on observation
            # action = PG.uniform_choose_action(observation)
            action = PG.choose_action(observation)

            # 2. Take action in the environment
            observation_, reward = observations[random.randint(0, len(observations) - 1)], \
                action_rewards[action]

            # 4. Store transition for training
            PG.store_transition(observation, action, reward)

            toc = time.clock()
            elapsed_sec = toc - tic
            if elapsed_sec > 120:
                done = True
            if len(PG.episode_observations) > 100:
                done = True

            if done:
                episode_rewards_sum = sum(PG.episode_rewards)
                rewards.append(episode_rewards_sum)
                max_reward_so_far = np.amax(rewards)
                PG.cost_history.append(episode_rewards_sum)

                print("==========================================")
                print("Episode: ", episode)
                print("Seconds: ", elapsed_sec)
                print("Reward: ", episode_rewards_sum)
                print("Max reward so far: ", max_reward_so_far)
                # print(PG.outputs_softmax)
                print("distribution at {} is :{}".format(PG.s0, PG.get_distribution(PG.s0)))

                # 5. Train neural network
                discounted_episode_rewards_norm = PG.learn()
                break

            # Save new observation
            observation = observation_

    PG.plot_cost()

    plt.bar(actions, PG.get_distribution(PG.s0))
    plt.xlabel("action")        # x-axis label
    plt.ylabel("probability")   # y-axis label
    plt.title("top-k correction policy")
    plt.show()
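# The weight_capping_c and k arguments above suggest a top-K off-policy correction
# (capped importance weights multiplied by a top-K factor). A rough sketch of that
# per-sample weight, assuming hypothetical names pi_a / beta_a for the target and
# behaviour policy probabilities of the chosen action (not this PolicyGradient's API):
import numpy as np

def topk_corrected_weight(pi_a, beta_a, k, cap_c):
    importance_weight = np.minimum(pi_a / beta_a, cap_c)  # capped importance sampling
    lambda_k = k * (1.0 - pi_a) ** (k - 1)                # top-K correction multiplier
    return importance_weight * lambda_k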
env.seed(1)
env = env.unwrapped

PG = PolicyGradient(n_actions=env.action_space.n,
                    n_features=env.observation_space.shape[0],
                    lr=0.02,
                    gamma=0.99,
                    output_graph=FLAGS.output_graph)

for i in range(FLAGS.episode):
    s = env.reset()

    while True:
        if RENDER:
            env.render()

        action = PG.choose_action(s)
        s_, r, done, info = env.step(action)
        PG.store_transition(s_, action, r)

        if done:
            episode_rs_sum = sum(PG.ep_rs)

            if 'running_reward' not in globals():
                running_reward = episode_rs_sum
            else:
                running_reward = running_reward * 0.99 + episode_rs_sum * 0.01

            if running_reward > FLAGS.display_threshold:
                RENDER = True

            print('episode:', i, ' reward:', running_reward)

            norm_reward = PG.learn()

            if i == 30:
def simulation():
    users_num = 1
    '''
    action_rewards = {'11':4,'12':1,'13':1,'14':1,'21':1,'22':2,'23':3,'24':16,'31':1,'32':2,'33':3,'34':4}
    observation_action_transfer = {'11':[2],'12':[2],'13':[2],'14':[2],'21':[3],'22':[3],'23':[3],'24':[3],
                                   '31':[1],'32':[1],'33':[3],'34':[3]}
    actions = [1,2,3,4]
    observations = [[1],[2],[3]]
    '''
    action_rewards = {'11': 5, '12': 0, '13': 0, '14': 0, '15': 0, '16': 13,
                      '21': 10, '22': 0, '23': 0, '24': 0, '25': 0, '26': 8}
    observation_action_transfer = {'11': [1, 1], '12': [1, 1], '13': [1, 1], '14': [1, 1], '15': [1, 1], '16': [1, 1],
                                   '21': [1, 1], '22': [1, 1], '23': [1, 1], '24': [1, 1], '25': [1, 1], '26': [0, 1]}
    actions = [1, 2, 3, 4, 5, 6]
    observations = [[0, 1], [1, 1]]

    # number of items to recommend
    K = 2

    load_version = 4
    save_version = load_version + 1
    load_path = "output/weights/topk{}.ckpt".format(load_version)
    save_path = "output/weights/topk{}.ckpt".format(save_version)

    EPISODES = 3000
    RENDER_ENV = True
    rewards = []

    PG = PolicyGradient(n_x=len(observations[0]),
                        n_y=len(actions),
                        s0=observations[-1],
                        learning_rate=0.001,
                        reward_decay=1,
                        load_path=None,
                        save_path=save_path,
                        weight_capping_c=2**3,
                        k=K,
                        b_distribution='uniform')

    for episode in range(EPISODES):
        episode_reward = 0
        tic = time.clock()
        done = False

        while True:
            '''
            TODO: initialize the env
            '''
            if RENDER_ENV:
                observation = PG.episode_observations[-1]
                # print(observation)

            # 1. Choose an action based on observation
            # action = PG.uniform_choose_action(observation)
            action = PG.choose_action(observation)

            # 2. Take action in the environment
            observation_, reward = observation_action_transfer[str(sum(observation)) + str(actions[action])], \
                action_rewards[str(sum(observation)) + str(actions[action])]

            # 4. Store transition for training
            PG.store_transition(observation_, action, reward)
            # print(PG.episode_observations)
            # print(PG.episode_actions)
            # print(PG.episode_rewards)

            toc = time.clock()
            elapsed_sec = toc - tic
            if elapsed_sec > 120:
                done = True
            if len(PG.episode_observations) > 100:
                done = True

            if done:
                episode_rewards_sum = sum(PG.episode_rewards)
                rewards.append(episode_rewards_sum)
                max_reward_so_far = np.amax(rewards)
                PG.cost_history.append(episode_rewards_sum)

                print("==========================================")
                print("Episode: ", episode)
                print("Seconds: ", elapsed_sec)
                print("Reward: ", episode_rewards_sum)
                print("Max reward so far: ", max_reward_so_far)
                # print(PG.outputs_softmax)
                # print(PG.episode_rewards)

                # 5. Train neural network
                print("distribution at {} is :{}".format(observations[0], PG.get_distribution(observations[0])))
                print("distribution at {} is :{}".format(observations[1], PG.get_distribution(observations[1])))
                discounted_episode_rewards_norm = PG.learn()
                break

            # Save new observation
            observation = observation_

    PG.plot_cost()

    plt.bar(actions, PG.get_distribution(observations[0]))
    plt.xlabel("action at state [0, 1]")   # x-axis label
    plt.ylabel("probability")              # y-axis label
    plt.title("policy distribution at state [0, 1]")
    plt.show()

    plt.bar(actions, PG.get_distribution(observations[1]))
    plt.xlabel("action at state [1, 1]")   # x-axis label
    plt.ylabel("probability")              # y-axis label
    plt.title("policy distribution at state [1, 1]")
    plt.show()
def train(self, max_episode=10, max_path_length=200, verbose=0):
    env = self.env
    avg_reward_sum = 0.

    # f_eps = open("episode.csv", "w")
    # write_eps = csv.write(f_eps)

    for e in range(max_episode):
        env._reset()
        observation = env._reset()
        game_over = False
        reward_sum = 0

        inputs = []
        outputs = []
        predicteds = []
        rewards = []

        # f_iter = open("episode_{0}.csv".format(e), "w")
        # write_iter = csv.writer(f_iter)
        f_episode = "episode_{0}.csv".format(e)
        os.system("rm -rf {0}".format(f_episode))

        print(observation[0].shape, observation[1].shape)

        RL = PolicyGradient(
            n_actions=self.env.action_space.n,
            # n_features=observation.shape[0],
            learning_rate=0.02,
            reward_decay=0.995,
            # output_graph=True,
        )

        while not game_over:
            action, aprob = RL.choose_action(observation)

            inputs.append(observation)
            predicteds.append(aprob)

            y = np.zeros([self.env.action_space.n])
            y[action] = 1.
            outputs.append(y)

            observation, reward, actual_reward, game_over, info = self.env._step(action)
            reward_sum += float(actual_reward)
            # rewards.append(float(reward))
            rewards.append(float(reward_sum))

            RL.store_transition(observation, action, rewards)

            # check memory for RNN model
            if len(inputs) > self.max_memory:
                del inputs[0]
                del outputs[0]
                del predicteds[0]
                del rewards[0]

            if verbose > 0:
                if env.actions[action] == "LONG" or env.actions[action] == "SHORT":
                    # if env.actions[action] == "LONG" or env.actions[action] == "SHORT" or env.actions[action] == "HOLD":
                    color = bcolors.FAIL if env.actions[action] == "LONG" else bcolors.OKBLUE
                    print("%s:\t%s\t%.2f\t%.2f\t" % (info["dt"], color + env.actions[action] + bcolors.ENDC,
                                                     reward_sum, info["cum"]) +
                          ("\t".join(["%s:%.2f" % (l, i) for l, i in zip(env.actions, aprob.tolist())])))
                    # write_iter.writerow("%s:\t%s\t%.2f\t%.2f\t" % (info["dt"], env.actions[action], reward_sum, info["cum"]) + ("\t".join(["%s:%.2f" % (l, i) for l, i in zip(env.actions, aprob.tolist())])))
                    os.system("echo %s >> %s" % ("%s:\t%s\t%.2f\t%.2f\t" % (info["dt"], env.actions[action],
                                                                            reward_sum, info["cum"]) +
                                                 ("\t".join(["%s:%.2f" % (l, i)
                                                             for l, i in zip(env.actions, aprob.tolist())])),
                                                 f_episode))

        avg_reward_sum = avg_reward_sum * 0.99 + reward_sum * 0.01
        toPrint = "%d\t%s\t%s\t%.2f\t%.2f" % (e, info["code"],
                                              (bcolors.FAIL if reward_sum >= 0 else bcolors.OKBLUE) +
                                              ("%.2f" % reward_sum) + bcolors.ENDC,
                                              info["cum"], avg_reward_sum)
        print(toPrint)
        if self.history_filename != None:
            os.system("echo %s >> %s" % (toPrint, self.history_filename))

        discounted_rewards_ = RL.learn()  # train

        dim = len(inputs[0])
        inputs_ = [[] for i in range(dim)]
        for obs in inputs:
            for i, block in enumerate(obs):
                inputs_[i].append(block[0])
        inputs_ = [np.array(inputs_[i]) for i in range(dim)]

        outputs_ = np.vstack(outputs)
        predicteds_ = np.vstack(predicteds)
        rewards_ = np.vstack(rewards)

        print("shape: ", np.shape(rewards), np.shape(discounted_rewards_))

        # outputs_ *= discounted_rewards_
        for i, r in enumerate(zip(rewards, discounted_rewards_)):
            reward, discounted_reward = r

            if verbose > 1:
                # print (outputs_[i],)
                print(outputs_[i])

            if verbose > 0:
                print(predicteds_[i], outputs_[i], reward, discounted_reward)

        print("fit model input.shape %s, output.shape %s" %
              ([inputs_[i].shape for i in range(len(inputs_))], outputs_.shape))

        np.set_printoptions(linewidth=200, suppress=True)
        print("currentTargetIndex:", env.currentTargetIndex)
class runPG():
    n_inputs = 4
    n_outputs = 4    # right and left for each finger
    # n_outputs = 8  # right, left and stop for each finger

    net = 0
    X = 0
    A = np.array([[-1, -1], [-1, 1], [1, -1], [1, 1], [0, -1], [0, 1], [-1, 0], [1, 0]])
    mode = 5
    reward_mode = 2
    R = []

    gripper_closed = False
    stLearning = True
    possible_plot = False

    def __init__(self):
        rospy.init_node('runPG', anonymous=True)

        if self.mode == 5:
            self.n_inputs = 4
        if self.mode == 8:
            self.n_inputs = 8

        self.RL = PolicyGradient(
            n_actions=self.n_outputs,
            n_features=self.n_inputs,
            learning_rate=0.02,
            reward_decay=0.99,
            load_saved_net=False,
            # output_graph=True,
        )

        rospy.Subscriber('/RL/gripper_status', String, self.callbackGripperStatus)
        rospy.Service('/RL/net', net_eval, self.EvalNet)
        rospy.Service('/RL/start_learning', Empty, self.start_learning)
        obs_srv = rospy.ServiceProxy('/RL/observation', observation)
        drop_srv = rospy.ServiceProxy('/RL/IsObjDropped', IsDropped)
        move_srv = rospy.ServiceProxy('/RL/MoveGripper', TargetAngles)
        open_srv = rospy.ServiceProxy('/RL/OpenGripper', Empty)
        close_srv = rospy.ServiceProxy('/RL/CloseGripper', Empty)

        rospy.sleep(3)
        o = open_srv()

        episode_count = 0
        rate = rospy.Rate(15)  # 15hz
        while not rospy.is_shutdown():

            if self.stLearning:
                ## Start episode ##
                episode_count += 1

                # Close gripper
                raw_input("Place object between fingers and press Enter to close gripper...")
                close_srv()
                while not self.gripper_closed:
                    rate.sleep()
                raw_input("Remove table and press Enter to start episode...")

                # Get observation
                obs = np.array(obs_srv().state)

                self.VT = []
                while True:
                    # Choose action
                    action = self.RL.choose_action(obs)

                    # Act
                    suc = move_srv(self.A[action]).success
                    rospy.sleep(0.05)
                    rate.sleep()

                    if suc:
                        # Get observation
                        obs_ = np.array(obs_srv().state)
                        fail = drop_srv().dropped  # Check if dropped - end of episode
                    else:
                        # End episode if overload or angle limits reached
                        rospy.logerr('[RL] Failed to move gripper. Episode declared failed.')
                        fail = True

                    reward, done = self.transition_reward(obs_, fail)

                    self.RL.store_transition(obs, action, reward)

                    obs = obs_

                    if done:
                        ep_rs_sum = sum(self.RL.ep_rs)

                        # NOTE: running_reward is a local here, so unless a module-level
                        # variable with that name exists this check is always True and the
                        # running average is reset every episode.
                        if 'running_reward' not in globals():
                            running_reward = ep_rs_sum
                        else:
                            running_reward = running_reward * 0.99 + ep_rs_sum * 0.01

                        print("*** episode: " + str(episode_count) + ", episode reward: " + str(ep_rs_sum) +
                              ", running reward: " + str(int(running_reward)) + " ***")

                        vt = self.RL.learn()
                        self.R.append(running_reward)
                        self.possible_plot = True
                        break

                    rate.sleep()

            elif self.possible_plot:
                self.plot_sav()
                self.possible_plot = False

            # Open gripper
            if self.gripper_closed:
                o = open_srv()
                rospy.sleep(0.2)

            # self.stLearning = False
            # print(obs_srv().state)
            # rospy.spin()
            rate.sleep()

    def plot_sav(self):
        plt.plot(range(len(self.R)), self.R)  # plot the episode vt
        plt.xlabel('episode steps')
        plt.ylabel('normalized state-action value')
        plt.show()

    def EvalNet(self, msg):
        a = 0
        return {'action': a}

    def callbackGripperStatus(self, msg):
        self.gripper_closed = msg.data == "closed"

    def start_learning(self, msg):
        self.stLearning = not self.stLearning
        return EmptyResponse()

    def transition_reward(self, obs, fail):
        # Keep moving as much as possible
        if self.reward_mode == 1:
            if fail:
                reward = 0.
            else:
                reward = 1.
            done = fail

        # Get to a certain coordinate
        if self.reward_mode == 2:
            if fail:
                reward = -3.
            else:
                reward = -1.
            done = fail
            if obs[0] > 135.:
                raw_input('Reached goal, x = %f.' % obs[0])
                reward = 5.
                done = True

        return reward, done
n_actions = env.action_space.n
n_features = env.observation_space.shape[0]

RL = PolicyGradient(n_actions=n_actions,
                    n_features=n_features,
                    learning_rate=0.02,
                    reward_decay=0.99)

for i_episode in range(3000):
    observation = env.reset()  # cart position, pole angle, cart velocity, pole angular velocity

    while True:
        if RENDER:
            env.render()

        action = RL.choose_action(observation)

        observation_, reward, done, info = env.step(action)

        RL.store_transition(observation, action, reward)

        if done:
            ep_rs_sum = sum(RL.ep_rs)

            if 'running_reward' not in globals():
                running_reward = ep_rs_sum
            else:
                running_reward = running_reward * 0.99 + ep_rs_sum * 0.01

            if running_reward > DISPLAY_REWARD_THRESHOLD:
                RENDER = True  # rendering

            time.sleep(2)
RL = PolicyGradient(
    s_dim=env.observation_space.shape[0],
    a_dim=env.action_space.n,
    learning_rate=0.02,
    reward_decay=0.99,
    # output_graph=True,
)

for i_episode in range(3000):
    s = env.reset()

    while True:
        if RENDER:
            env.render()

        a = RL.choose_action(s)

        s_, r, done, info = env.step(a)

        RL.store_transition(s, a, r)

        if done:
            ep_rs_sum = sum(RL.ep_rs)

            if 'running_reward' not in globals():
                running_reward = ep_rs_sum
            else:
                running_reward = running_reward * 0.99 + ep_rs_sum * 0.01

            if running_reward > DISPLAY_REWARD_THRESHOLD:
                RENDER = True

            print('episode:', i_episode, "reward:", int(running_reward))
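# The CartPole loops above reference env, RENDER and DISPLAY_REWARD_THRESHOLD without
# showing how they are created. A typical preamble under the classic gym API (the
# concrete values are assumptions, not taken from the original snippets) would be:
import gym

DISPLAY_REWARD_THRESHOLD = 400  # start rendering once the running reward passes this
RENDER = False                  # rendering slows training, so it starts disabled

env = gym.make('CartPole-v0')
env.seed(1)          # reproducible episodes
env = env.unwrapped  # remove the built-in episode step limit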
episode_reward = 0

h = 5
l = 1
pizza_lines = ["TMMMTTT", "MMMMTMM", "TTMTTMT", "TMMTMMM", "TTTTTTM", "TTTTTTM"]

pizza_config = {
    'pizza_lines': pizza_lines,
    'r': R,
    'c': C,
    'l': l,
    'h': h
}

state = env.init(pizza_config)[0]
print("\nPIZZA CONFIG: ", pizza_config)
print("\nSTATE: ", state)
print("\n\nSTATE[0]", state[0])
# state[0]  # get only first value of tuple

for step in range(STEPS):
    if RENDER_ENV:
        env.render()

    # sample one action with the given probability distribution
    # 1. Choose an action based on observation
    action = PG.choose_action(state)

    # 2. Take action in the environment
    state_, reward, done, info = env.step(ACTIONS[action])

    # 3. Store transition for training
    PG.store_transition(preprocess(state), action, reward)

    # Save new state
    # state = state_

    if done:
        episode_rewards_sum = sum(PG.episode_rewards)
        rewards.append(episode_rewards_sum)
        max_reward_so_far = np.amax(rewards)

        print("==========================================")
class runPG():
    n_inputs = 4
    # n_outputs = 4  # right and left for each finger
    n_outputs = 8    # right, left and stop for each finger

    max_episodes = 1200
    max_steps = 2500

    net = 0
    X = 0
    A = np.array([[-1, -1], [1, -1], [-1, 1], [1, 1], [0, -1], [0, 1], [-1, 0], [1, 0]])
    mode = 5
    reward_mode = 3
    R = []
    g = np.array([-35.0, 104.0], dtype='f')  # Goal

    gripper_closed = False
    stLearning = True  # Enable learning
    possible_plot = False

    # For reward mode 3
    prev_dis2goal = 1e9

    def __init__(self):
        rospy.init_node('runPG', anonymous=True)

        if self.mode == 5:
            self.n_inputs = 4
        if self.mode == 8:
            self.n_inputs = 8

        self.RL = PolicyGradient(
            n_actions=self.n_outputs,
            n_features=self.n_inputs,
            learning_rate=0.001,
            reward_decay=0.98,
            load_saved_net=True,
            # output_graph=True,
        )

        rospy.Subscriber('/RL/gripper_status', String, self.callbackGripperStatus)
        rospy.Service('/RL/net', net_eval, self.EvalNet)
        rospy.Service('/RL/start_learning', Empty, self.start_learning)
        obs_srv = rospy.ServiceProxy('/RL/observation', observation)
        drop_srv = rospy.ServiceProxy('/RL/IsObjDropped', IsDropped)
        move_srv = rospy.ServiceProxy('/RL/MoveGripper', TargetAngles)
        reset_srv = rospy.ServiceProxy('/RL/ResetGripper', Empty)
        pub_goal = rospy.Publisher('/RL/Goal', Float32MultiArray, queue_size=10)

        gg = Float32MultiArray()
        gg.data = self.g

        episode_count = 0
        rate = rospy.Rate(100)  # 100hz
        while not rospy.is_shutdown():

            if self.stLearning:
                ## Start episode ##
                episode_count += 1
                self.prev_dis2goal = 1e9

                # Set gripper
                reset_srv()
                while not self.gripper_closed:
                    rate.sleep()

                # Get observation
                obs = np.array(obs_srv().state)

                self.VT = []
                step = 0
                while True:
                    step += 1
                    print('[RL] Step %d in episode %d, distance to goal: %f.' % (step, episode_count, self.prev_dis2goal))

                    pub_goal.publish(gg)

                    # Choose action
                    action = self.RL.choose_action(obs)

                    # Act
                    suc = move_srv(self.A[action]).success
                    rospy.sleep(0.05)
                    rate.sleep()

                    if suc:
                        # Get observation
                        obs_ = np.array(obs_srv().state)
                        fail = drop_srv().dropped  # Check if dropped - end of episode
                    else:
                        # End episode if overload or angle limits reached
                        rospy.logerr('[RL] Failed to move gripper. Episode declared failed.')
                        fail = True

                    reward, done = self.transition_reward(obs_, fail)

                    self.RL.store_transition(obs, action, reward)

                    obs = obs_

                    if step > self.max_steps:
                        done = True

                    if done:
                        ep_rs_sum = sum(self.RL.ep_rs)

                        if 'running_reward' not in globals():
                            running_reward = ep_rs_sum
                        else:
                            running_reward = running_reward * 0.99 + ep_rs_sum * 0.01

                        print("*** episode: " + str(episode_count) + ", episode reward: " + str(ep_rs_sum) +
                              ", running reward: " + str(int(running_reward)) + " ***")

                        vt = self.RL.learn()
                        self.R.append(running_reward)
                        self.possible_plot = True
                        break

                    rate.sleep()

            elif self.possible_plot:
                self.plot_sav()
                self.possible_plot = False

            if self.max_episodes < episode_count:
                self.plot_sav()
                break

            rate.sleep()

    def plot_sav(self):
        plt.plot(range(len(self.R)), self.R)  # plot the episode vt
        plt.xlabel('episode steps')
        plt.ylabel('normalized state-action value')
        plt.show()

    def EvalNet(self, msg):
        a = 0
        return {'action': a}

    def callbackGripperStatus(self, msg):
        self.gripper_closed = msg.data == "closed"

    def start_learning(self, msg):
        self.stLearning = not self.stLearning
        return EmptyResponse()

    def transition_reward(self, obs, fail):
        # Keep moving as much as possible
        if self.reward_mode == 1:
            if fail:
                reward = 0.
            else:
                reward = 1.
            done = fail

        # Cross a line
        if self.reward_mode == 2:
            if fail:
                reward = -3.
            else:
                reward = -1.
            done = fail
            if obs[0] > 40.:
                print('Reached goal, x = %f.' % obs[0])
                reward = 5.
                done = True

        # Get to a certain coordinate
        if self.reward_mode == 3:
            d = np.linalg.norm(self.g - obs[:2])
            if fail or d > self.prev_dis2goal:
                reward = 0.
            else:
                reward = 1.
            done = fail
            if d < 5:
                print('Reached goal, (x,y) = (%f,%f).' % (obs[0], obs[1]))
                reward = 50.
                done = True
            self.prev_dis2goal = d

        return reward, done