params = {
    # ... (earlier hyperparameters truncated in the source)
    'batch_size': 32,
    'state_space_dim': env.state_dim,
    'action_space_dim': env.action_dim,
}
agent = Agent(**params)

score = []
mean = []

for episode in range(1000):
    s0 = env.reset()
    total_reward = 1
    while True:
        env.render()
        a0 = agent.act(s0)
        s1, r1, done = env.step(a0)
        if done:
            r1 = -1                     # penalize the terminal step
        agent.put(s0, a0, r1, s1)
        if done:
            break
        total_reward += r1
        s0 = s1
        agent.learn()
    score.append(total_reward)
    # running mean over up to the last 100 episodes
    mean.append(sum(score[-100:]) / len(score[-100:]))
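# The loop above assumes an Agent class exposing act / put / learn. A minimal
# sketch of that interface follows: the class name, constructor parameters, and
# method signatures come from the loop itself, while the replay buffer,
# epsilon-greedy policy, and linear Q-function are illustrative assumptions,
# not the original network.
import random
from collections import deque

import numpy as np

class Agent:
    def __init__(self, batch_size, state_space_dim, action_space_dim,
                 gamma=0.99, epsilon=0.1, lr=1e-3, capacity=10000):
        self.batch_size = batch_size
        self.n_actions = action_space_dim
        self.gamma, self.epsilon, self.lr = gamma, epsilon, lr
        self.buffer = deque(maxlen=capacity)                    # replay memory
        self.W = np.zeros((state_space_dim, action_space_dim))  # linear Q-weights

    def act(self, s):
        # epsilon-greedy over the linear Q-values
        if random.random() < self.epsilon:
            return random.randrange(self.n_actions)
        return int(np.argmax(np.asarray(s) @ self.W))

    def put(self, s0, a0, r1, s1):
        # store one transition in the replay buffer
        self.buffer.append((np.asarray(s0, dtype=float), a0, r1,
                            np.asarray(s1, dtype=float)))

    def learn(self):
        # one TD(0) update per sampled transition; a real DQN would use a
        # deep network, a target network, and a vectorized batch update
        if len(self.buffer) < self.batch_size:
            return
        for s0, a0, r1, s1 in random.sample(self.buffer, self.batch_size):
            target = r1 + self.gamma * np.max(s1 @ self.W)
            td_error = target - (s0 @ self.W)[a0]
            self.W[:, a0] += self.lr * td_error * s0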
###############################################
from env import ArmEnv
from rl import DDPG

# global variables
MAX_EPISODES = 500
MAX_EP_STEPS = 200

# set up the environment
env = ArmEnv()
s_dim = env.state_dim
a_dim = env.action_dim
a_bound = env.action_bound

# set up the RL method
rl = DDPG(a_dim, s_dim, a_bound)

# start training
for i in range(MAX_EPISODES):
    s = env.reset()
    for j in range(MAX_EP_STEPS):
        env.render()
        a = rl.choose_action(s)
        s_, r, done = env.step(a)
        rl.store_transition(s, a, r, s_)
        if rl.memory_full:
            # start learning once the memory has been filled
            rl.learn()
        s = s_
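# The script above relies on ArmEnv and DDPG from the tutorial's env.py and
# rl.py. A sketch of the DDPG interface it assumes follows; the ring-buffer
# memory and the memory_full flag mirror how the class is used above, while
# the random placeholder policy and the buffer capacity are assumptions
# standing in for the actor-critic networks of the real implementation.
import numpy as np

class DDPG:
    def __init__(self, a_dim, s_dim, a_bound, capacity=30000):
        self.a_dim, self.s_dim, self.a_bound = a_dim, s_dim, a_bound
        self.capacity = capacity
        # one row per transition: [s, a, r, s_]
        self.memory = np.zeros((capacity, s_dim * 2 + a_dim + 1),
                               dtype=np.float32)
        self.pointer = 0
        self.memory_full = False  # flips True once the buffer has wrapped

    def choose_action(self, s):
        # placeholder: random action in [-1, 1]; the real implementation
        # queries the actor network and scales by a_bound
        return np.random.uniform(-1.0, 1.0, self.a_dim)

    def store_transition(self, s, a, r, s_):
        # overwrite the oldest row once the buffer is full (ring buffer)
        self.memory[self.pointer % self.capacity] = np.hstack((s, a, [r], s_))
        self.pointer += 1
        if self.pointer >= self.capacity:
            self.memory_full = True

    def learn(self):
        # sample a random batch and update actor/critic; omitted in this sketch
        pass

# Note the design choice the training loop depends on: learning only starts
# once the memory has wrapped, so every update draws from a full, well-mixed
# buffer of decorrelated transitions.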