def test_readme(self):
    environment = UnittestEnvironment(
        states=dict(type='float', shape=(10,)),
        actions=dict(type='int', num_values=5)
    )

    def get_current_state():
        return environment.reset()

    def execute_decision(x):
        return environment.execute(actions=x)[2]

    # Instantiate a Tensorforce agent
    agent = PPOAgent(
        states=dict(type='float', shape=(10,)),
        actions=dict(type='int', num_values=5),
        memory=10000,
        network='auto',
        update_mode=dict(unit='episodes', batch_size=10),
        step_optimizer=dict(type='adam', learning_rate=1e-4)
    )

    # Initialize the agent
    agent.initialize()

    # Retrieve the latest (observable) environment state
    state = get_current_state()  # (float array of shape [10])

    # Query the agent for its action decision
    action = agent.act(states=state)  # (scalar between 0 and 4)

    # Execute the decision and retrieve the current performance score
    reward = execute_decision(action)  # (any scalar float)

    # Pass feedback about performance (and termination) to the agent
    agent.observe(reward=reward, terminal=False)

    agent.close()
    environment.close()

    self.assertTrue(expr=True)
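# The test above performs one act/observe step; in a full episode the same
# calls repeat until the environment signals termination. A minimal sketch,
# assuming an `environment` and `agent` like the ones above and the
# (states, terminal, reward) return order implied by
# environment.execute(actions=x)[2]. The run_episode helper is illustrative,
# not part of the test suite:

def run_episode(agent, environment):
    # Drive one episode with the act/observe cycle from the README example
    states = environment.reset()
    terminal = False
    episode_reward = 0.0
    while not terminal:
        action = agent.act(states=states)
        states, terminal, reward = environment.execute(actions=action)
        agent.observe(reward=reward, terminal=terminal)
        episode_reward += reward
    return episode_reward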
        actions={
            "up": dict(type="float", min_value=0.0, max_value=1.0),
            "down": dict(type="float", min_value=0.0, max_value=1.0),
            "left": dict(type="float", min_value=0.0, max_value=1.0),
            "right": dict(type="float", min_value=0.0, max_value=1.0),
        },
        network='auto',
        memory=10000,
    )
else:
    print("Available agents: vpg, ppo, dqn")
    exit()

print("agent ready", agent)

agent.initialize()  # Set up base of agent

try:
    # Looks to see if a saved model is available and loads it
    lastEpoch = int(os.listdir(tmp + "/saved/player_pun/" + args.agent)[2].split("-")[0])
    agent.restore(directory=tmp + "/saved/player_pun/" + args.agent)
    print("restored")
except Exception:
    # Starts fresh if no saved model is available
    print("DID NOT RESTORE")
    lastEpoch = 0

epochs = 2000000
for epoch in tqdm(range(lastEpoch, epochs + 1)):
    # print(epoch)
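    # Sketch (an assumption, not from the original script): the restore block
    # above parses lastEpoch out of a checkpoint filename of the form
    # "<epoch>-...", so the loop body presumably checkpoints under that naming
    # scheme. agent.save is assumed to accept directory/filename keywords
    # mirroring agent.restore; exact signatures vary across Tensorforce
    # versions, and the save interval here is arbitrary.
    if epoch % 1000 == 0:
        agent.save(directory=tmp + "/saved/player_pun/" + args.agent,
                   filename=str(epoch))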
"user": dict(type="int", num_values=G.graph.shape[0]), "item": dict(type="int", num_values=G.graph.shape[1]) }, network=[ dict(type='flatten'), dict(type="dense", size=32), ], memory=10000, ) print("agent ready", agent) if args.process == "train": new_agent = copy.deepcopy(agent) agent.initialize() try: lastEpoch = int(os.listdir("saved/" + args.agent)[2].split("-")[0]) agent.restore(directory="saved/" + args.agent + "/" + args.contrarian) print("restored") except: lastEpoch = 0 epochs = 100000 cluster_vals = [] for epoch in tqdm(range(lastEpoch, epochs)): G = Audience(20, 15) #20 reccomendations for every user
class SerpentPPO:

    def __init__(self, frame_shape=None, game_inputs=None):
        if frame_shape is None:
            raise SerpentError("A 'frame_shape' tuple kwarg is required...")

        states_spec = {"type": "float", "shape": frame_shape}

        if game_inputs is None:
            raise SerpentError("A 'game_inputs' dict kwarg is required...")

        self.game_inputs = game_inputs
        self.game_inputs_mapping = self._generate_game_inputs_mapping()
        print('game inputs mapping:')
        print(self.game_inputs_mapping)

        actions_spec = {"type": "int", "num_values": len(self.game_inputs)}

        summary_spec = {
            "directory": "./board/",
            "steps": 50,
            "labels": [
                "configuration", "gradients_scalar", "regularization",
                "inputs", "losses", "variables"
            ]
        }

        network_spec = [
            {"type": "conv2d", "size": 16, "window": 8, "stride": 4},
            {"type": "conv2d", "size": 32, "window": 4, "stride": 2},
            {"type": "conv2d", "size": 32, "window": 3, "stride": 1},
            {"type": "flatten"},
            {"type": "dense", "size": 64}
        ]

        baseline_spec = {
            "type": "cnn",
            "conv_sizes": [32, 32],
            "dense_sizes": [32]
        }

        saver_spec = {
            "directory": os.path.join(os.getcwd(), "datasets", "t4androidmodel"),
            "seconds": 120
        }

        # memory_spec = {'type': 'latest', 'include_next_states': False, 'capacity': 1000 * 1000}

        self.agent = PPOAgent(
            states=states_spec,
            actions=actions_spec,
            network=network_spec,
            # baseline_mode='states',
            # baseline=baseline_spec,
            summarizer=summary_spec,
            memory=10,
            update_mode=dict(unit='timesteps', batch_size=2),
            discount=0.97,
            saver=saver_spec
        )

        self.agent.initialize()

        # Leftover kwargs from an older version of this PPOAgent call, kept
        # commented out for reference:
        # batched_observe=2560,
        # scope="ppo",
        # summarizer=summary_spec,
        # network=network_spec,
        # device=None,
        # session_config=None,
        # saver_spec=None,
        # distributed_spec=None,
        # discount=0.97,
        # variable_noise=None,
        # states_preprocessing_spec=None,
        # explorations_spec=None,
        # reward_preprocessing_spec=None,
        # distributions_spec=None,
        # entropy_regularization=0.01,
        # batch_size=2560,
        # keep_last_timestep=True,
        # baseline_mode=None,
        # baseline=None,
        # baseline_optimizer=None,
        # gae_lambda=None,
        # likelihood_ratio_clipping=None,
        # step_optimizer=None,
        # optimization_steps=10

    def generate_action(self, game_frame_buffer):
        # Stack the buffered frames along the channel axis and query the agent
        states = np.stack(game_frame_buffer, axis=2)
        action = self.agent.act(states)
        label = self.game_inputs_mapping[action]
        return action, label, self.game_inputs[label]

    def observe(self, reward=0, terminal=False):
        self.agent.observe(reward=reward, terminal=terminal)

    def _generate_game_inputs_mapping(self):
        # Map consecutive action indices to the game input labels
        mapping = dict()
        for index, key in enumerate(self.game_inputs):
            mapping[index] = key
        return mapping
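# A minimal usage sketch for SerpentPPO. Everything below is illustrative:
# FRAME_SHAPE, GAME_INPUTS, and the zero-filled frame buffer are hypothetical
# stand-ins for whatever the surrounding Serpent.AI game agent plugin provides.

import numpy as np

FRAME_SHAPE = (100, 100, 4)                    # hypothetical: 4 stacked 100x100 frames
GAME_INPUTS = {"JUMP": ["SPACE"], "NOOP": []}  # hypothetical label -> key list mapping

ppo = SerpentPPO(frame_shape=FRAME_SHAPE, game_inputs=GAME_INPUTS)

# generate_action stacks the buffer along axis 2, so 4 frames of shape
# (100, 100) match FRAME_SHAPE above
frame_buffer = [np.zeros((100, 100)) for _ in range(4)]
action, label, game_input = ppo.generate_action(frame_buffer)
# ... send `game_input` to the game, measure a reward from the outcome ...
ppo.observe(reward=0.0, terminal=False)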