Example #1
def make_reward_table(options):
    """Makes .csv table with rewards for all contracts."""
    print('Generating reward table')
    locations_dict = {loc.name: loc for loc in LOCATIONS}
    rows = [['Class', 'Departure', 'Destination', 'Distance', 'Min reward', 'Max reward']]
    for route, contract in ROUTES.items():
        contract.set_locations(locations_dict[route[0]], locations_dict[route[1]])
        advance_funds, reward_funds, _, _ = contract.get_rewards()

        if options.verbose > 0:
            print('Calculating reward for {}'.format(contract))
        reward_str = '{} + ({} + {}) * Random(1.0, 1.15)'.format(
            advance_funds, reward_funds, contract.refund_amount,
        )
        min_reward = utils.calculate_reward(contract, reward_str, calc_min=True)
        max_reward = utils.calculate_reward(contract, reward_str, calc_min=False)

        rows.append([
            contract.__class__.__name__,
            contract.from_loc.name,
            contract.to_loc.name,
            str(round(utils.loc_distance(contract.from_loc, contract.to_loc), 2)),
            str(min_reward),
            str(max_reward),
        ])
    if options.verbose > 1:
        print('Writing file Rewards.csv')
    with open('Rewards.csv', 'w') as out:
        out.write('\n'.join([','.join(row) for row in rows]) + '\n')
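The helper that turns the reward expression into min/max values is not shown above. A minimal sketch of what utils.calculate_reward could look like, assuming the Random(low, high) factor is simply pinned to its lower or upper bound; the regex- and eval-based evaluation below is illustrative, not the repository's actual code:

import re

def calculate_reward(contract, reward_str, calc_min=True):
    """Evaluate a reward expression, pinning Random(low, high) to one bound."""
    def pick_bound(match):
        low, high = match.group(1), match.group(2)
        return low if calc_min else high
    # Replace the Random(...) factor with its lower (min) or upper (max) bound.
    expr = re.sub(r'Random\(([\d.]+),\s*([\d.]+)\)', pick_bound, reward_str)
    # The remaining expression contains only numbers and arithmetic operators.
    return round(eval(expr), 2)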
Example #2
    def run_test_rewards(self):
        if not self.prop_id:
            # run the proposal scenario first
            self.run_test_proposal()

        debate_secs = 15
        self.create_js_file(
            'rewards',
            {
                "dao_abi": self.dao_abi,
                "dao_address": self.dao_addr,
                "total_rewards": self.args.total_rewards,
                "proposal_deposit": self.args.proposal_deposit,
                "transaction_bytecode": '0x0',  # fallback function
                "debating_period": debate_secs,
                "prop_id": self.next_proposal_id()
            }
        )
        print(
            "Notice: Debate period is {} seconds, so the test will wait "
            "that long".format(debate_secs)
        )
        output = self.run_script('rewards.js')
        results = eval_test('rewards', output, {
            "provider_reward_portion": calculate_reward(
                self.token_amounts[0],
                self.total_supply,
                self.args.total_rewards)
        })
        self.dao_balance_after_rewards = results['DAO_balance']
        self.dao_rewardToken_after_rewards = results['DAO_rewardToken']
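The test compares the script output against calculate_reward, which is not shown here. A plausible sketch, assuming the reward portion is the holder's proportional share of the total rewards; the integer division is an assumption meant to mirror on-chain arithmetic, and the repository's exact rounding may differ:

def calculate_reward(token_amount, total_supply, total_rewards):
    # Proportional share of the rewards for a holder of `token_amount` tokens.
    return token_amount * total_rewards // total_supply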
Example #3
def make_reward_table(options):
    """Makes .csv table with rewards for all contracts."""
    print('Generating reward table')
    locations_dict = {loc.name: loc for loc in LOCATIONS}
    rows = [[
        'Class', 'Departure', 'Destination', 'Distance', 'Min reward',
        'Max reward'
    ]]
    for route, contract in ROUTES.items():
        contract.set_locations(locations_dict[route[0]],
                               locations_dict[route[1]])
        advance_funds, reward_funds, _, _ = contract.get_rewards()

        if options.verbose > 0:
            print('Calculating reward for {}'.format(contract))
        reward_str = '{} + ({} + {}) * Random(1.0, 1.15)'.format(
            advance_funds,
            reward_funds,
            contract.refund_amount,
        )
        min_reward = utils.calculate_reward(contract,
                                            reward_str,
                                            calc_min=True)
        max_reward = utils.calculate_reward(contract,
                                            reward_str,
                                            calc_min=False)

        rows.append([
            contract.__class__.__name__,
            contract.from_loc.name,
            contract.to_loc.name,
            str(
                round(utils.loc_distance(contract.from_loc, contract.to_loc),
                      2)),
            str(min_reward),
            str(max_reward),
        ])
    if options.verbose > 1:
        print('Writing file Rewards.csv')
    with open('Rewards.csv', 'w') as out:
        out.write('\n'.join([','.join(row) for row in rows]) + '\n')
Example #4
    def process_graph(self, graph_path, batch_loss):
        """
        Read a graph and do a forward pass on it with a time budget.
        :param graph_path: Location of the graph to process.
        :param batch_loss: Loss on the graphs processed so far in the batch.
        :return batch_loss: Incremented loss on the current batch being processed.
        """
        data = json.load(open(graph_path))
        graph, features = create_features(data, self.model.identifiers)
        node = random.choice(list(graph.nodes()))
        attention_loss = 0
        for t in range(self.args.time):
            # One attention step: the model picks the next node and emits predictions.
            predictions, node, attention_score = self.model(data, graph, features, node)
            target, prediction_loss = calculate_predictive_loss(data, predictions)
            batch_loss = batch_loss + prediction_loss
            if t < self.args.time - 2:
                # Accumulate discounted log attention scores for the policy term.
                attention_loss += (self.args.gamma ** (self.args.time - t)) * torch.log(attention_score)
        # Scale the attention term by the prediction-based reward and fold it into the loss.
        reward = calculate_reward(target, predictions)
        batch_loss = batch_loss - reward * attention_loss
        self.model.reset_attention()
        return batch_loss
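Here calculate_reward turns prediction quality into a scalar that scales the attention term. A minimal sketch, assuming a +1/-1 reward for a correct/incorrect class prediction; the actual reward shaping in the source may differ:

import torch

def calculate_reward(target, predictions):
    # +1 if the predicted class matches the target, -1 otherwise (assumed shaping).
    predicted_class = torch.argmax(predictions, dim=-1)
    return 1.0 if int(predicted_class) == int(target) else -1.0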
Example #5
def ppo_train(model_name, load_model=False, actor_filename=None, critic_filename=None, optimizer_filename=None):
    print("PPO -- Training")

    env = make('hungry_geese')
    trainer = env.train(['greedy', None, 'agents/boilergoose.py', 'agents/handy_rl.py'])

    agent = PPOAgent(rows=11, columns=11, num_actions=3)
    memory = Memory()

    if load_model:
        agent.load_model_weights(actor_filename, critic_filename)
        agent.load_optimizer_weights(optimizer_filename)

    episode = 0
    start_episode = 0
    end_episode = 50000
    reward_threshold = None
    threshold_reached = False
    epochs = 4
    batch_size = 128
    current_frame = 0

    training_rewards = []
    evaluation_rewards = []
    last_1000_ep_reward = []

    for episode in range(start_episode + 1, end_episode + 1):
        obs_dict = trainer.reset()
        ep_reward, ep_steps, done = 0, 0, False
        prev_direction = 0

        while not done:
            current_frame += 1
            ep_steps += 1

            state = preprocess_state(obs_dict, prev_direction)
            action = agent.select_action(state, training=True)
            direction = get_direction(prev_direction, action)
            next_obs_dict, _, done, _ = trainer.step(env.specification.action.enum[direction])
            reward = calculate_reward(obs_dict, next_obs_dict)
            next_state = preprocess_state(next_obs_dict, direction)
            memory.add(state, action, reward, next_state, float(done))

            obs_dict = next_obs_dict
            prev_direction = direction

            ep_reward += reward

            if current_frame % batch_size == 0:
                for _ in range(epochs):
                    states, actions, rewards, next_states, dones = memory.get_all_samples()
                    agent.fit(states, actions, rewards, next_states, dones)
                memory.clear()
                agent.update_networks()

        print("EPISODE " + str(episode) + " - REWARD: " + str(ep_reward) + " - STEPS: " + str(ep_steps))

        if len(last_1000_ep_reward) == 1000:
            last_1000_ep_reward = last_1000_ep_reward[1:]
        last_1000_ep_reward.append(ep_reward)

        if reward_threshold:
            if len(last_1000_ep_reward) == 1000:
                if np.mean(last_1000_ep_reward) >= reward_threshold:
                    print("You solved the task after" + str(episode) + "episodes")
                    agent.save_model_weights('models/ppo_actor_' + model_name + '_' + str(episode) + '.h5',
                                             'models/ppo_critic_' + model_name + '_' + str(episode) + '.h5')
                    threshold_reached = True
                    break

        if episode % 1000 == 0:
            print('Episode ' + str(episode) + '/' + str(end_episode))

            last_1000_ep_reward_mean = np.mean(last_1000_ep_reward).round(3)
            training_rewards.append(last_1000_ep_reward_mean)
            print('Average reward in last 1000 episodes: ' + str(last_1000_ep_reward_mean))
            print()

        if episode % 1000 == 0:
            eval_reward = 0
            for i in range(100):
                obs_dict = trainer.reset()
                done = False
                prev_direction = 0
                while not done:
                    state = preprocess_state(obs_dict, prev_direction)
                    action = agent.select_action(state)
                    direction = get_direction(prev_direction, action)
                    next_obs_dict, _, done, _ = trainer.step(env.specification.action.enum[direction])
                    reward = calculate_reward(obs_dict, next_obs_dict)
                    obs_dict = next_obs_dict
                    prev_direction = direction
                    eval_reward += reward
            eval_reward /= 100
            evaluation_rewards.append(eval_reward)
            print("Evaluation reward: " + str(eval_reward))
            print()

        if episode % 5000 == 0:
            agent.save_model_weights('models/ppo_actor_' + model_name + '_' + str(episode) + '.h5',
                                     'models/ppo_critic_' + model_name + '_' + str(episode) + '.h5')
            agent.save_optimizer_weights('models/ppo_' + model_name + '_' + str(episode) + '_optimizer.npy')

    agent.save_model_weights('models/ppo_actor_' + model_name + '_' + str(end_episode) + '.h5',
                             'models/ppo_critic_' + model_name + '_' + str(end_episode) + '.h5')
    agent.save_optimizer_weights('models/ppo_' + model_name + '_' + str(end_episode) + '_optimizer.npy')

    if threshold_reached:
        plt.plot([i for i in range(start_episode + 1000, episode, 1000)], training_rewards)
    else:
        plt.plot([i for i in range(start_episode + 1000, end_episode + 1, 1000)], training_rewards)
    plt.title("Reward")
    plt.show()

    plt.plot([i for i in range(start_episode + 1000, end_episode + 1, 1000)], evaluation_rewards)
    plt.title('Evaluation rewards')
    plt.show()
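The PPO loop above shapes its own reward from consecutive hungry_geese observations. A minimal sketch of such a calculate_reward, assuming the agent is rewarded for surviving and for growing; the constants and the exact shaping are assumptions, not the author's definition:

def calculate_reward(obs_dict, next_obs_dict):
    index = obs_dict['index']                      # this agent's goose
    old_length = len(obs_dict['geese'][index])
    new_length = len(next_obs_dict['geese'][index])
    if new_length == 0:                            # the goose died on this step
        return -1.0
    # Small survival bonus plus the change in length (food eaten / body lost).
    return 0.1 + (new_length - old_length)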
Example #6
def ddqn_train(model_name,
               load_model=False,
               model_filename=None,
               optimizer_filename=None):
    print("DDQN -- Training")

    env = make('hungry_geese')
    trainer = env.train(
        ['greedy', None, 'agents/boilergoose.py', 'agents/handy_rl.py'])

    agent = DDQNAgent(rows=11, columns=11, num_actions=3)
    buffer = ReplayBuffer()
    strategy = EpsilonGreedyStrategy(start=0.5, end=0.0, decay=0.00001)

    if load_model:
        agent.load_model_weights(model_filename)
        agent.load_optimizer_weights(optimizer_filename)

    start_episode = 0
    end_episode = 50000
    epochs = 32
    batch_size = 128

    training_rewards = []
    evaluation_rewards = []
    last_1000_ep_reward = []

    for episode in range(start_episode + 1, end_episode + 1):
        obs_dict = trainer.reset()
        epsilon = strategy.get_epsilon(episode - start_episode)
        ep_reward, ep_steps, done = 0, 0, False
        prev_direction = 0

        while not done:
            ep_steps += 1

            state = preprocess_state(obs_dict, prev_direction)
            action = agent.select_epsilon_greedy_action(state, epsilon)
            direction = get_direction(prev_direction, action)
            next_obs_dict, _, done, _ = trainer.step(
                env.specification.action.enum[direction])
            reward = calculate_reward(obs_dict, next_obs_dict)
            next_state = preprocess_state(next_obs_dict, direction)
            buffer.add(state, action, reward, next_state, done)

            obs_dict = next_obs_dict
            prev_direction = direction

            ep_reward += reward

        if len(buffer) >= batch_size:
            for _ in range(epochs):
                states, actions, rewards, next_states, dones = buffer.get_samples(
                    batch_size)
                agent.fit(states, actions, rewards, next_states, dones)

        print("EPISODE " + str(episode) + " - REWARD: " + str(ep_reward) +
              " - STEPS: " + str(ep_steps))

        if len(last_1000_ep_reward) == 1000:
            last_1000_ep_reward = last_1000_ep_reward[1:]
        last_1000_ep_reward.append(ep_reward)

        if episode % 10 == 0:
            agent.update_target_network()

        if episode % 1000 == 0:
            print('Episode ' + str(episode) + '/' + str(end_episode))
            print('Epsilon: ' + str(round(epsilon, 3)))

            last_1000_ep_reward_mean = np.mean(last_1000_ep_reward).round(3)
            training_rewards.append(last_1000_ep_reward_mean)
            print('Average reward in last 1000 episodes: ' +
                  str(last_1000_ep_reward_mean))
            print()

        if episode % 1000 == 0:
            eval_reward = 0
            for i in range(100):
                obs_dict = trainer.reset()
                epsilon = 0
                done = False
                prev_direction = 0
                while not done:
                    state = preprocess_state(obs_dict, prev_direction)
                    action = agent.select_epsilon_greedy_action(state, epsilon)
                    direction = get_direction(prev_direction, action)
                    next_obs_dict, _, done, _ = trainer.step(
                        env.specification.action.enum[direction])
                    reward = calculate_reward(obs_dict, next_obs_dict)
                    obs_dict = next_obs_dict
                    prev_direction = direction
                    eval_reward += reward
            eval_reward /= 100
            evaluation_rewards.append(eval_reward)
            print("Evaluation reward: " + str(eval_reward))
            print()

        if episode % 5000 == 0:
            agent.save_model_weights('models/ddqn_' + model_name + '_' +
                                     str(episode) + '.h5')
            agent.save_optimizer_weights('models/ddqn_' + model_name + '_' +
                                         str(episode) + '_optimizer.npy')

    agent.save_model_weights('models/ddqn_' + model_name + '_' +
                             str(end_episode) + '.h5')
    agent.save_optimizer_weights('models/ddqn_' + model_name + '_' +
                                 str(end_episode) + '_optimizer.npy')

    plt.plot([i for i in range(start_episode + 1000, end_episode + 1, 1000)],
             training_rewards)
    plt.title('Reward')
    plt.show()

    plt.plot([i for i in range(start_episode + 1000, end_episode + 1, 1000)],
             evaluation_rewards)
    plt.title('Evaluation rewards')
    plt.show()
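EpsilonGreedyStrategy is constructed above with start, end and decay but its class is not shown. A common implementation consistent with that constructor, assuming an exponential decay schedule; this is a sketch, not necessarily the author's class:

import math

class EpsilonGreedyStrategy:
    def __init__(self, start, end, decay):
        self.start = start
        self.end = end
        self.decay = decay

    def get_epsilon(self, current_step):
        # Epsilon decays exponentially from `start` toward `end` as training progresses.
        return self.end + (self.start - self.end) * math.exp(-self.decay * current_step)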