Example No. 1
def post_evaluate(models_path, sigma, n_post_episodes=5, add_noise=False):
    """Load the saved policy/value networks from ``models_path`` and run
    ``n_post_episodes`` evaluation episodes, returning the average reward."""

    policy_path = models_path + "_policy"
    value_path = models_path + "_value"

    if args.use_parameter_noise:
        policy_post = PolicyLayerNorm(num_inputs, num_actions)
        value_post = Value(num_inputs)

    else:
        policy_post = Policy(num_inputs, num_actions)
        value_post = Value(num_inputs)

    # print('------------------')
    value_post.load_state_dict(torch.load(value_path))
    policy_post.load_state_dict(torch.load(policy_path))

    reward_post = 0

    for i in range(n_post_episodes):
        state = env.reset()

        ##seeding
        # env.seed(i)
        # torch.manual_seed(i)

        # state = running_state(state)
        for t in range(1000):

            if args.use_parameter_noise and add_noise:
                action = select_action(policy_post,
                                       state,
                                       sigma,
                                       add_noise=True)

            else:
                action = select_action(policy_post, state)
            action = action.data[0].numpy()

            next_state, reward, done, _ = env.step(action)

            reward_post += reward

            # next_state = running_state(next_state)

            if done:
                break

            # state = running_state(next_state)
            state = next_state

    print('___Post evaluation reward___')
    print(reward_post / n_post_episodes)

    return reward_post / n_post_episodes
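
A hedged usage sketch: the checkpoint prefix below is hypothetical, and the call assumes the surrounding script already defines the globals the function relies on (env, args, num_inputs, num_actions, select_action and the network classes).

# Hypothetical call: evaluate the networks saved as "checkpoints/run1_policy"
# and "checkpoints/run1_value" over 10 episodes, without parameter noise.
avg_reward = post_evaluate("checkpoints/run1", sigma=0.05,
                           n_post_episodes=10, add_noise=False)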
Example No. 2
    def insert_or_replace(self, session, key, value):
        entry = self.lookup_entry(session, key)

        if entry:
            # Update existing entry
            entry.value = self.lookup_value(session, value) or Value(
                hash=self.hash_value(value), blob=value)
        else:
            # Create new entry.
            entry = Entry(key=key,
                          value=self.lookup_value(session, value)
                          or Value(hash=self.hash_value(value), blob=value))
            session.add(entry)
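
A hedged usage sketch; the store object and SQLAlchemy session below are hypothetical, and the surrounding class is assumed to provide the lookup_entry, lookup_value and hash_value helpers used above.

# Hypothetical: insert the key if it is new, otherwise replace its value, then persist.
store.insert_or_replace(session, key="greeting", value="hello")
session.commit()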
Example No. 3
def update_or_create(testcase,
                     env,
                     build,
                     metric,
                     value=None,
                     comment=None,
                     color=None):
    """Update testresults/settings if exist, otherwise create new ones.

    :return created     True if created new results, otherwise False
    """

    settings = Settings.objects.get_or_create(testcase=testcase,
                                              metric=metric)[0]

    testresults, created = TestResults.objects.get_or_create(
        build=build,
        testcase=testcase,
        env=env,
        metric=metric,
        tag=gen_tag(build),
        settings=settings)
    testresults.timestamp = time.strftime('%Y-%m-%d %H:%M:%S',
                                          time.localtime())
    if value:
        v = Value(value=value)
        testresults.value_set.add(v)
    if comment:
        testresults.comment = comment
    if color:
        testresults.color = color
    testresults.save()

    return created
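
A hedged usage sketch, assuming Django model instances for the test case, environment, build and metric are already available (all names and values below are illustrative).

# Hypothetical: store a measured value for this build and flag the result green.
created = update_or_create(testcase, env, build, metric,
                           value=42.0, comment="nightly run", color="green")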
Example No. 4
def merge_person_property(dbsession, person, property, value, source):
    """Merge the given ``property`` with ``value`` into the ``person``. Attribute the change to the ``source``.
    ``value`` can be a string or a dictionary with keys "label", "lang", and "value". ``source`` is a dictionary
    with keys "label", "source", and "timestamp".
    """
    if isinstance(value, dict):
        label = value.get('label')
        lang = value.get('lang')
        value = value['value']
    else:
        label = None
        lang = None
    db_property = dbsession.query(PersonProperty).join(Value).filter(and_(PersonProperty.person_id == person.id,
                                                                          PersonProperty.name == property,
                                                                          Value.value == value,
                                                                          Value.lang == lang)).first()
    if not db_property:
        db_value = dbsession.query(Value).filter(and_(Value.value == value,
                                                      Value.lang == lang)).first()
        if not db_value:
            db_value = Value(label=label, value=value, lang=lang)
            dbsession.add(db_value)
        db_property = PersonProperty(person=person, name=property, value=db_value, status='unconfirmed')
        dbsession.add(db_property)
    property_source = dbsession.query(PersonPropertySource).join(Source).filter(and_(PersonPropertySource.property == db_property,
                                                                                     Source.url == source['url'])).first()
    if not property_source:
        db_source = dbsession.query(Source).filter(Source.url == source['url']).first()
        if not db_source:
            db_source = Source(label=source['label'], url=source['url'])
            dbsession.add(db_source)
        property_source = PersonPropertySource(property=db_property, source=db_source, timestamp=source['timestamp'])
        dbsession.add(property_source)
    else:
        property_source.timestamp = source['timestamp']
        dbsession.add(property_source)
    dbsession.commit()
    return db_property
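
A hedged usage sketch; the dbsession and person objects are assumed to come from the surrounding application, and the property data shown is purely illustrative.

# Hypothetical: merge a German-language "name" property into a person record,
# attributed to an example source.
merge_person_property(
    dbsession, person, 'name',
    {'label': 'Name', 'lang': 'de', 'value': 'Jane Doe'},
    {'label': 'Example source', 'url': 'https://example.org/record/1',
     'timestamp': '2020-01-01T00:00:00'})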
Example No. 5
def train(rank, params, shared_p, shared_v, optimizer_p, optimizer_v):
    torch.manual_seed(params.seed + rank)
    env = gym.make(params.env_name)
    num_inputs = env.observation_space.shape[0]
    num_outputs = env.action_space.shape[0]
    policy = Policy(num_inputs, num_outputs)
    value = Value(num_inputs)

    memory = ReplayMemory(1e6)
    batch_size = 10000

    state = env.reset()
    state = Variable(torch.Tensor(state).unsqueeze(0))
    done = True

    episode_length = 0
    while True:
        episode_length += 1
        policy.load_state_dict(shared_p.state_dict())
        value.load_state_dict(shared_v.state_dict())

        w = -1
        while w < batch_size:
            states = []
            actions = []
            rewards = []
            values = []
            returns = []
            advantages = []

            # Perform K steps
            for step in range(params.num_steps):
                w += 1
                states.append(state)

                mu, sigma_sq = policy(state)
                eps = torch.randn(mu.size())
                action = (mu + sigma_sq.sqrt()*Variable(eps))
                actions.append(action)

                v = value(state)
                values.append(v)

                env_action = action.data.squeeze().numpy()
                state, reward, done, _ = env.step(env_action)
                done = (done or episode_length >= params.max_episode_length)
                reward = max(min(reward, 1), -1)
                rewards.append(reward)

                if done:
                    episode_length = 0
                    state = env.reset()

                state = Variable(torch.Tensor(state).unsqueeze(0))

                if done:
                    break

            R = torch.zeros(1, 1)
            if not done:
                v = value(state)
                R = v.data

            # compute returns and advantages:
            values.append(Variable(R))
            R = Variable(R)
            for i in reversed(range(len(rewards))):
                R = params.gamma * R + rewards[i]
                returns.insert(0, R)
                A = R - values[i]
                advantages.insert(0, A)

            # store useful info:
            memory.push([states, actions, returns, advantages])

        batch_states, batch_actions, batch_returns, batch_advantages = memory.sample(batch_size)

        # policy grad updates:
        mu_old, sigma_sq_old = policy(batch_states)
        probs_old = normal(batch_actions, mu_old, sigma_sq_old)
        policy_new = Policy(num_inputs, num_outputs)
        kl = 0.
        kl_coef = 1.
        kl_target = Variable(torch.Tensor([params.kl_target]))
        for m in range(100):
            policy_new.load_state_dict(shared_p.state_dict())
            mu_new, sigma_sq_new = policy_new(batch_states)
            probs_new = normal(batch_actions, mu_new, sigma_sq_new)
            policy_loss = torch.mean(batch_advantages * torch.sum(probs_new/probs_old,1))
            kl = torch.mean(probs_old * torch.log(probs_old/probs_new))
            kl_loss = kl_coef * kl + \
                params.ksi * torch.clamp(kl-2*kl_target, max=0)**2
            total_policy_loss = - policy_loss + kl_loss
            if kl > 4*kl_target:
                break
            # asynchronous update:
            optimizer_p.zero_grad()
            total_policy_loss.backward()
            ensure_shared_grads(policy_new, shared_p)
            optimizer_p.step()

        # value grad updates:
        for b in range(100):
            value.load_state_dict(shared_v.state_dict())
            v = value(batch_states)
            value_loss = torch.mean((batch_returns - v)**2)
            # asynchronous update:
            optimizer_v.zero_grad()
            value_loss.backward()
            ensure_shared_grads(value, shared_v)
            optimizer_v.step()

        if kl > params.beta_hight*kl_target:
            kl_coef *= params.alpha
        if kl < params.beta_low*kl_target:
            kl_coef /= params.alpha

        print("update done !")
Example No. 6
env = gym.make(args.env_name)

num_inputs = env.observation_space.shape[0]
num_actions = env.action_space.shape[0]

env.seed(args.seed)
torch.manual_seed(args.seed)

if args.use_joint_pol_val:
    ac_net = ActorCritic(num_inputs, num_actions)
    opt_ac = optim.Adam(ac_net.parameters(), lr=0.0003)
else:
    policy_net = GRU(num_inputs, num_actions)
    old_policy_net = GRU(num_inputs, num_actions)
    value_net = Value(num_inputs)
    opt_policy = optim.Adam(policy_net.parameters(), lr=0.0003)
    opt_value = optim.Adam(value_net.parameters(), lr=0.0003)


def create_batch_inputs(batch_states_list, batch_actions_list,
                        batch_advantages_list, batch_targets_list):
    lengths = []
    for states in batch_states_list:
        lengths.append(states.size(0))

    max_length = max(lengths)
    batch_states = torch.zeros(len(batch_states_list), max_length, num_inputs)
    batch_actions = torch.zeros(len(batch_actions_list), max_length,
                                num_actions)
    batch_advantages = torch.zeros(len(batch_advantages_list), max_length)
Example No. 7
def main(gamma=0.995, env_name='Walker2d-v2', tau=0.97, seed=543, number_of_batches=500,\
        batch_size=5000, maximum_steps=10000, render=False, log_interval=1, entropy_coeff=0.0,\
        clip_epsilon=0.2, use_joint_pol_val=False):

    torch.set_default_tensor_type('torch.DoubleTensor')
    PI = torch.DoubleTensor([3.1415926])

    env = gym.make(env_name)

    num_inputs = env.observation_space.shape[0]
    num_actions = env.action_space.shape[0]

    env.seed(seed)
    torch.manual_seed(seed)

    policy_net = Policy(num_inputs, num_actions)
    value_net = Value(num_inputs)
    opt_policy = optim.Adam(policy_net.parameters(), lr=0.001)
    opt_value = optim.Adam(value_net.parameters(), lr=0.001)

    running_state = ZFilter((num_inputs,), clip=5)
    running_reward = ZFilter((1,), demean=False, clip=10)
    episode_lengths = []
    plot_rew = []
    for i_episode in range(number_of_batches):
        memory = Memory()

        num_steps = 0
        reward_batch = 0
        num_episodes = 0
        while num_steps < batch_size:
            state = env.reset()
            state = running_state(state)

            reward_sum = 0
            for t in range(maximum_steps): # Don't infinite loop while learning
                action = select_action(state, policy_net)
                action = action.data[0].numpy()
                next_state, reward, done, _ = env.step(action)
                reward_sum += reward

                next_state = running_state(next_state)

                mask = 1
                if done:
                    mask = 0

                memory.push(state, np.array([action]), mask, next_state, reward)

                if render:
                    env.render()
                if done:
                    break

                state = next_state
            num_steps += (t-1)
            num_episodes += 1
            reward_batch += reward_sum

        reward_batch /= num_episodes
        batch = memory.sample()
        plot_rew.append(reward_batch)
        update_params(batch, policy_net, value_net, gamma, opt_policy, opt_value)

        if i_episode % log_interval == 0:
            print('Episode {}\tLast reward: {}\tAverage reward {:.2f}'.format(
                i_episode, reward_sum, reward_batch))
    
    plot_epi = list(range(number_of_batches))
    trace = go.Scatter(x=plot_epi, y=plot_rew)
    layout = go.Layout(
        title='PPO',
        xaxis=dict(title='Episodes',
                   titlefont=dict(family='Courier New, monospace',
                                  size=18, color='#7f7f7f')),
        yaxis=dict(title='Average Reward',
                   titlefont=dict(family='Courier New, monospace',
                                  size=18, color='#7f7f7f')))

    plotly.offline.plot({"data": [trace], "layout": layout},
                        filename='PPO.html', image='jpeg')
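
A hedged invocation sketch; the keyword values shown are simply the function's own defaults, and the module-level helpers used above (select_action, Memory, update_params, ZFilter, plotly) are assumed to be imported.

if __name__ == '__main__':
    # Run PPO on Walker2d with the default hyperparameters and write PPO.html.
    main(env_name='Walker2d-v2', number_of_batches=500, batch_size=5000)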
Example No. 8
    def __init__(self,
                 args,
                 logger,
                 state_size=2,
                 action_size=4,
                 context_size=1,
                 num_goals=4,
                 history_size=1,
                 dtype=torch.FloatTensor):
        super(InfoGAIL, self).__init__(args,
                                       logger,
                                       state_size=state_size,
                                       action_size=action_size,
                                       context_size=context_size,
                                       num_goals=num_goals,
                                       history_size=history_size,
                                       dtype=dtype)

        # Create networks
        self.policy_net = Policy(state_size=state_size * history_size,
                                 action_size=0,
                                 latent_size=context_size,
                                 output_size=action_size,
                                 hidden_size=64,
                                 output_activation='sigmoid')
        self.old_policy_net = Policy(state_size=state_size * history_size,
                                     action_size=0,
                                     latent_size=context_size,
                                     output_size=action_size,
                                     hidden_size=64,
                                     output_activation='sigmoid')

        # Use value network for calculating GAE. We should use this for
        # training the policy network.
        if args.use_value_net:
            # context_size contains num_goals
            self.value_net = Value(state_size * history_size + context_size,
                                   hidden_size=64)

        # Reward net is the discriminator network. Discriminator does not
        # receive the latent vector in InfoGAIL.
        self.reward_net = Reward(
            state_size * history_size,
            action_size,  # action size
            0,  # latent size
            hidden_size=64)

        self.posterior_net = DiscretePosterior(
            state_size=state_size * history_size,  # state
            action_size=0,  # action
            latent_size=0,  # context
            hidden_size=64,
            output_size=num_goals)

        self.opt_policy = optim.Adam(self.policy_net.parameters(), lr=0.0003)
        self.opt_reward = optim.Adam(self.reward_net.parameters(), lr=0.0003)
        self.opt_value = optim.Adam(self.value_net.parameters(), lr=0.0003)
        self.opt_posterior = optim.Adam(self.posterior_net.parameters(),
                                        lr=0.0003)

        # Create loss functions
        self.criterion = nn.BCELoss()
        self.criterion_posterior = nn.CrossEntropyLoss()

        self.create_environment()
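
A hedged construction sketch; the args namespace and logger are assumed to be supplied by the surrounding training script. Note that the constructor above always creates an optimizer for self.value_net, so args.use_value_net effectively needs to be enabled.

# Hypothetical: build an InfoGAIL agent with the constructor's default sizes.
agent = InfoGAIL(args, logger, state_size=2, action_size=4,
                 context_size=1, num_goals=4, history_size=1)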
Example No. 9
def train(args):

    # Initialize data type
    dtype = torch.float32
    torch.set_default_dtype(dtype)
    device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

    # Initialize environment
    env = gym.make(args.env_id)
    envname = env.spec.id
    obs_dim = env.observation_space.shape[0]
    act_dim = env.action_space.shape[0]

    # Initialize random seeds
    torch.manual_seed(args.seed)
    np.random.seed(args.seed)
    env.seed(args.seed)

    # Initialize neural nets
    policy = GaussianPolicy(obs_dim, act_dim, args.hidden_size, args.activation, args.logstd)
    value_net = Value(obs_dim, args.hidden_size, args.activation)
    cvalue_net = Value(obs_dim, args.hidden_size, args.activation)
    policy.to(device)
    value_net.to(device)
    cvalue_net.to(device)

    # Initialize optimizer
    pi_optimizer = torch.optim.Adam(policy.parameters(), args.pi_lr)
    vf_optimizer = torch.optim.Adam(value_net.parameters(), args.vf_lr)
    cvf_optimizer = torch.optim.Adam(cvalue_net.parameters(), args.cvf_lr)

    # Initialize learning rate scheduler
    lr_lambda = lambda it: max(1.0 - it / args.max_iter_num, 0)
    pi_scheduler = torch.optim.lr_scheduler.LambdaLR(pi_optimizer, lr_lambda=lr_lambda)
    vf_scheduler = torch.optim.lr_scheduler.LambdaLR(vf_optimizer, lr_lambda=lr_lambda)
    cvf_scheduler = torch.optim.lr_scheduler.LambdaLR(cvf_optimizer, lr_lambda=lr_lambda)

    # Store hyperparameters for log
    hyperparams = vars(args)

    # Initialize RunningStat for state normalization, score queue, logger
    running_stat = RunningStats(clip=5)
    score_queue = deque(maxlen=100)
    cscore_queue = deque(maxlen=100)
    logger = Logger(hyperparams)

    # Get constraint bounds
    cost_lim = get_threshold(envname, constraint=args.constraint)

    # Initialize and train FOCOPS agent
    agent = FOCOPS(env, policy, value_net, cvalue_net,
                   pi_optimizer, vf_optimizer, cvf_optimizer,
                   args.num_epochs, args.mb_size,
                   args.c_gamma, args.lam, args.delta, args.eta,
                   args.nu, args.nu_lr, args.nu_max, cost_lim,
                   args.l2_reg, score_queue, cscore_queue, logger)

    start_time = time.time()

    for iter in range(args.max_iter_num):

        # Update iteration for model
        agent.logger.save_model('iter', iter)

        # Collect trajectories
        data_generator = DataGenerator(obs_dim, act_dim, args.batch_size, args.max_eps_len)
        rollout = data_generator.run_traj(env, agent.policy, agent.value_net, agent.cvalue_net,
                                          running_stat, agent.score_queue, agent.cscore_queue,
                                          args.gamma, args.c_gamma, args.gae_lam, args.c_gae_lam,
                                          dtype, device, args.constraint)

        # Update FOCOPS parameters
        agent.update_params(rollout, dtype, device)

        # Update learning rates
        pi_scheduler.step()
        vf_scheduler.step()
        cvf_scheduler.step()

        # Update time and running stat
        agent.logger.update('time', time.time() - start_time)
        agent.logger.update('running_stat', running_stat)

        # Save and print values
        agent.logger.dump()
Example No. 10
        self.num_steps = 20
        self.max_episode_length = 10000
        self.seed = 1
        self.env_name = 'Pendulum-v0'


if __name__ == '__main__':
    os.environ['OMP_NUM_THREADS'] = '1'
    params = Params()
    torch.manual_seed(params.seed)
    env = gym.make(params.env_name)
    num_inputs = env.observation_space.shape[0]
    num_outputs = env.action_space.shape[0]

    shared_p = Policy(num_inputs, num_outputs)
    shared_v = Value(num_inputs)
    shared_p.share_memory()
    shared_v.share_memory()
    optimizer_p = my_optim.SharedAdam(shared_p.parameters(), lr=params.lr)
    optimizer_v = my_optim.SharedAdam(shared_v.parameters(), lr=params.lr)

    processes = []
    p = mp.Process(target=test, args=(params.num_processes, params, shared_p))
    p.start()
    processes.append(p)
    for rank in range(0, params.num_processes):
        p = mp.Process(target=train,
                       args=(rank, params, shared_p, shared_v, optimizer_p,
                             optimizer_v))
        p.start()
        processes.append(p)
    for p in processes:
        p.join()
Example No. 11
    def save(self, *args, **kwargs):
        # If the EObject is not stored in the database yet, save it first.
        if self._eobject.pk is None:
            self._eobject.save()

        # Important: once a field of an item has been filled in on one page, do not
        # let the same field be filled in again on another page!
        if self._group is None:  # New group: assign it a group number of max(existing groups) + 1
            try:
                self._group = max(self.eobject_eform_groups) + 1
            except ValueError:
                self._group = 1

        new_values = []
        for key, value in self.cleaned_data.items():
            if value is None: continue  # Optional field left blank; skip it
            efield = self._key_field_dict[key]
            if efield.field_type in EField.MULT_CHOICES_FIELD:
                for item in value:
                    new_values.append(
                        Value(eobject=self._eobject,
                              efield=efield,
                              value=efield.get_db_value(item),
                              group=self._group))
            elif efield.field_type == u'SimpleModelChoiceField':
                content_type_id, object_id = value.split("-")[0], value.split(
                    "-")[1]
                new_values.append(
                    Value(eobject=self._eobject,
                          efield=efield,
                          content_type=ContentType.objects.get(
                              pk=content_type_id),
                          object_id=object_id,
                          group=self._group))

            elif efield.field_type in [u"VideoField", u'FileField']:
                new_values.append(
                    Value(eobject=self._eobject,
                          efield=efield,
                          vfile=value,
                          group=self._group))
            elif efield.field_type == u"ImageField":
                commonImage = CommonImage.objects.create(
                    image=value) if isinstance(value, UploadedFile) else value
                new_values.append(
                    Value(eobject=self._eobject,
                          efield=efield,
                          content_type=ContentType.objects.get_for_model(
                              commonImage),
                          object_id=commonImage.id,
                          group=self._group))
            else:
                new_values.append(
                    Value(eobject=self._eobject,
                          efield=efield,
                          value=efield.get_db_value(value),
                          group=self._group))

        # Initial values may belong to another eobject, so this delete must also filter
        # on eobject=self._eobject; otherwise another object's values could be deleted.
        self.eobject_values.filter(group=self._group,
                                   eobject=self._eobject).delete()
        Value.objects.bulk_create(new_values)
        return self._eobject
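
A hedged usage sketch; the form class name and request objects below are hypothetical, since the snippet only shows the save() method.

# Hypothetical: validate the submitted data, then persist the generated Value rows.
form = EObjectEForm(request.POST, request.FILES)
if form.is_valid():
    eobject = form.save()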
Example No. 12
env = gym.make(args.env_name)

num_inputs = env.observation_space.shape[0]
num_actions = env.action_space.shape[0]

env.seed(args.seed)
torch.manual_seed(args.seed)

if args.use_joint_pol_val:
    ac_net = ActorCritic(num_inputs, num_actions)
    opt_ac = optim.Adam(ac_net.parameters(), lr=0.0003)
else:
    policy_net = GRU(num_inputs, num_actions, dtype=dtype).type(dtype)
    old_policy_net = GRU(num_inputs, num_actions, dtype=dtype).type(dtype)
    value_net = Value(num_inputs).type(dtype)
    reward_net = GRU(num_inputs + num_actions,
                     1,
                     policy_flag=0,
                     activation_flag=2,
                     dtype=dtype).type(dtype)
    opt_policy = optim.Adam(policy_net.parameters(), lr=0.0003)
    opt_value = optim.Adam(value_net.parameters(), lr=0.0003)
    opt_reward = optim.Adam(reward_net.parameters(), lr=0.0003)


def create_batch_inputs(batch_states_list,
                        batch_actions_list,
                        batch_advantages_list=None):
    lengths = []
    for states in batch_states_list: