Example #1
    def master_update(self):
        # Receive gradient from a worker
        t1 = time.time()
        ##
        update_info = self.comm.recv(source=MPI.ANY_SOURCE,
                                     tag=TAG_UPDATE_START,
                                     status=self.status)
        worker_source = self.status.Get_source()
        ##
        t2 = time.time()
        t = t2 - t1
        dh_logger.info(
            jm(type='receive_gradient',
               rank=self.comm.Get_rank(),
               duration=t,
               start_time=t1,
               end_time=t2,
               rank_worker_source=worker_source))

        t1 = time.time()
        ##
        workerg = update_info['workerg']
        stepsize = update_info['stepsize']
        if self.scale_grad_by_procs:
            workerg /= self.comm.Get_size() - 1  # one is the parameter server

        self.t += 1
        a = stepsize * np.sqrt(1 - self.beta2**self.t) / (1 -
                                                          self.beta1**self.t)
        self.m = self.beta1 * self.m + (1 - self.beta1) * workerg
        self.v = self.beta2 * self.v + (1 - self.beta2) * (workerg * workerg)
        step = (-a) * self.m / (np.sqrt(self.v) + self.epsilon)
        update_vars = self.getflat() + step
        self.setfromflat(update_vars)
        ##
        t2 = time.time()
        t = t2 - t1
        dh_logger.info(
            jm(type='update_parameters',
               rank=self.comm.Get_rank(),
               duration=t,
               start_time=t1,
               end_time=t2,
               rank_worker_source=worker_source))

        t1 = time.time()
        ##
        self.comm.send(update_vars, dest=worker_source, tag=TAG_UPDATE_DONE)
        ##
        t2 = time.time()
        t = t2 - t1
        dh_logger.info(
            jm(type='send_parameters',
               rank=self.comm.Get_rank(),
               duration=t,
               start_time=t1,
               end_time=t2,
               rank_worker_dest=worker_source))

        return worker_source
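
The parameter update applied in master_update above is a standard bias-corrected Adam step, with the bias correction folded into the scalar step size a. Below is a minimal numpy sketch of that arithmetic; the hyperparameters and the worker gradient are made-up illustrative values, not numbers from the module.

import numpy as np

# Illustrative hyperparameters and state; the real values live on the optimizer object.
beta1, beta2, epsilon, stepsize = 0.9, 0.999, 1e-8, 1e-3
m = np.zeros(3)          # first-moment estimate
v = np.zeros(3)          # second-moment estimate
theta = np.zeros(3)      # flattened parameters
workerg = np.array([0.1, -0.2, 0.3])  # gradient received from a worker

t = 1
a = stepsize * np.sqrt(1 - beta2**t) / (1 - beta1**t)   # bias-corrected step size
m = beta1 * m + (1 - beta1) * workerg
v = beta2 * v + (1 - beta2) * (workerg * workerg)
theta = theta + (-a) * m / (np.sqrt(v) + epsilon)
print(theta)  # roughly [-0.001, 0.001, -0.001]: one Adam step per received gradient
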
Example #2
    def worker_update(self, localg, stepsize):
        # Send local gradient to master
        update_info = dict(workerg=localg, stepsize=stepsize)

        t1 = time.time()
        ##
        self.comm.send(update_info, dest=0, tag=TAG_UPDATE_START)
        update_vars = self.comm.recv(source=0,
                                     tag=TAG_UPDATE_DONE,
                                     status=self.status)
        ##
        t2 = time.time()
        t = t2 - t1
        dh_logger.info(
            jm(type='receive_parameters',
               rank=self.comm.Get_rank(),
               duration=t,
               start_time=t1,
               end_time=t2,
               master_rank=0))

        t1 = time.time()
        ##
        self.setfromflat(update_vars)
        ##
        t2 = time.time()
        t = t2 - t1
        dh_logger.info(
            jm(type='setfromflat',
               rank=self.comm.Get_rank(),
               duration=t,
               start_time=t1,
               end_time=t2,
               master_rank=0))
Example #3
    def train(self, num_epochs=None):
        num_epochs = self.num_epochs if num_epochs is None else num_epochs

        if num_epochs > 0:
            min_mse = math.inf
            for i in range(num_epochs):
                self.model.fit(self.dataset_train,
                               epochs=1,
                               steps_per_epoch=self.train_steps_per_epoch,
                               callbacks=self.callbacks)

                y_orig, y_pred = self.predict()

                try:
                    unnormalize_mse = mean_squared_error(y_orig, y_pred)
                except ValueError:
                    # Non-finite predictions make the metric fail; fall back
                    # to the worst possible score instead of aborting the run.
                    logger.error(traceback.format_exc())
                    unnormalize_mse = np.finfo('float32').max

                # self.train_history[f'{self.metrics_name[0]}_valid'] = unnormalize_mse

                min_mse = min(min_mse, unnormalize_mse)
                logger.info(jm(epoch=i, validation_mse=float(unnormalize_mse)))

            logger.info(jm(type='result', mse=float(min_mse)))
            return min_mse
        elif num_epochs == 0:
            y_orig, y_pred = self.predict()

            try:
                unnormalize_mse = mean_squared_error(y_orig, y_pred)
            except ValueError:
                # Same guard as above: score as badly as possible rather than
                # aborting on non-finite predictions.
                logger.error(traceback.format_exc())
                unnormalize_mse = np.finfo('float32').max

            logger.info(jm(epoch=0, validation_mse=float(unnormalize_mse)))
            logger.info(jm(type='result', mse=float(unnormalize_mse)))
            return unnormalize_mse
        else:
            raise RuntimeError(
                f'Number of epochs should be >= 0: {num_epochs}')
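
The try/except ValueError around mean_squared_error above is a guard for diverged models: scikit-learn rejects non-finite inputs, so a prediction containing NaN would otherwise abort the whole run. A minimal sketch of that fallback, with made-up arrays:

import numpy as np
from sklearn.metrics import mean_squared_error

y_orig = np.array([1.0, 2.0, 3.0])
y_pred = np.array([1.1, np.nan, 2.9])   # e.g. the model produced a NaN

try:
    mse = mean_squared_error(y_orig, y_pred)
except ValueError:
    # Same fallback as train(): score the epoch with the worst possible value
    # instead of crashing, so the search can keep going.
    mse = np.finfo('float32').max

print(mse)  # 3.4028235e+38
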
Example #4
    def train(self, num_epochs=None):
        num_epochs = self.num_epochs if num_epochs is None else num_epochs

        if num_epochs > 0:
            max_acc = 0
            for i in range(num_epochs):
                self.model.fit(self.dataset_train,
                               epochs=1,
                               steps_per_epoch=self.train_steps_per_epoch,
                               callbacks=self.callbacks)

                valid_info = self.model.evaluate(
                    self.dataset_valid, steps=self.valid_steps_per_epoch)

                valid_loss, valid_acc = valid_info[0], valid_info[1] * 100

                max_acc = max(max_acc, valid_acc)
                logger.info(
                    jm(epoch=i,
                       validation_loss=valid_loss,
                       validation_acc=float(valid_acc)))
            logger.info(jm(type='result', acc=float(max_acc)))
            return max_acc
        elif num_epochs == 0:
            valid_info = self.model.evaluate(self.dataset_valid,
                                             steps=self.valid_steps_per_epoch)

            valid_loss, valid_acc = valid_info[0], valid_info[1] * 100

            logger.info(
                jm(epoch=0,
                   validation_loss=valid_loss,
                   validation_acc=float(valid_acc)))
            logger.info(jm(type='result', acc=float(valid_acc)))
            return valid_acc
        else:
            raise RuntimeError(
                f'Number of epochs should be >= 0: {num_epochs}')
Example #5
def train(num_episodes, seed, space, evaluator, num_episodes_per_batch):

    rank = MPI.COMM_WORLD.Get_rank()
    if rank == 0:  # rank zero simulates the parameter server
        pass
    else:
        workerseed = (seed + 10000 * MPI.COMM_WORLD.Get_rank()
                      if seed is not None else None)
        set_global_seeds(workerseed)

        # MAKE ENV_NAS
        cs_kwargs = space['create_structure'].get('kwargs')
        if cs_kwargs is None:
            structure = space['create_structure']['func']()
        else:
            structure = space['create_structure']['func'](**cs_kwargs)

        num_nodes = structure.num_nodes
        timesteps_per_actorbatch = num_nodes * num_episodes_per_batch
        num_timesteps = timesteps_per_actorbatch * num_episodes

        max_timesteps = num_timesteps

        env = NasEnv(space, evaluator, structure)

        seg_gen = traj_segment_generator(env, timesteps_per_actorbatch)

        timesteps_so_far = 0
        iters_so_far = 0

        assert max_timesteps > 0, f"max_timesteps should be > 0: {max_timesteps}"

        while True:
            if max_timesteps and timesteps_so_far >= max_timesteps:
                break

            logger.log("********** Iteration %i ************" % iters_so_far)

            seg = seg_gen.__next__()
            dh_logger.info(
                jm(type='seg', rank=MPI.COMM_WORLD.Get_rank(), **seg))
            # Each generated segment covers one actor batch of timesteps;
            # advance the counter so the max_timesteps check above can trigger.
            timesteps_so_far += timesteps_per_actorbatch
            iters_so_far += 1

        env.close()
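
The timestep budget above is a simple product: each episode visits structure.num_nodes timesteps, an actor batch groups num_episodes_per_batch episodes, and the run is allotted num_episodes such batches. A worked example with illustrative numbers (not taken from any real search space):

# Illustrative numbers only; the real num_nodes comes from the structure object.
num_nodes = 20                 # timesteps per episode (one action per node)
num_episodes_per_batch = 8
num_episodes = 100

timesteps_per_actorbatch = num_nodes * num_episodes_per_batch   # 160
max_timesteps = timesteps_per_actorbatch * num_episodes         # 16000

# With timesteps_so_far advancing by one actor batch per iteration, the
# while-loop above runs for max_timesteps // timesteps_per_actorbatch
# iterations, i.e. num_episodes of them.
print(max_timesteps // timesteps_per_actorbatch)  # 100
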
Example #6
def learn(
    env,
    policy_fn,
    *,
    timesteps_per_actorbatch,  # timesteps per actor per update
    clip_param,
    entcoeff,  # clipping parameter epsilon, entropy coeff
    optim_epochs,
    optim_stepsize,
    optim_batchsize,  # optimization hypers
    gamma,
    lam,  # advantage estimation
    max_timesteps=0,
    max_episodes=0,
    max_iters=0,
    max_seconds=0,  # time constraint
    callback=None,  # you can do anything in the callback, since it takes locals(), globals()
    adam_epsilon=1e-5,
    schedule='constant',  # annealing for stepsize parameters (epsilon and adam)
    reward_rule=reward_for_final_timestep):

    rank = MPI.COMM_WORLD.Get_rank()

    # Setup losses and stuff
    # ----------------------------------------
    ob_space = env.observation_space
    ac_space = env.action_space
    pi = policy_fn("pi", ob_space,
                   ac_space)  # Construct network for new policy
    oldpi = policy_fn("oldpi", ob_space, ac_space)  # Network for old policy
    atarg = tf.placeholder(
        dtype=tf.float32,
        shape=[None])  # Target advantage function (if applicable)
    ret = tf.placeholder(dtype=tf.float32, shape=[None])  # Empirical return

    lrmult = tf.placeholder(
        name='lrmult', dtype=tf.float32,
        shape=[])  # learning rate multiplier, updated with schedule
    clip_param = clip_param * lrmult  # Annealed clipping parameter epsilon

    ob = U.get_placeholder_cached(name="ob")
    ac = pi.pdtype.sample_placeholder([None])

    kloldnew = oldpi.pd.kl(pi.pd)
    ent = pi.pd.entropy()
    meankl = tf.reduce_mean(kloldnew)
    meanent = tf.reduce_mean(ent)
    pol_entpen = (-entcoeff) * meanent

    ratio = tf.exp(pi.pd.logp(ac) - oldpi.pd.logp(ac))  # pnew / pold
    surr1 = ratio * atarg  # surrogate from conservative policy iteration
    surr2 = tf.clip_by_value(ratio, 1.0 - clip_param,
                             1.0 + clip_param) * atarg  # clipped surrogate
    pol_surr = -tf.reduce_mean(tf.minimum(
        surr1, surr2))  # PPO's pessimistic surrogate (L^CLIP)
    vf_loss = tf.reduce_mean(tf.square(pi.vpred - ret))
    total_loss = pol_surr + pol_entpen + vf_loss
    losses = [pol_surr, pol_entpen, vf_loss, meankl, meanent]
    loss_names = ["pol_surr", "pol_entpen", "vf_loss", "kl", "ent"]

    var_list = pi.get_trainable_variables()
    lossandgrad = U.function([ob, ac, atarg, ret, lrmult],
                             losses + [U.flatgrad(total_loss, var_list)])
    adam = MpiAdamAsync(var_list, epsilon=adam_epsilon)

    assign_old_eq_new = U.function(
        [], [],
        updates=[
            tf.assign(oldv, newv)
            for (oldv,
                 newv) in zipsame(oldpi.get_variables(), pi.get_variables())
        ])
    compute_losses = U.function([ob, ac, atarg, ret, lrmult], losses)

    U.initialize()

    t1 = time.time()
    ##
    adam.sync()
    ##
    t2 = time.time()
    t = t2 - t1
    dh_logger.info(
        jm(type='adam.sync', rank=rank, duration=t, start_time=t1,
           end_time=t2))

    if rank == 0:  # ZERO is the parameter server
        while True:
            t1 = time.time()
            ## BEGIN - TIMING ##
            rank_worker_source = adam.master_update()
            ## END - TIMING ##
            t2 = time.time()
            t = t2 - t1
            dh_logger.info(
                jm(type='adam.master_update',
                   rank=rank,
                   duration=t,
                   rank_worker_source=rank_worker_source,
                   start_time=t1,
                   end_time=t2))
    else:
        # Prepare for rollouts
        # ----------------------------------------

        seg_gen = traj_segment_generator(pi,
                                         env,
                                         timesteps_per_actorbatch,
                                         stochastic=True,
                                         reward_affect_func=reward_rule)

        episodes_so_far = 0
        timesteps_so_far = 0
        iters_so_far = 0
        tstart = time.time()
        lenbuffer = deque(maxlen=100)  # rolling buffer for episode lengths
        rewbuffer = deque(maxlen=100)  # rolling buffer for episode rewards

        cond = sum([
            max_iters > 0, max_timesteps > 0, max_episodes > 0, max_seconds > 0
        ])
        assert cond == 1, f"Only one time constraint permitted: cond={cond}, max_iters={max_iters}, max_timesteps={max_timesteps}, max_episodes={max_episodes}, max_seconds={max_seconds}"

        while True:
            if callback: callback(locals(), globals())
            if max_timesteps and timesteps_so_far >= max_timesteps:
                break
            elif max_episodes and episodes_so_far >= max_episodes:
                break
            elif max_iters and iters_so_far >= max_iters:
                break
            elif max_seconds and time.time() - tstart >= max_seconds:
                break

            if schedule == 'constant':
                cur_lrmult = 1.0
            elif schedule == 'linear':
                cur_lrmult = max(1.0 - float(timesteps_so_far) / max_timesteps,
                                 0)
            else:
                raise NotImplementedError

            #logger.log("********** Iteration %i ************"%iters_so_far)

            t1 = time.time()
            ## BEGIN - TIMING ##
            seg = seg_gen.__next__()
            ## END - TIMING ##
            t2 = time.time()
            t = t2 - t1
            dh_logger.info(
                jm(type='batch_computation',
                   rank=rank,
                   duration=t,
                   start_time=t1,
                   end_time=t2))
            dh_logger.info(jm(type='seg', rank=rank, **seg))

            add_vtarg_and_adv(seg, gamma, lam)

            # ob, ac, atarg, ret, td1ret = map(np.concatenate, (obs, acs, atargs, rets, td1rets))
            ob, ac, atarg, tdlamret = (seg["ob"], seg["ac"], seg["adv"],
                                       seg["tdlamret"])
            vpredbefore = seg["vpred"]  # predicted value function before update
            # standardized advantage function estimate
            atarg = (atarg - atarg.mean()) / atarg.std()
            d = Dataset(dict(ob=ob, ac=ac, atarg=atarg, vtarg=tdlamret),
                        shuffle=not pi.recurrent)
            # optim_batchsize = optim_batchsize or ob.shape[0]
            optim_batchsize = ob.shape[0]

            if hasattr(pi, "ob_rms"):
                pi.ob_rms.update(ob)  # update running mean/std for policy

            # set old parameter values to new parameter values
            assign_old_eq_new()
            dh_logger.info(f"Rank={rank}: Optimizing...")

            # Here we do a bunch of optimization epochs over the data
            for _ in range(optim_epochs):
                # list of tuples, each of which gives the loss for a minibatch
                losses = []
                for batch in d.iterate_once(optim_batchsize):
                    *newlosses, g = lossandgrad(batch["ob"], batch["ac"],
                                                batch["atarg"], batch["vtarg"],
                                                cur_lrmult)

                    t1 = time.time()
                    ## BEGIN - TIMING ##
                    adam.worker_update(g, optim_stepsize * cur_lrmult)
                    ## END - TIMING ##
                    t2 = time.time()
                    t = t2 - t1
                    dh_logger.info(
                        jm(type='adam.worker_update',
                           rank=rank,
                           duration=t,
                           start_time=t1,
                           end_time=t2))

                    losses.append(newlosses)

            dh_logger.info(f"Rank={rank}: Evaluating losses...")
            losses = []
            for batch in d.iterate_once(optim_batchsize):
                newlosses = compute_losses(batch["ob"], batch["ac"],
                                           batch["atarg"], batch["vtarg"],
                                           cur_lrmult)
                losses.append(newlosses)
            meanlosses, _, _ = mpi_moments(losses, axis=0, use_mpi=False)

            lens = seg["ep_lens"]
            rews = seg["ep_rets"]

            episodes_so_far += len(lens)
            timesteps_so_far += sum(lens)
            iters_so_far += 1

        return pi
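
The asynchronous layout in learn() rests on mpi4py's tagged point-to-point messaging: rank 0 blocks in recv on MPI.ANY_SOURCE, reads the sender's rank from the Status object, and replies to exactly that worker. Below is a self-contained sketch of that request/reply pattern, runnable with something like mpiexec -n 2; the tag values and the toy "update" are placeholders, not the module's real constants.

from mpi4py import MPI
import numpy as np

TAG_UPDATE_START, TAG_UPDATE_DONE = 1, 2   # placeholder tag values

comm = MPI.COMM_WORLD
status = MPI.Status()

if comm.Get_rank() == 0:
    # Parameter server: answer one request from whichever worker arrives first.
    update_info = comm.recv(source=MPI.ANY_SOURCE,
                            tag=TAG_UPDATE_START,
                            status=status)
    worker = status.Get_source()                  # rank that sent the gradient
    new_params = -0.01 * update_info["workerg"]   # toy "update"
    comm.send(new_params, dest=worker, tag=TAG_UPDATE_DONE)
else:
    # Worker: push a gradient, block until the fresh parameters come back.
    comm.send({"workerg": np.ones(3), "stepsize": 1e-3},
              dest=0, tag=TAG_UPDATE_START)
    params = comm.recv(source=0, tag=TAG_UPDATE_DONE)
    print(params)
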
Example #7
def learn(env, policy_fn, *,
        timesteps_per_actorbatch, # timesteps per actor per update
        clip_param, entcoeff, # clipping parameter epsilon, entropy coeff
        optim_epochs, optim_stepsize, optim_batchsize, # optimization hypers
        gamma, lam, # advantage estimation
        max_timesteps=0,  # time constraint
        callback=None, # you can do anything in the callback, since it takes locals(), globals()
        adam_epsilon=1e-5,
        schedule='constant', # annealing for stepsize parameters (epsilon and adam)
        reward_rule=reward_for_final_timestep
        ):

    rank = MPI.COMM_WORLD.Get_rank()

    # Setup losses and stuff
    # ----------------------------------------
    ob_space = env.observation_space
    ac_space = env.action_space
    pi = policy_fn("pi", ob_space, ac_space) # Construct network for new policy
    oldpi = policy_fn("oldpi", ob_space, ac_space) # Network for old policy
    atarg = tf.placeholder(dtype=tf.float32, shape=[None]) # Target advantage function (if applicable)
    ret = tf.placeholder(dtype=tf.float32, shape=[None]) # Empirical return

    lrmult = tf.placeholder(name='lrmult', dtype=tf.float32, shape=[]) # learning rate multiplier, updated with schedule
    clip_param = clip_param * lrmult # Annealed clipping parameter epsilon

    ob = U.get_placeholder_cached(name="ob")
    input_c_vf = U.get_placeholder_cached(name="c_vf")
    input_h_vf = U.get_placeholder_cached(name="h_vf")
    input_c_pol = U.get_placeholder_cached(name="c_pol")
    input_h_pol = U.get_placeholder_cached(name="h_pol")
    ac = pi.pdtype.sample_placeholder([None])

    kloldnew = oldpi.pd.kl(pi.pd)
    ent = pi.pd.entropy()
    meankl = tf.reduce_mean(kloldnew)
    meanent = tf.reduce_mean(ent)
    pol_entpen = (-entcoeff) * meanent

    ratio = tf.exp(pi.pd.logp(ac) - oldpi.pd.logp(ac)) # pnew / pold
    surr1 = ratio * atarg # surrogate from conservative policy iteration
    surr2 = tf.clip_by_value(ratio, 1.0 - clip_param, 1.0 + clip_param) * atarg # clipped surrogate
    pol_surr = - tf.reduce_mean(tf.minimum(surr1, surr2)) # PPO's pessimistic surrogate (L^CLIP)
    vf_loss = tf.reduce_mean(tf.square(pi.vpred - ret))
    total_loss = pol_surr + pol_entpen + vf_loss
    losses = [pol_surr, pol_entpen, vf_loss, meankl, meanent]
    loss_names = ["pol_surr", "pol_entpen", "vf_loss", "kl", "ent"]

    var_list = pi.get_trainable_variables()
    lossandgrad = U.function(
        [ob, ac, atarg, ret, lrmult, input_c_vf, input_h_vf, input_c_pol, input_h_pol],
        losses + [U.flatgrad(total_loss, var_list)])
    adam = MpiAdam(var_list, epsilon=adam_epsilon)

    assign_old_eq_new = U.function([],[], updates=[tf.assign(oldv, newv)
        for (oldv, newv) in zipsame(oldpi.get_variables(), pi.get_variables())])
    compute_losses = U.function([ob, ac, atarg, ret, lrmult, input_c_vf, input_h_vf, input_c_pol, input_h_pol], losses)

    U.initialize()
    adam.sync()

    # Prepare for rollouts
    # ----------------------------------------
    seg_gen = traj_segment_generator_ph(pi, env, timesteps_per_actorbatch,
        stochastic=True, reward_affect_func=reward_rule)

    episodes_so_far = 0
    timesteps_so_far = 0
    iters_so_far = 0
    tstart = time.time()
    lenbuffer = deque(maxlen=100) # rolling buffer for episode lengths
    rewbuffer = deque(maxlen=100) # rolling buffer for episode rewards

    assert max_timesteps > 0, f"The number of timesteps should be > 0 but is {max_timesteps}"

    while True:
        if callback: callback(locals(), globals())
        if max_timesteps and timesteps_so_far >= max_timesteps:
            break

        if schedule == 'constant':
            cur_lrmult = 1.0
        elif schedule == 'linear':
            cur_lrmult = max(1.0 - float(timesteps_so_far) / max_timesteps, 0)
        else:
            raise NotImplementedError

        logger.log("********** Iteration %i ************"%iters_so_far)

        seg = seg_gen.__next__()
        dh_logger.info(jm(type='seg', rank=rank, **seg))
        add_vtarg_and_adv(seg, gamma, lam)

        # ob, ac, atarg, ret, td1ret = map(np.concatenate, (obs, acs, atargs, rets, td1rets))
        ob, ac, atarg, tdlamret = seg["ob"], seg["ac"], seg["adv"], seg["tdlamret"]
        c_vf = np.squeeze(np.array([c for c, _ in seg["hs_vf"]]))
        h_vf = np.squeeze(np.array([h for _, h in seg["hs_vf"]]))
        c_pol = np.squeeze(np.array([c for c, _ in seg["hs_pol"]]))
        h_pol = np.squeeze(np.array([h for _, h in seg["hs_pol"]]))
        vpredbefore = seg["vpred"] # predicted value function before update
        atarg = (atarg - atarg.mean()) / atarg.std() # standardized advantage function estimate
        d = Dataset(dict(ob=ob, ac=ac, atarg=atarg, vtarg=tdlamret, c_vf=c_vf, h_vf=h_vf, c_pol=c_pol, h_pol=h_pol), shuffle=not pi.recurrent)
        # optim_batchsize = optim_batchsize or ob.shape[0]
        optim_batchsize = ob.shape[0]

        if hasattr(pi, "ob_rms"): pi.ob_rms.update(ob) # update running mean/std for policy

        assign_old_eq_new() # set old parameter values to new parameter values
        logger.log("Optimizing...")
        logger.log(fmt_row(13, loss_names))
        # Here we do a bunch of optimization epochs over the data
        for _ in range(optim_epochs):
            losses = [] # list of tuples, each of which gives the loss for a minibatch
            gradients = []
            for batch in d.iterate_once(optim_batchsize):
                for i in range(len(batch["ob"])):
                    *newlosses, g = lossandgrad(
                        batch["ob"][i:i+1],
                        batch["ac"][i:i+1],
                        batch["atarg"][i:i+1],
                        batch["vtarg"][i:i+1],
                        cur_lrmult,
                        batch["c_vf"][i:i+1],
                        batch["h_vf"][i:i+1],
                        batch["c_pol"][i:i+1],
                        batch["h_pol"][i:i+1])
                    losses.append(newlosses)
                    gradients.append(g)
            g = np.array(gradients).sum(0)
            adam.update(g, optim_stepsize * cur_lrmult)
            logger.log(fmt_row(13, np.mean(losses, axis=0)))

        logger.log("Evaluating losses...")
        losses = []
        for batch in d.iterate_once(optim_batchsize):
            for i in range(len(batch["ob"])):
                newlosses = compute_losses(
                    batch["ob"][i:i+1],
                    batch["ac"][i:i+1],
                    batch["atarg"][i:i+1],
                    batch["vtarg"][i:i+1],
                    cur_lrmult,
                    batch["c_vf"][i:i+1],
                    batch["h_vf"][i:i+1],
                    batch["c_pol"][i:i+1],
                    batch["h_pol"][i:i+1])
                losses.append(newlosses)
        meanlosses, _, _ = mpi_moments(losses, axis=0)
        logger.log(fmt_row(13, meanlosses))
        for (lossval, name) in zipsame(meanlosses, loss_names):
            logger.record_tabular("loss_"+name, lossval)
        logger.record_tabular("ev_tdlam_before", explained_variance(vpredbefore, tdlamret))
        lrlocal = (seg["ep_lens"], seg["ep_rets"]) # local values
        listoflrpairs = MPI.COMM_WORLD.allgather(lrlocal) # list of tuples
        lens, rews = map(flatten_lists, zip(*listoflrpairs))
        lenbuffer.extend(lens)
        rewbuffer.extend(rews)
        logger.record_tabular("EpLenMean", np.mean(lenbuffer))
        logger.record_tabular("EpRewMean", np.mean(rewbuffer))
        logger.record_tabular("EpThisIter", len(lens))
        episodes_so_far += len(lens)
        timesteps_so_far += sum(lens)
        iters_so_far += 1
        logger.record_tabular("EpisodesSoFar", episodes_so_far)
        logger.record_tabular("TimestepsSoFar", timesteps_so_far)
        logger.record_tabular("TimeElapsed", time.time() - tstart)
        if MPI.COMM_WORLD.Get_rank()==0:
            logger.dump_tabular()

    return pi
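
Unlike the asynchronous variant, this version feeds the network one sample at a time (each sample carries its own recurrent state), collects the per-sample gradients, and applies a single adam.update on their sum. Summing per-sample gradients is exactly the gradient of the summed loss, which the small numpy check below illustrates with a toy quadratic loss and made-up data.

import numpy as np

# Toy per-sample loss L_i(w) = 0.5 * (x_i . w - y_i)^2, gradient (x_i . w - y_i) * x_i.
rng = np.random.default_rng(0)
X = rng.normal(size=(4, 3))   # 4 samples, 3 parameters
y = rng.normal(size=4)
w = rng.normal(size=3)

per_sample_grads = [(X[i] @ w - y[i]) * X[i] for i in range(len(X))]
summed = np.sum(per_sample_grads, axis=0)   # what the inner loop above accumulates

batch_grad = X.T @ (X @ w - y)              # gradient of the summed loss
print(np.allclose(summed, batch_grad))      # True
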