Example #1
    def rollout(self, params, render=False):
        """ Execute a rollout and returns minus cumulative reward.

        Load :params: into the controller and execute a single rollout. This
        is the main API of this class.

        :args params: parameters as a single 1D np array

        :returns: minus cumulative reward
        """
        # copy params into the controller
        if params is not None:
            load_parameters(params, self.controller)

        obs = self.env.reset()

        hidden = [torch.zeros(1, RSIZE).to(self.device) for _ in range(2)]

        cumulative = 0
        i = 0
        while True:
            obs = transform(obs.transpose(1, 2,
                                          0)).unsqueeze(0).to(self.device)
            action, hidden = self.get_action_and_transition(obs, hidden)
            action = action.item()
            if action < .33 and action > -.33:  # Don't move.
                action = np.array([False, False])
            elif action < -.33:  # Move left.
                action = np.array([True, False])
            else:  # Move right.
                action = np.array([False, True])
            obs, reward, done, _ = self.env.step(action)

            if render:
                self.env.render()

            cumulative += reward
            if done or i > self.time_limit:
                return -cumulative
            i += 1
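
A minimal usage sketch of the rollout() API above (illustrative only: the RolloutGenerator constructor arguments, the controller attribute, and flatten_parameters are borrowed from the later examples; the log directory is a placeholder):

# Hypothetical usage -- names other than rollout() are assumptions.
import torch

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
r_gen = RolloutGenerator('exp_dir', device, 1000)            # logdir, device, time_limit
params = flatten_parameters(r_gen.controller.parameters())   # flat 1D numpy array
neg_reward = r_gen.rollout(params)                           # minus cumulative reward
print('cumulative reward:', -neg_reward)
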
Example #2
def load_ckpt(model, optimizer, ckpt_path, load_model=False, load_opt=False, load_misc=False, is_freezeD=False):
    ckpt = torch.load(ckpt_path, map_location=lambda storage, loc: storage)
    if load_model:
        if is_freezeD:
            mismatch_names = misc.load_parameters(src=ckpt["state_dict"],
                                                  dst=model.state_dict(),
                                                  strict=False)
            print("The following parameters/buffers do not match with the ones of the pre-trained model:", mismatch_names)
        else:
            model.load_state_dict(ckpt["state_dict"], strict=False)

    if load_opt:
        optimizer.load_state_dict(ckpt["optimizer"])
        for state in optimizer.state.values():
            for k, v in state.items():
                if isinstance(v, torch.Tensor):
                    state[k] = v.cuda()

    if load_misc:
        seed = ckpt["seed"]
        run_name = ckpt["run_name"]
        step = ckpt["step"]
        try:
            aa_p = ckpt["aa_p"]
        except KeyError:  # older checkpoints store this under a different key
            aa_p = ckpt["ada_p"]
        best_step = ckpt["best_step"]
        best_fid = ckpt["best_fid"]

        try:
            epoch = ckpt["epoch"]
        except KeyError:
            epoch = 0
        try:
            topk = ckpt["topk"]
        except KeyError:
            topk = "initialize"
        try:
            best_ckpt_path = ckpt["best_fid_checkpoint_path"]
        except KeyError:
            best_ckpt_path = ckpt["best_fid_ckpt"]
        return seed, run_name, step, epoch, topk, aa_p, best_step, best_fid, best_ckpt_path
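
A hedged usage sketch of load_ckpt() (the model, optimizer, and checkpoint path are placeholders; only the keyword flags and the returned tuple come from the function above):

# Illustrative only: resume a run from a saved checkpoint.
seed, run_name, step, epoch, topk, aa_p, best_step, best_fid, best_ckpt_path = \
    load_ckpt(model, optimizer, "checkpoints/current.pth",
              load_model=True, load_opt=True, load_misc=True)
print("Resuming run '{}' at step {} (best FID {} at step {})".format(run_name, step, best_fid, best_step))
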
Example #3
def train_explorer(logdir,
                   epochs=10,
                   n_samples=4,
                   pop_size=4,
                   display=True,
                   max_workers=10,
                   target_return=950):  # reward threshold used for early stopping below
    results = {}
    results['best'] = []
    # multiprocessing variables
    num_workers = min(max_workers, n_samples * pop_size)
    time_limit = 1000

    # create tmp dir if it does not exist, and clean it if it does
    tmp_dir = join(logdir, 'tmp_exp')
    if not exists(tmp_dir):
        mkdir(tmp_dir)
    else:
        for fname in listdir(tmp_dir):
            unlink(join(tmp_dir, fname))

    # create exp dir if it does not exist
    explore_dir = join(logdir, 'explore')
    if not exists(explore_dir):
        mkdir(explore_dir)

    ################################################################################
    #                           Thread routines                                    #
    ################################################################################
    def slave_routine(p_queue, r_queue, e_queue, p_index):
        """ Thread routine.

        Threads interact with p_queue, the parameters queue, r_queue, the result
        queue, and e_queue, the end queue. They pull parameters from p_queue, execute
        the corresponding rollout, then place the result in r_queue.

        Each parameter has its own unique id. Parameters are pulled as tuples
        (s_id, params) and results are pushed as (s_id, result).  The same
        parameter can appear multiple times in p_queue, displaying the same id
        each time.

        As soon as e_queue is non-empty, the thread terminates.

        When multiple gpus are involved, the assigned gpu is determined by the
        process index p_index (gpu = p_index % n_gpus).

        :args p_queue: queue containing couples (s_id, parameters) to evaluate
        :args r_queue: where to place results (s_id, results)
        :args e_queue: as soon as not empty, terminate
        :args p_index: the process index
        """
        # init routine
        gpu = p_index % torch.cuda.device_count()
        device = torch.device(
            'cuda:{}'.format(gpu) if torch.cuda.is_available() else 'cpu')

        # redirect streams
        sys.stdout = open(join(tmp_dir, str(getpid()) + '.out'), 'a')
        sys.stderr = open(join(tmp_dir, str(getpid()) + '.err'), 'a')

        with torch.no_grad():
            r_gen = RolloutGenerator(logdir, device, time_limit)

            while e_queue.empty():
                if p_queue.empty():
                    sleep(.1)
                else:
                    s_id, params = p_queue.get()
                    r_queue.put((s_id, r_gen.rollout(params)))

    ################################################################################
    #                Define queues and start workers                               #
    ################################################################################
    p_queue = Queue()
    r_queue = Queue()
    e_queue = Queue()

    for p_index in range(num_workers):
        Process(target=slave_routine,
                args=(p_queue, r_queue, e_queue, p_index)).start()

    ################################################################################
    #                           Evaluation                                         #
    ################################################################################
    def evaluate(solutions, results, rollouts=100):
        """ Give current controller evaluation.

        Evaluation is minus the cumulative reward averaged over rollout runs.

        :args solutions: CMA set of solutions
        :args results: corresponding results
        :args rollouts: number of rollouts

        :returns: minus the averaged cumulative reward
        """
        index_min = np.argmin(results)
        best_guess = solutions[index_min]
        restimates = []

        for s_id in range(rollouts):
            p_queue.put((s_id, best_guess))

        print("Evaluating...")
        for _ in tqdm(range(rollouts)):
            while r_queue.empty():
                sleep(.1)
            restimates.append(r_queue.get()[1])

        return best_guess, np.mean(restimates), np.std(restimates)

    ################################################################################
    #                           Launch CMA                                         #
    ################################################################################
    controller = Controller(LSIZE, RSIZE, ASIZE)  # dummy instance

    # define current best and load parameters
    cur_best = None
    ctrl_file = join(explore_dir, 'best.tar')
    print("Attempting to load previous best...")
    if exists(ctrl_file):
        state = torch.load(ctrl_file, map_location={'cuda:0': 'cpu'})
        cur_best = -state['reward']
        controller.load_state_dict(state['state_dict'])
        print("Previous best was {}...".format(-cur_best))

    parameters = controller.parameters()
    es = cma.CMAEvolutionStrategy(flatten_parameters(parameters), 0.1,
                                  {'popsize': pop_size})

    epoch = 0
    log_step = 3
    while not es.stop():
        if cur_best is not None and -cur_best > target_return:
            print("Already better than target, breaking...")
            break

        r_list = [0] * pop_size  # result list
        solutions = es.ask()

        # push parameters to queue
        for s_id, s in enumerate(solutions):
            for _ in range(n_samples):
                p_queue.put((s_id, s))

        # retrieve results
        if display:
            pbar = tqdm(total=pop_size * n_samples)
        for _ in range(pop_size * n_samples):
            while r_queue.empty():
                sleep(.1)
            r_s_id, r = r_queue.get()
            r_list[r_s_id] += r / n_samples
            if display:
                pbar.update(1)
        if display:
            pbar.close()

        es.tell(solutions, r_list)
        es.disp()

        # evaluation and saving
        if epoch % log_step == log_step - 1:
            best_params, best, std_best = evaluate(solutions, r_list)

            # log the best
            results['best'].append(best)

            print("Current evaluation: {}".format(best))
            if not cur_best or cur_best > best:
                cur_best = best
                print("Saving new best with value {}+-{}...".format(
                    -cur_best, std_best))
                load_parameters(best_params, controller)
                torch.save(
                    {
                        'epoch': epoch,
                        'reward': -cur_best,
                        'state_dict': controller.state_dict()
                    }, join(explore_dir, 'best.tar'))

            if -best > target_return:
                print(
                    "Terminating controller training with value {}...".format(
                        best))
                break

        epoch += 1

    es.result_pretty()
    e_queue.put('EOP')

    return results
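
The CMA-ES loop above treats the controller as a flat 1D vector: flatten_parameters() turns controller.parameters() into a single numpy array for es.ask()/es.tell(), and load_parameters() copies a candidate vector back into the module before a rollout. A minimal sketch of what these two helpers could look like (an assumption for illustration, not necessarily the implementation imported here):

import numpy as np
import torch

def flatten_parameters(params):
    """Concatenate all parameter tensors into one flat 1D numpy array."""
    return np.concatenate([p.detach().cpu().numpy().ravel() for p in params])

def load_parameters(flat, module):
    """Copy a flat 1D array back into the module's parameters, in order."""
    flat = torch.as_tensor(np.asarray(flat), dtype=torch.float32)
    offset = 0
    for p in module.parameters():
        n = p.numel()
        p.data.copy_(flat[offset:offset + n].view_as(p))
        offset += n
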
Example #4
            pbar.update(1)
    if args.display:
        pbar.close()
    count_g += 1
    es.tell(solutions, r_list)
    es.disp()

    # evaluation and saving
    if epoch % log_step == log_step - 1:
        best_params, best, std_best = evaluate(solutions, r_list)
        print("Current evaluation: {}".format(best))
        if not cur_best or cur_best > best:
            cur_best = best
            print("Saving new best with value {}+-{}...".format(
                -cur_best, std_best))
            load_parameters(best_params, controller)
            torch.save(
                {
                    'epoch': epoch,
                    'reward': -cur_best,
                    'state_dict': controller.state_dict()
                }, join(ctrl_dir, 'best.tar'))
        if -best > args.target_return:
            print("Terminating controller training with value {}...".format(
                best))
            break
    torch.save(
        {
            'epoch': epoch,
            'reward': -cur_best,
            'state_dict': controller.state_dict()
Example #5
        if args.display:
            pbar.update(1)
    if args.display:
        pbar.close()

    es.tell(solutions, r_list)
    es.disp()

    # evaluation and saving
    if epoch % log_step == log_step - 1:
        best_params, best, std_best = evaluate(solutions, r_list)
        print("Current evaluation: {}".format(best))
        if not cur_best or cur_best > best:
            cur_best = best
            print("Saving new best with value {}+-{}...".format(-cur_best, std_best))
            load_parameters(best_params, controller)
            torch.save(
                {'epoch': epoch,
                 'reward': - cur_best,
                 'state_dict': controller.state_dict()},
                join(ctrl_dir, 'best.tar'))
        if - best > args.target_return:
            print("Terminating controller training with value {}...".format(best))
            break


    epoch += 1

es.result_pretty()
e_queue.put('EOP')
Example #6
def run(args):
    p_queue = Queue()
    r_queue = Queue()
    e_queue = Queue()

    latent = 32
    mixture = 256
    size = latent + mixture
    controller = Controller(size, 3)

    for i in range(args.max_workers):
        Process(target=slave_routine,
                args=(p_queue, r_queue, e_queue, i, args.logdir)).start()

    cur_best = None
    savefile = args.logdir/'best.tar'
    if savefile.exists():
        print(f'Loading from {savefile}')
        state = torch.load(savefile.as_posix(), map_location={'cuda:0': 'cpu'})
        cur_best = -state['reward']
        controller.load_state_dict(state['state_dict'])


    parameters = controller.parameters()
    sigma = 0.1
    es = cma.CMAEvolutionStrategy(flatten_parameters(parameters), sigma,
                                  {'popsize': args.pop_size})

    epoch = 0
    while not es.stop():
        if cur_best is not None and -cur_best > args.target_return:
            print('Already better than target, breaking...')
            break

        r_list = [0] * args.pop_size  # result list
        solutions = es.ask()

        # push parameters to queue
        for s_id, s in enumerate(solutions):
            for _ in range(args.n_samples):
                p_queue.put((s_id, s))

        # Retrieve results
        if args.display:
            pbar = tqdm(total=args.pop_size * args.n_samples)
        for _ in range(args.pop_size * args.n_samples):
            while r_queue.empty():
                sleep(.1)
            r_s_id, r = r_queue.get()
            r_list[r_s_id] += r / args.n_samples
            if args.display:
                pbar.update(1)
        if args.display:
            pbar.close()

        es.tell(solutions, r_list)
        es.disp()

        # CMA-ES seeks to minimize, so we want to multiply the reward we
        # get in a rollout by -1.

        best_params, best, std_best = evaluate(solutions, r_list, p_queue,
                                               r_queue)
        if (not cur_best) or (cur_best > best):
            cur_best = best
            print(f'Saving new best with value {-cur_best}+-{std_best}')
            load_parameters(best_params, controller)
            torch.save({'epoch': epoch,
                        'reward': -cur_best,
                        'state_dict': controller.state_dict()},
                       savefile)
            # Save after every epoch
            torch.save(controller.state_dict(), f'{controller_pt}')
        if -best > args.target_return:
            print(f'Terminating controller training with value {best}...')
            break
        epoch += 1

    es.result_pretty()
    e_queue.put('EOP')
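
As the comment inside the loop above notes, CMA-ES minimizes, so rollout() returns minus the cumulative reward and the saved 'reward' field flips the sign back. A tiny illustration with made-up numbers:

# Illustrative numbers only: three candidates with cumulative rewards 300, 650, 120.
r_list = [-300.0, -650.0, -120.0]        # what rollout() returns (minus reward)
best = min(r_list)                       # CMA-ES favours the minimum: -650.0
print('best cumulative reward:', -best)  # 650.0 -- the value logged and saved
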
Example #7
def controller_train_proc(ctrl_dir,
                          controller,
                          vae,
                          mdrnn,
                          target_return=950,
                          skip_train=False,
                          display=True):
    step_log('4-2. controller_train_proc START!!')
    # define current best and load parameters
    cur_best = None
    if not os.path.exists(ctrl_dir):
        os.mkdir(ctrl_dir)
    ctrl_file = os.path.join(ctrl_dir, 'best.tar')

    p_queue = Queue()
    r_queue = Queue()
    #e_queue = Queue()   # pipaek : not necessary if not multiprocessing

    print("Attempting to load previous best...")
    if os.path.exists(ctrl_file):
        #state = torch.load(ctrl_file, map_location={'cuda:0': 'cpu'})
        state = torch.load(ctrl_file)
        cur_best = -state['reward']
        controller.load_state_dict(state['state_dict'])
        print("Previous best was {}...".format(-cur_best))

    if skip_train:
        return  # pipaek : for when you want to skip improving the model via training..

    def evaluate(solutions,
                 results,
                 rollouts=100):  # pipaek : rollout 100 -> 10 , originally 100
        """ Give current controller evaluation.

        Evaluation is minus the cumulative reward averaged over rollout runs.

        :args solutions: CMA set of solutions
        :args results: corresponding results
        :args rollouts: number of rollouts

        :returns: minus the averaged cumulative reward
        """
        index_min = np.argmin(results)
        best_guess = solutions[index_min]
        restimates = []

        for s_id in range(rollouts):
            print('p_queue.put(), s_id=%d' % s_id)
            p_queue.put((s_id, best_guess))
            print('>>>rollout_routine!!')
            rollout_routine()  # pipaek : here too, process the job right after p_queue.put..

        print(">>>Evaluating...")
        for _ in tqdm(range(rollouts)):
            #while r_queue.empty():
            #    sleep(.1)   # pipaek : since this is not multi-process
            if not r_queue.empty():  # pipaek : 20180718 check to avoid getting stuck in r_queue.get()!!
                #print('r_queue.get()')
                #restimates.append(r_queue.get()[1])
                r_s_id, r = r_queue.get()
                print(
                    'in evaluate r_queue.get() r_s_id=%d, r_queue remain=%d' %
                    (r_s_id, r_queue.qsize()))
                restimates.append(r)
            else:
                print('r_queue.empty() -> break!!')
                break

        return best_guess, np.mean(restimates), np.std(restimates)

    def rollout_routine():
        """ Thread routine.

        Threads interact with p_queue, the parameters queue, r_queue, the result
        queue, and e_queue, the end queue. They pull parameters from p_queue, execute
        the corresponding rollout, then place the result in r_queue.

        Each parameter has its own unique id. Parameters are pulled as tuples
        (s_id, params) and results are pushed as (s_id, result).  The same
        parameter can appear multiple times in p_queue, displaying the same id
        each time.

        As soon as e_queue is non-empty, the thread terminates.

        When multiple gpus are involved, the assigned gpu is determined by the
        process index p_index (gpu = p_index % n_gpus).

        :args p_queue: queue containing couples (s_id, parameters) to evaluate
        :args r_queue: where to place results (s_id, results)
        :args e_queue: as soon as not empty, terminate
        :args p_index: the process index
        """
        # init routine
        #gpu = p_index % torch.cuda.device_count()
        #device = torch.device('cuda:{}'.format(gpu) if torch.cuda.is_available() else 'cpu')

        # redirect streams
        #if not os.path.exists(tmp_dir):
        #    os.mkdir(tmp_dir)

        #sys.stdout = open(os.path.join(tmp_dir, 'rollout.out'), 'a')
        #sys.stderr = open(os.path.join(tmp_dir, 'rollout.err'), 'a')

        with torch.no_grad():
            r_gen = RolloutGenerator(vae, mdrnn, controller, device,
                                     rollout_time_limit)

            while not p_queue.empty():
                print('in rollout_routine, p_queue.get()')
                s_id, params = p_queue.get()
                print('r_queue.put() sid=%d' % s_id)
                r_queue.put((s_id, r_gen.rollout(params)))
                print('r_gen.rollout OK, r_queue.put()')
                #r_queue.qsize()

    parameters = controller.parameters()
    es = cma.CMAEvolutionStrategy(flatten_parameters(parameters), 0.1,
                                  {'popsize': C_POP_SIZE})
    print("CMAEvolutionStrategy start OK!!")

    epoch = 0
    log_step = 3
    while not es.stop():
        print("--------------------------------------")
        print("CURRENT EPOCH = %d" % epoch)
        if cur_best is not None and -cur_best > target_return:
            print("Already better than target, breaking...")
            break

        r_list = [0] * C_POP_SIZE  # result list
        solutions = es.ask()
        print("CMAEvolutionStrategy-ask")

        # push parameters to queue
        for s_id, s in enumerate(solutions):  # pipaek : this loop runs C_POP_SIZE times.
            #for _ in range(C_POP_SIZE * C_N_SAMPLES):
            for _ in range(C_N_SAMPLES):
                print('in controller_train_proc p_queue.put() s_id : %d' %
                      s_id)
                p_queue.put((s_id, s))
                #print("p_queue.put %d" % s_id)
                rollout_routine()  # pipaek : get right after p_queue.put, run the rollout, then put the result in r_queue.
                print("rollout_routine OK, r_queue size=%d" % r_queue.qsize())

        # retrieve results
        if display:
            pbar = tqdm(total=C_POP_SIZE * C_N_SAMPLES)
        #for idx in range(C_POP_SIZE * C_N_SAMPLES):
        while not r_queue.empty():  # pipaek : 20180718 changed the for loop to a while loop to avoid hanging forever when r_queue.get() cannot proceed.
            #while r_queue.empty():
            #    sleep(.1)
            try:
                r_s_id, r = r_queue.get()
                print(
                    'in controller_train_proc r_queue.get() r_s_id=%d, r_queue remain=%d'
                    % (r_s_id, r_queue.qsize()))
                r_list[r_s_id] += r / C_N_SAMPLES
                if display:
                    pbar.update(1)
            except IndexError as err:
                print('IndexError during r_queue.get()')
                print('cur r_list size:%d, index:%d' % (len(r_list), r_s_id))
        if display:
            pbar.close()

        es.tell(solutions, r_list)  # pipaek : update the solution array with the r_list results..
        es.disp()

        # evaluation and saving
        if epoch % log_step == log_step - 1:
            print(">>>> TRYING EVALUATION, CURRENT EPOCH = %d" % epoch)
            best_params, best, std_best = evaluate(
                solutions, r_list, rollouts=100
            )  # pipaek : for evaluation, do only 10 rollouts.. originally 100
            print("Current evaluation: {}".format(best))
            if not cur_best or cur_best > best:
                cur_best = best
                print("Saving new best with value {}+-{}...".format(
                    -cur_best, std_best))
                load_parameters(best_params, controller)
                torch.save(
                    {
                        'epoch': epoch,
                        'reward': -cur_best,
                        'state_dict': controller.state_dict()
                    }, os.path.join(ctrl_dir, 'best.tar'))
            if -best > target_return:
                print(
                    "Terminating controller training with value {}...".format(
                        best))
                break

        epoch += 1

    print("es.stop!!")
    es.result_pretty()
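
The pipaek comments above describe a single-process pattern: a job is put on p_queue, immediately run by rollout_routine(), and the result is drained from r_queue. That pattern, reduced to a self-contained sketch with plain queues and a stand-in rollout (all names below are illustrative):

from queue import Queue

p_queue, r_queue = Queue(), Queue()

def rollout_stub(params):
    # stand-in for r_gen.rollout(params): return minus the cumulative reward
    return -float(sum(params))

for s_id, params in enumerate([[1, 2], [3, 4], [5, 6]]):
    p_queue.put((s_id, params))
    while not p_queue.empty():           # process immediately, no worker processes
        job_id, job_params = p_queue.get()
        r_queue.put((job_id, rollout_stub(job_params)))

while not r_queue.empty():               # drain results
    s_id, neg_reward = r_queue.get()
    print(s_id, neg_reward)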