Example #1
def main(args):
    """ Set up the graph, the agents, and run the agents in parallel. """
    args.batch_size = None
    logger.debug('CONFIGURATION: {}'.format(args))
    if args.env == 'GYM':
        from environments import atari_environment
        num_actions = atari_environment.get_num_actions(args.game)
        input_shape = atari_environment.get_input_shape(args.game)
    else:
        num_actions = get_num_actions(args.rom_path, args.game)

    args.summ_base_dir = '/tmp/summary_logs/{}/{}'.format(
        args.game, time.strftime('%m.%d/%H.%M'))
    logger.info('logging summaries to {}'.format(args.summ_base_dir))

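    #look up the learner and network classes registered for args.alg_type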
    Learner, Network = ALGORITHMS[args.alg_type]
    network = Network({
        'name': 'shared_vars_network',
        'input_shape': input_shape,
        'num_act': num_actions,
        'args': args
    })

    #initialize shared variables
    args.learning_vars = SharedVars(network)

    args.opt_state = SharedVars(
        network, opt_type=args.opt_type,
        lr=args.initial_lr) if args.opt_mode == 'shared' else None

    args.batch_opt_state = SharedVars(
        network, opt_type=args.opt_type,
        lr=args.initial_lr) if args.opt_mode == 'shared' else None

    if args.alg_type in ['q', 'sarsa', 'dueling', 'dqn-cts']:
        args.target_vars = SharedVars(network)
        args.target_update_flags = SharedFlags(args.num_actor_learners)

    args.barrier = Barrier(args.num_actor_learners)
    args.episode_counter = SharedCounter(0)
    args.global_step = SharedCounter(0)
    args.num_actions = num_actions

    #spin up processes and block
    if (args.visualize == 2): args.visualize = 0
    actor_learners = []
    experience_queue = Queue()
    for i in xrange(args.num_actor_learners):
        if (args.visualize == 2) and (i == args.num_actor_learners - 1):
            args.visualize = 1

        args.actor_id = i

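        #draw this worker's random seed from a wall-clock-seeded RNG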
        rng = np.random.RandomState(int(time.time()))
        args.random_seed = rng.randint(1000)

        #pass in gpu name to learner here and wrap each learner in device context
        args.queue = experience_queue  #only used by TRPO
        args.input_shape = input_shape
        actor_learners.append(Learner(args))
        actor_learners[-1].start()

    for t in actor_learners:
        t.join()

    logger.info('All training threads finished!')
Example #2
    args = parser.parse_args()
    setup_loggings(args)
    torch.manual_seed(args.seed)

    env = create_atari_env(args.env_name)
    shared_model = ActorCritic(env.observation_space.shape[0],
                               env.action_space)
    shared_model.share_memory()

    if args.no_shared:
        optimizer = None
    else:
        optimizer = my_optim.SharedAdam(shared_model.parameters(), lr=args.lr)
        optimizer.share_memory()

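    # global step counter shared by the test process and the training processes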
    gl_step_cnt = SharedCounter()

    if not args.debug:
        processes = []

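        # start one evaluation (test) process alongside args.num_processes training processes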
        p = mp.Process(target=test,
                       args=(args.num_processes, args, shared_model,
                             gl_step_cnt))
        p.start()
        processes.append(p)
        for rank in range(0, args.num_processes):
            p = mp.Process(target=train,
                           args=(rank, args, shared_model, gl_step_cnt,
                                 optimizer))
            p.start()
            processes.append(p)
        # wait for all processes to finish
        for p in processes:
            p.join()
Example #3
def main(args):
    """ Set up the graph, the agents, and run the agents in parallel. """
    args.batch_size = None
    logger.debug('CONFIGURATION: {}'.format(args))
    if args.env == 'GYM':
        from environments import atari_environment
        num_actions = atari_environment.get_num_actions(args.game)
        input_shape = atari_environment.get_input_shape(args.game)
    else:
        num_actions = get_num_actions(args.rom_path, args.game)

    args.summ_base_dir = '/tmp/summary_logs/{}/{}'.format(
        args.game, time.strftime('%m.%d/%H.%M'))
    logger.info('logging summaries to {}'.format(args.summ_base_dir))

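    #registry mapping each algorithm name to its (learner, network) classes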
    algorithms = {
        'q': (NStepQLearner, QNetwork),
        'sarsa': (OneStepSARSALearner, QNetwork),
        'dueling': (DuelingLearner, DuelingNetwork),
        'a3c': (A3CLearner, PolicyValueNetwork),
        'a3c-lstm': (A3CLSTMLearner, PolicyValueNetwork),
        'a3c-sequence-decoder':
        (ActionSequenceA3CLearner, SequencePolicyVNetwork),
        'pgq': (PGQLearner, PolicyValueNetwork),
        'pgq-lstm': (PGQLSTMLearner, PolicyValueNetwork),
        'trpo': (TRPOLearner, PolicyNetwork),
        'cem': (CEMLearner, PolicyNetwork),
        'q-cts': (PseudoCountQLearner, QNetwork),
        'a3c-cts': (PseudoCountA3CLearner, PolicyValueNetwork),
        'a3c-repeat': (ARA3CLearner, PolicyRepeatNetwork),
    }

    assert args.alg_type in algorithms, 'alg_type `{}` not implemented'.format(
        args.alg_type)
    Learner, Network = algorithms[args.alg_type]

    network = Network({
        'name': 'shared_vars_network',
        'input_shape': input_shape,
        'num_act': num_actions,
        'args': args
    })

    args.learning_vars = SharedVars(num_actions, args.alg_type, network)

    args.opt_state = SharedVars(
        num_actions,
        args.alg_type,
        network,
        opt_type=args.opt_type,
        lr=args.initial_lr) if args.opt_mode == 'shared' else None

    args.batch_opt_state = SharedVars(
        num_actions,
        args.alg_type,
        network,
        opt_type=args.opt_type,
        lr=args.initial_lr) if args.opt_mode == 'shared' else None

    if args.alg_type in ['q', 'sarsa', 'dueling', 'q-cts']:
        args.target_vars = SharedVars(num_actions, args.alg_type, network)
        args.target_update_flags = SharedFlags(args.num_actor_learners)

    args.barrier = Barrier(args.num_actor_learners)
    args.episode_counter = SharedCounter(0)
    args.global_step = SharedCounter(0)
    args.num_actions = num_actions

    if (args.visualize == 2): args.visualize = 0
    actor_learners = []
    experience_queue = Queue()
    for i in xrange(args.num_actor_learners):
        if (args.visualize == 2) and (i == args.num_actor_learners - 1):
            args.visualize = 1

        args.actor_id = i

        rng = np.random.RandomState(int(time.time()))
        args.random_seed = rng.randint(1000)

        #pass in gpu name to learner here and wrap each learner in device context
        args.queue = experience_queue  #only used by TRPO
        args.input_shape = input_shape
        actor_learners.append(Learner(args))
        actor_learners[-1].start()

    for t in actor_learners:
        t.join()

    logger.info('All training threads finished!')
Example #4
def main(args):
    """ Set up the graph, the agents, and run the agents in parallel. """
    args.batch_size = None
    logger.debug('CONFIGURATION: {}'.format(args))
    if args.env == 'GYM':
        from environments import atari_environment
        num_actions, action_space, _ = atari_environment.get_actions(args.game)
        input_shape = atari_environment.get_input_shape(args.game)
    elif args.env == 'DOOM':
        from environments.vizdoom_env import VizDoomEnv
        env = VizDoomEnv(args.doom_cfg, args.game, args.is_train)
        num_actions, action_space = env.get_actions()
        input_shape = env.get_input_shape()
    else:
        num_actions = get_num_actions(args.rom_path, args.game)

    args.action_space = action_space
    args.summ_base_dir = '/tmp/summary_logs/{}/{}'.format(
        args.game, time.strftime('%m.%d/%H.%M'))
    logger.info('logging summaries to {}'.format(args.summ_base_dir))

    Learner, Network = ALGORITHMS[args.alg_type]
    network = Network({
        'name': 'shared_vars_network',
        'input_shape': input_shape,
        'num_act': num_actions,
        'args': args
    })
    args.network = Network

    #initialize shared variables
    args.learning_vars = SharedVars(network.params)
    args.opt_state = SharedVars(
        network.params, opt_type=args.opt_type,
        lr=args.initial_lr) if args.opt_mode == 'shared' else None
    args.batch_opt_state = SharedVars(
        network.params, opt_type=args.opt_type,
        lr=args.initial_lr) if args.opt_mode == 'shared' else None

    #TODO: need to refactor so TRPO+GAE doesn't need special treatment
    if args.alg_type in ['trpo', 'trpo-continuous']:
        if args.arch == 'FC':  #add timestep feature
            vf_input_shape = [input_shape[0] + 1]
        else:
            vf_input_shape = input_shape

        baseline_network = PolicyValueNetwork(
            {
                'name': 'shared_value_network',
                'input_shape': vf_input_shape,
                'num_act': num_actions,
                'args': args
            },
            use_policy_head=False)
        args.baseline_vars = SharedVars(baseline_network.params)
        args.vf_input_shape = vf_input_shape

    if args.alg_type in ['q', 'sarsa', 'dueling', 'dqn-cts']:
        args.target_vars = SharedVars(network.params)
        args.target_update_flags = SharedFlags(args.num_actor_learners)
    if args.alg_type == 'dqn-cts':
        args.density_model_update_flags = SharedFlags(args.num_actor_learners)

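    #discard the graph built above; each learner process constructs its own graph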
    tf.reset_default_graph()
    args.barrier = Barrier(args.num_actor_learners)
    args.global_step = SharedCounter(0)
    args.num_actions = num_actions

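    #count the GPUs listed in the comma-separated CUDA_VISIBLE_DEVICES variable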
    cuda_visible_devices = os.getenv('CUDA_VISIBLE_DEVICES')
    num_gpus = 0
    if cuda_visible_devices:
        num_gpus = len(cuda_visible_devices.split(','))

    #spin up processes and block
    if (args.visualize == 2): args.visualize = 0
    actor_learners = []
    task_queue = Queue()
    experience_queue = Queue()
    seed = args.seed or np.random.randint(2**32)
    np.random.seed(seed)
    tf.set_random_seed(seed)
    for i in xrange(args.num_actor_learners):
        if (args.visualize == 2) and (i == args.num_actor_learners - 1):
            args.visualize = 1

        args.actor_id = i
        args.device = '/gpu:{}'.format(i % num_gpus) if num_gpus else '/cpu:0'

        args.random_seed = seed + i

        #only used by TRPO
        args.task_queue = task_queue
        args.experience_queue = experience_queue

        args.input_shape = input_shape
        actor_learners.append(Learner(args))
        actor_learners[-1].start()

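    #block until every learner finishes; a KeyboardInterrupt terminates them all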
    try:
        for t in actor_learners:
            t.join()
    except KeyboardInterrupt:
        #Terminate with extreme prejudice
        for t in actor_learners:
            t.terminate()

    logger.info('All training threads finished!')
    logger.info('Use seed={} to reproduce'.format(seed))
Example #5
def main(args):
    """ Set up the graph, the agents, and run the agents in parallel. """
    args.batch_size = None
    logger.debug('CONFIGURATION: {}'.format(args))
    if args.env == 'GYM':
        from environments import atari_environment
        num_actions, action_space, _ = atari_environment.get_actions(args.game)
        input_shape = atari_environment.get_input_shape(args.game)
    else:
        num_actions = get_num_actions(args.rom_path, args.game)

    args.action_space = action_space
    args.summ_base_dir = '/tmp/summary_logs/{}/{}'.format(args.game, time.strftime('%m.%d/%H.%M'))
    logger.info('logging summaries to {}'.format(args.summ_base_dir))

    Learner, Network = ALGORITHMS[args.alg_type]
    #print("Learner is: {}".format(Learner))

    if args.alg_type != 'AE':
        network = Network({
            'name': 'shared_vars_network',
            'input_shape': input_shape,
            'num_act': num_actions,
            'args': args
        })
        args.network = Network

    else:
        network_lower = Network({
            'name': 'shared_vars_network_lower',
            'input_shape': input_shape,
            'num_act': num_actions,
            'args': args
        })
        args.network_lower = Network

        network_upper = Network({
            'name': 'shared_vars_network_upper',
            'input_shape': input_shape,
            'num_act': num_actions,
            'args': args
        })
        args.network_upper = Network

    ## initialize visdom server
    args.visdom = visdom.Visdom(port=args.display_port, env='AE DQN')
    #initialize shared variables
    #TODO: !!!!!! only network lower params are being use, should check out if upper is also needed !!!!!!!
    if args.alg_type != 'AE':
        args.learning_vars = SharedVars(network.params) #size, step and optimizer
        args.opt_state = SharedVars(
            network.params, opt_type=args.opt_type, lr=args.initial_lr
        ) if args.opt_mode == 'shared' else None
        args.batch_opt_state = SharedVars(
            network.params, opt_type=args.opt_type, lr=args.initial_lr
        ) if args.opt_mode == 'shared' else None
    else:
        #args.learning_vars = SharedVars(network_lower.params) #size, step and optimizer
        args.learning_vars_lower = SharedVars(network_lower.params) #size, step and optimizer
        args.learning_vars_upper = SharedVars(network_upper.params) #size, step and optimizer
        args.opt_state_lower = SharedVars(
            network_lower.params, opt_type=args.opt_type, lr=args.initial_lr
        )
        args.opt_state_upper = SharedVars(
            network_upper.params, opt_type=args.opt_type, lr=args.initial_lr
        ) if args.opt_mode == 'shared' else None
        args.batch_opt_state_lower = SharedVars(
            network_lower.params, opt_type=args.opt_type, lr=args.initial_lr
        )
        args.batch_opt_state_upper = SharedVars(
            network_upper.params, opt_type=args.opt_type, lr=args.initial_lr
        ) if args.opt_mode == 'shared' else None


    #TODO: need to refactor so TRPO+GAE doesn't need special treatment
    if args.alg_type in ['trpo', 'trpo-continuous']:
        if args.arch == 'FC': #add timestep feature
            vf_input_shape = [input_shape[0]+1]
        else:
            vf_input_shape = input_shape

        baseline_network = PolicyValueNetwork({
            'name': 'shared_value_network',
            'input_shape': vf_input_shape,
            'num_act': num_actions,
            'args': args
        }, use_policy_head=False)
        args.baseline_vars = SharedVars(baseline_network.params)
        args.vf_input_shape = vf_input_shape

    if args.alg_type in ['q', 'sarsa', 'dueling', 'dqn-cts']:
        args.target_vars = SharedVars(network.params)
        args.target_update_flags = SharedFlags(args.num_actor_learners)
    if args.alg_type in ['dqn-cts', 'a3c-cts', 'a3c-lstm-cts']: #TODO check density_model_update_flags
        args.density_model_update_flags = SharedFlags(args.num_actor_learners)

    if args.alg_type in ['AE']:
        #print("we are in main args.alg_type in [AE]")
        args.target_vars_lower = SharedVars(network_lower.params)
        args.target_vars_upper = SharedVars(network_upper.params)
        args.target_update_flags = SharedFlags(args.num_actor_learners)
        args.density_model_update_flags = SharedFlags(args.num_actor_learners)

    tf.reset_default_graph()
    args.barrier = Barrier(args.num_actor_learners)
    args.global_step = SharedCounter(0)
    #args.shared_visualizer = Visualizer(args.num_actor_learners) ## TODO: make it shared between the processes
    args.num_actions = num_actions

    cuda_visible_devices = os.getenv('CUDA_VISIBLE_DEVICES')
    num_gpus = 0
    if cuda_visible_devices:
        num_gpus = len(cuda_visible_devices.split(','))

    #spin up processes and block
    # if (args.visualize == 2): args.visualize = 0
    actor_learners = []
    task_queue = Queue()
    experience_queue = Queue()
    seed = args.seed or np.random.randint(2**32)
    np.random.seed(seed)
    tf.set_random_seed(seed)
    visualize = args.visualize
    for i in range(args.num_actor_learners):
        if (visualize == 2) and (i == args.num_actor_learners - 1):
            args.visualize = 1
        else:
            args.visualize = 0

        args.actor_id = i
        args.device = '/gpu:{}'.format(i % num_gpus) if num_gpus else '/cpu:0'

        args.random_seed = seed + i

        #only used by TRPO
        args.task_queue = task_queue
        args.experience_queue = experience_queue

        args.input_shape = input_shape
        actor_learners.append(Learner(args))
        actor_learners[-1].start()
        if i == 1:
            setup_kill_signal_handler(actor_learners[-1])

    try:
        for t in actor_learners:
            #dump this learner's plot data before joining
            file_name = "myfile_" + str(t)
            with open(file_name, 'w') as csv_file:
                wr = csv.writer(csv_file, quoting=csv.QUOTE_ALL)
                wr.writerow(t.vis.plot_data['X'])
                wr.writerow(t.vis.plot_data['Y'])
                print ('[%s]' % ', '.join(map(str, t.vis.plot_data['X'])))
            t.join()
    except KeyboardInterrupt:
        #Terminate with extreme prejudice
        for t in actor_learners:
            t.terminate()

    logger.info('All training threads finished!')
    logger.info('Use seed={} to reproduce'.format(seed))
Example #6
    args.input_shape = env.observation_space.shape
    #import ipdb; ipdb.set_trace()
    args.num_actions = env.action_space.n
    #args.num_actions = 3
    env.close()
    setup_loggings(args)
    shared_model = Model(args.input_shape, args.num_actions)
    shared_model.share_memory()

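    # unless args.no_shared is set, use a single Adam optimizer whose state lives in shared memory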
    if args.no_shared:
        optimizer = None
    else:
        optimizer = my_optim.SharedAdam(shared_model.parameters(), lr=args.lr)
        optimizer.share_memory()
    
    shared_stepcount = SharedCounter()
    
    if not args.debug:
        processes = []

        p = mp.Process(target=test, args=(args.num_processes, args, 
            shared_model, Model, make_env, shared_stepcount))
        p.start()
        processes.append(p)
        for rank in range(0, args.num_processes):
            p = mp.Process(target=train, args=(rank, args, shared_model, 
                Model, make_env, shared_stepcount, optimizer))
            p.start()
            processes.append(p)
        for p in processes:
            p.join()
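
These examples all rely on a SharedCounter helper whose definition is not shown in the snippets. As a rough illustration only, a process-safe counter of this kind can be built on multiprocessing.Value; the class name, method names, and typecode below are assumptions, not the projects' actual implementation:

#minimal sketch of a shared counter; the real SharedCounter in these projects may differ
from multiprocessing import Lock, Value


class SharedCounter(object):
    def __init__(self, initval=0):
        #a long integer stored in shared memory, visible to all child processes
        self.val = Value('l', initval)
        self.lock = Lock()

    def increment(self, n=1):
        #atomically add n and return the updated value
        with self.lock:
            self.val.value += n
            return self.val.value

    def value(self):
        with self.lock:
            return self.val.value

Assuming such an interface, a worker would bump the counter with something like shared_stepcount.increment() after each environment step and read it back with shared_stepcount.value().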