Example #1
def train_lm(testing=False):
    data = DataReader(token_to_id_path, segment_sepparator)

    # Create model nodes for the source and target inputs
    input_sequence, label_sequence = create_inputs(data.vocab_dim)

    # Create the model. It has three output nodes:
    # z: the input to softmax, which provides the latent representation of the next token
    # cross_entropy: the training criterion
    # error: a binary indicator of whether the model predicts the correct token
    z, cross_entropy, error = create_model(input_sequence, label_sequence, data.vocab_dim, hidden_dim)

    # For measurement we use the (built-in) full softmax.
    full_ce = C.cross_entropy_with_softmax(z, label_sequence)

    # print out some useful training information
    log_number_of_parameters(z) ; print()
    
    # Run the training loop
    num_trained_samples = 0
    num_trained_samples_since_last_report = 0

    # Instantiate the trainer object to drive the model training
    lr_schedule = C.learning_parameter_schedule_per_sample(learning_rate)
    momentum_schedule = C.momentum_schedule_per_sample(momentum_per_sample)
    gradient_clipping_with_truncation = True
    learner = momentum_sgd(z.parameters, lr_schedule, momentum_schedule,
                            gradient_clipping_threshold_per_sample=clipping_threshold_per_sample,
                            gradient_clipping_with_truncation=gradient_clipping_with_truncation)
    trainer = Trainer(z, (cross_entropy, error), learner)

    last_avg_ce = 0
    for epoch_count in range(num_epochs):
        for features, labels, token_count in data.minibatch_generator(train_file_path, sequence_length, sequences_per_batch):
            arguments = ({input_sequence : features, label_sequence : labels})

            t_start = timeit.default_timer()
            trainer.train_minibatch(arguments)
            t_end = timeit.default_timer()

            samples_per_second = token_count / (t_end - t_start)

            # Print progress report every num_samples_between_progress_report samples

            if num_trained_samples_since_last_report >= num_samples_between_progress_report or num_trained_samples == 0:
                av_ce = average_cross_entropy(full_ce, input_sequence, label_sequence, data)
                print_progress(samples_per_second, av_ce, num_trained_samples, t_start)
                num_trained_samples_since_last_report = 0
                last_avg_ce = av_ce

            num_trained_samples += token_count
            num_trained_samples_since_last_report += token_count

        if not testing:
            # after each epoch save the model
            model_filename = "models/lm_epoch%d.dnn" % epoch_count
            z.save(model_filename)
            print("Saved model to '%s'" % model_filename)

    return last_avg_ce
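
train_lm() reads its configuration from module-level names (data paths, model size, schedule constants). Below is a minimal sketch of those globals with placeholder values; only the names are taken from the function above, and every value and path is an assumption, not the original configuration. DataReader, create_inputs, create_model, average_cross_entropy and print_progress are project-local helpers not shown here.

import timeit
import cntk as C
from cntk import Trainer
from cntk.learners import momentum_sgd
from cntk.logging import log_number_of_parameters

# Placeholder settings assumed by train_lm(); values are illustrative only.
token_to_id_path = "data/token2id.txt"
train_file_path = "data/train.txt"
segment_sepparator = "</s>"            # (sic) name as used by the function
hidden_dim = 256
sequence_length = 100
sequences_per_batch = 32
num_epochs = 10
learning_rate = 0.001
momentum_per_sample = 0.999
clipping_threshold_per_sample = 5.0
num_samples_between_progress_report = 100000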
Example #2
    def __init__(self, input_shape, nb_actions,
                 gamma=0.99, explorer=LinearEpsilonAnnealingExplorer(1, 0.1, 1000000),
                 learning_rate=0.00025, momentum=0.95, minibatch_size=32,
                 memory_size=500000, train_after=10000, train_interval=4,
                 target_update_interval=10000, monitor=True):
        self.input_shape = input_shape
        self.nb_actions = nb_actions
        self.gamma = gamma
        self._train_after = train_after
        self._train_interval = train_interval
        self._target_update_interval = target_update_interval
        self._explorer = explorer
        self._minibatch_size = minibatch_size
        self._history = History(input_shape)
        self._memory = RepMem(memory_size, input_shape[1:], 4)
        self._num_actions_taken = 0
        self._episode_rewards, self._episode_q_means, self._episode_q_stddev = [], [], []

        with default_options(activation=relu, init=he_uniform()):
            self._action_value_net = Sequential([
                Dense(input_shape, init=he_uniform(scale=0.01)),
                Dense(input_shape),
                Dense(nb_actions, activation=None, init=he_uniform(scale=0.01))])

        self._action_value_net.update_signature(Tensor[input_shape])

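        # Target network: a frozen copy of the action-value network, used to compute
        # Q-value targets and refreshed less frequently for training stability.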
        self._target_net = self._action_value_net.clone(CloneMethod.freeze)


        @Function
        @Signature(post_states=Tensor[input_shape], rewards=Tensor[()], terminals=Tensor[()])
        def compute_q_targets(post_states, rewards, terminals):
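            # Where the transition is terminal the target is just the reward;
            # otherwise bootstrap: reward + gamma * max_a Q_target(post_state, a)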
            return element_select(
                terminals,
                rewards,
                gamma * reduce_max(self._target_net(post_states), axis=0) + rewards,
            )

        @Function
        @Signature(pre_states=Tensor[input_shape], actions=Tensor[nb_actions],
                   post_states=Tensor[input_shape], rewards=Tensor[()], terminals=Tensor[()])
        def criterion(pre_states, actions, post_states, rewards, terminals):
            q_targets = compute_q_targets(post_states, rewards, terminals)

            q_acted = reduce_sum(self._action_value_net(pre_states) * actions, axis=0)

            return huber_loss(q_targets, q_acted, 1.0)

        lr_schedule = learning_rate_schedule(learning_rate, UnitType.minibatch)
        m_schedule = momentum_schedule(momentum)
        vm_schedule = momentum_schedule(0.999)
        l_sgd = adam(self._action_value_net.parameters, lr_schedule,
                     momentum=m_schedule, variance_momentum=vm_schedule)

        self._metrics_writer = TensorBoardProgressWriter(freq=1, log_dir='metrics', model=criterion) if monitor else None
        self._learner = l_sgd
        self._trainer = Trainer(criterion, (criterion, None), l_sgd, self._metrics_writer)
def create_trainer(network, epoch_size, num_epochs, minibatch_size, num_quantization_bits, progress_printer):
    
    # CNTK weights the new gradient by (1-momentum) for unit gain,
    # so we divide Caffe's learning rate by (1-momentum)
    initial_learning_rate = 0.45 # equal to 0.045 in caffe
    initial_learning_rate *= minibatch_size / 32

    learn_rate_adjust_interval = 2
    learn_rate_decrease_factor = 0.94

    # Set learning parameters
    lr_per_mb = []
    learning_rate = initial_learning_rate
    for i in range(0, num_epochs, learn_rate_adjust_interval):
        lr_per_mb.extend([learning_rate] * learn_rate_adjust_interval)
        learning_rate *= learn_rate_decrease_factor

    lr_schedule       = learning_rate_schedule(lr_per_mb, unit=UnitType.minibatch, epoch_size=epoch_size)
    mm_schedule       = momentum_schedule(0.9)
    l2_reg_weight     = 0.0001 # CNTK L2 regularization is per sample, thus same as Caffe
    
    # Create learner
    local_learner = momentum_sgd(network['output'].parameters, lr_schedule, mm_schedule,
                                                l2_regularization_weight=l2_reg_weight)
    parameter_learner = data_parallel_distributed_learner(
        local_learner, 
        num_quantization_bits=num_quantization_bits,
        distributed_after=0)

    # Create trainer
    return Trainer(network['output'], (network['ce'], network['pe']), parameter_learner, progress_printer)
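
The comment "equal to 0.045 in caffe" follows from the unit-gain convention noted above. Here is a small sketch of the conversion and of the resulting stepwise schedule, using only numbers already present in this function; the epoch count is illustrative and the minibatch-size scaling is omitted.

# Caffe applies the raw learning rate to the gradient; CNTK's unit-gain momentum SGD
# scales each new gradient by (1 - momentum), so the rate is divided by that factor.
caffe_lr, momentum = 0.045, 0.9
cntk_lr = caffe_lr / (1.0 - momentum)      # 0.045 / 0.1 = 0.45 == initial_learning_rate

# The loop above then repeats each rate for learn_rate_adjust_interval epochs and decays it:
rates, lr = [], cntk_lr
for _ in range(0, 10, 2):                  # 10 epochs, adjusted every 2 (illustrative)
    rates.extend([lr] * 2)
    lr *= 0.94
print(rates[:6])                           # [0.45, 0.45, 0.423, 0.423, 0.39762, 0.39762]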
Example #4
def init_trainer(config, text_lines, slot_value_lines):

    hidden_dim = config.hidden_dim

    segment_begin = config.segment_begin
    segment_end = config.segment_end

    data = DataReader(text_lines, slot_value_lines, segment_begin, segment_end)

    # Create model nodes for the source and target inputs
    vocab_dim = data.vocab_dim
    sv_dim = data.sv_dim

    input_sequence, sv_pair, label_sequence, inputH, inputC = create_inputs(hidden_dim, sv_dim, vocab_dim)
    model = create_model(hidden_dim, sv_dim, vocab_dim)
    z = model(input_sequence, inputH, inputC, sv_pair)
    # cross_entropy: the training criterion
    ce, err = cross_entropy_with_full_softmax(z, label_sequence, sv_dim, vocab_dim)

    learning_rate = config.learning_rate
    momentum_as_time_constant = config.momentum_as_time_constant
    clipping_threshold_per_sample = config.clipping_threshold_per_sample
    lr_schedule = learning_rate_schedule(learning_rate, UnitType.sample)
    gradient_clipping_with_truncation = True
    momentum_schedule = momentum_as_time_constant_schedule(momentum_as_time_constant)
    # Instantiate the trainer object to drive the model training
    learner = momentum_sgd(z.parameters, lr_schedule, momentum_schedule,
                           gradient_clipping_threshold_per_sample=clipping_threshold_per_sample,
                           gradient_clipping_with_truncation=gradient_clipping_with_truncation)
    trainer = Trainer(z, (ce, err), learner)
    inputs = [input_sequence, sv_pair, label_sequence, inputH, inputC]

    return data, z, trainer, inputs
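
init_trainer() expects a config object exposing a handful of attributes. A minimal sketch of such an object follows; the attribute names come from the function above, the values are placeholders.

from types import SimpleNamespace

# Hypothetical configuration for init_trainer(); values are illustrative only.
config = SimpleNamespace(
    hidden_dim=512,
    segment_begin="<s>",
    segment_end="</s>",
    learning_rate=0.002,
    momentum_as_time_constant=1100,
    clipping_threshold_per_sample=5.0,
)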
def create_trainer(network, epoch_size, num_epochs, minibatch_size,
                   progress_writers):

    # CNTK weights the new gradient by (1-momentum) for unit gain,
    # so we divide Caffe's learning rate by (1-momentum)
    initial_learning_rate = 2.0  # equal to 0.2 in caffe
    initial_learning_rate *= minibatch_size / 128
    learn_rate_adjust_interval = 2
    learn_rate_decrease_factor = 0.94

    # Set learning parameters
    lr_per_mb = []
    learning_rate = initial_learning_rate
    for i in range(0, num_epochs, learn_rate_adjust_interval):
        lr_per_mb.extend([learning_rate] * learn_rate_adjust_interval)
        learning_rate *= learn_rate_decrease_factor

    lr_schedule = learning_parameter_schedule(lr_per_mb, epoch_size=epoch_size)
    mm_schedule = momentum_schedule(0.9)
    l2_reg_weight = 0.0001  # CNTK L2 regularization is per sample, thus same as Caffe

    # Create learner
    learner = momentum_sgd(network['output'].parameters,
                           lr_schedule,
                           mm_schedule,
                           l2_regularization_weight=l2_reg_weight)

    # Create trainer
    return Trainer(network['output'], (network['ce'], network['pe']), learner,
                   progress_writers)
Example #6
def main(params):
    # Create output and log directories if they don't exist
    if not os.path.isdir(params['output_folder']):
        os.makedirs(params['output_folder'])

    if not os.path.isdir(params['log_folder']):
        os.makedirs(params['log_folder'])

    # Create the network
    network = create_network()

    # Create readers
    train_reader = cbf_reader(os.path.join(params['input_folder'], 'train{}.cbf'.format(params['prefix'])), is_training=True,
                              max_samples=cntk.io.INFINITELY_REPEAT)
    cv_reader = cbf_reader(os.path.join(params['input_folder'], 'test{}.cbf'.format(params['prefix'])), is_training=False,
                           max_samples=cntk.io.FULL_DATA_SWEEP)
    test_reader = cbf_reader(os.path.join(params['input_folder'], 'test{}.cbf'.format(params['prefix'])), is_training=False,
                             max_samples=cntk.io.FULL_DATA_SWEEP)

    input_map = {
        network['input']: train_reader.streams.front,
        network['target']: train_reader.streams.label
    }

    # Create learner
    mm_schedule = momentum_schedule(0.90)
    lr_schedule = learning_parameter_schedule([(40, 0.1), (40, 0.01)], minibatch_size=params['minibatch_size'])
    learner = cntk.adam(network['model'].parameters, lr_schedule, mm_schedule, l2_regularization_weight=0.0005,
                        epoch_size=params['epoch_size'], minibatch_size=params['minibatch_size'])

    # Use TensorBoard for visual logging
    log_file = os.path.join(params['log_folder'], 'log.txt')
    pp_writer = cntk.logging.ProgressPrinter(freq=10, tag='Training', num_epochs=params['max_epochs'], log_to_file=log_file)
    tb_writer = cntk.logging.TensorBoardProgressWriter(freq=10, log_dir=params['log_folder'], model=network['model'])

    # Create trainer and training session
    trainer = Trainer(network['model'], (network['loss'], network['metric']), [learner], [pp_writer, tb_writer])
    test_config = TestConfig(minibatch_source=test_reader, minibatch_size=params['minibatch_size'], model_inputs_to_streams=input_map)
    cv_config = CrossValidationConfig(minibatch_source=cv_reader, frequency=(1, DataUnit.sweep),
                                      minibatch_size=params['minibatch_size'], model_inputs_to_streams=input_map)
    checkpoint_config = CheckpointConfig(os.path.join(params['output_folder'], model_name), frequency=(10, DataUnit.sweep), restore=params['restore'])

    session = training_session(trainer=trainer,
                               mb_source=train_reader,
                               mb_size=params['minibatch_size'],
                               model_inputs_to_streams=input_map,
                               max_samples=params['epoch_size'] * params['max_epochs'],
                               progress_frequency=(1, DataUnit.sweep),
                               checkpoint_config=checkpoint_config,
                               cv_config=cv_config,
                               test_config=test_config)

    cntk.logging.log_number_of_parameters(network['model'])
    session.train()

    # Save the trained model
    path = os.path.join(params['output_folder'], 'final_model.dnn')
    network['model'].save(path)
    print('Saved final model to', path)
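
main() pulls everything from a params dictionary (plus a module-level model_name used for checkpoints). A sketch of the expected keys with placeholder values follows; the key names mirror the lookups above, while paths and numbers are assumptions.

# Hypothetical params for main(); only the key names are taken from the function above.
params = {
    'input_folder': 'data',        # expects train{prefix}.cbf / test{prefix}.cbf inside
    'output_folder': 'output',
    'log_folder': 'logs',
    'prefix': '',
    'minibatch_size': 64,
    'epoch_size': 50000,
    'max_epochs': 80,
    'restore': False,
}

main(params)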
Example #7
def train():
	print('Unpickling data (this could take a short while)')
	training_data = pickle.load(open('tmp_textdata.pickle', 'rb'))
	print('Preprocessing data (this could take a LONG while)...')
	do_subsampling(training_data, subsampling=4e-5, prog_freq=1e7)
	print('Preprocessing is done. Final # of training words: {}'.format(len(training_data.text_as_id_list)))
	mb_source = WordMinibatchSource(training_data, max_window_size)
	mb_num_samples = 128
	mb_size = minibatch_size_schedule(mb_num_samples)

	freq_list = training_data.id2freq
	token2id = training_data.token2id
	vocab_dim = len(freq_list)
	print(vocab_dim)
	input_vector, label_vector = create_inputs(vocab_dim)

	z, cross_entropy, error = create_model(input_vector, label_vector, freq_list, vocab_dim, hidden_dim) 

	lr_schedule = learning_rate_schedule(learning_rate, UnitType.sample)
	lr_schedule2 = learning_rate_schedule([(3e-3)*(0.8**i) for i in range(10)], UnitType.sample, epoch_size=len(training_data.text_as_id_list)//2)
	mom_schedule = C.learners.momentum_schedule(0.005, UnitType.sample)
	gradient_clipping_with_truncation = True
	learner = C.learners.sgd(z.parameters, lr=lr_schedule2,
			    gradient_clipping_threshold_per_sample=clipping_threshold_per_sample,
			    gradient_clipping_with_truncation=gradient_clipping_with_truncation)

#	var_mom_schedule = C.learners.momentum_schedule(0.999, UnitType.sample)
#	learner2 = C.learners.adam(z.parameters,
#		lr=lr_schedule,
#		momentum=mom_schedule,
#		variance_momentum=var_mom_schedule,
#		epsilon=1.5e-8,
#		gradient_clipping_threshold_per_sample=clipping_threshold_per_sample,
#		gradient_clipping_with_truncation=gradient_clipping_with_truncation)

	progress_printer = C.logging.ProgressPrinter(freq=200, tag='Training')
	checkpoint_config = CheckpointConfig(frequency = 100000*mb_num_samples,
                                           filename = os.path.join(os.getcwd(), "word2vec_checkpoint"),
                                           restore = False)

	trainer = Trainer(z, (cross_entropy, error), [learner], progress_writers=[progress_printer])
	
	input_map = { input_vector: mb_source.fsi, label_vector: mb_source.lsi }	

	session = training_session(trainer, mb_source, mb_size, input_map, progress_frequency=len(training_data.text_as_id_list), max_samples = None, checkpoint_config=checkpoint_config, cv_config=None, test_config=None)
	
	C.logging.log_number_of_parameters(z) ; print()
	session.train()
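
do_subsampling(training_data, subsampling=4e-5, ...) presumably applies the usual word2vec frequency subsampling. Below is a sketch of the common discard rule for a token with relative frequency f and threshold t; this is the standard formulation, not necessarily the exact rule implemented by do_subsampling.

import random

def keep_token(count, total_count, t=4e-5):
    # Standard word2vec-style subsampling: very frequent tokens are randomly dropped.
    f = count / total_count                # relative frequency of the token
    p_keep = min(1.0, (t / f) ** 0.5)      # keep probability sqrt(t / f), capped at 1
    return random.random() < p_keep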
Example #8
# define loss / metrics
# like Tensorflow the softmax is done
# internally (if needed), so all we need are the logits
ce = cross_entropy_with_softmax(logits, labels)
pe = classification_error(logits, labels)

# training config
batch_size = 32
epochs = 15
n_batches = len(Xtrain) // batch_size

# do the training

# specify the training algorithm
trainer = Trainer(logits, (ce, pe),
                  adam(logits.parameters, lr=1e-2, momentum=0.9))


# helper function
def get_output(node, X, Y):
    ret = node.forward(dict(inputs=X, labels=Y))
    return list(ret[1].values())[0].mean()


costs = []
errors = []
test_costs = []
test_errors = []
for i in range(epochs):
    cost = 0
    err = 0
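    # --- Sketch of how this truncated loop presumably continues (not part of the original listing).
    # Assumes Xtrain/Ytrain/Xtest/Ytest are numpy arrays and that `inputs` and `labels`
    # are the input variables feeding `logits`; all of these names are assumptions.
    for j in range(n_batches):
        Xbatch = Xtrain[j * batch_size:(j + 1) * batch_size]
        Ybatch = Ytrain[j * batch_size:(j + 1) * batch_size]

        trainer.train_minibatch({inputs: Xbatch, labels: Ybatch})
        cost += trainer.previous_minibatch_loss_average
        err += trainer.previous_minibatch_evaluation_average

    costs.append(cost / n_batches)
    errors.append(err / n_batches)
    test_costs.append(get_output(ce, Xtest, Ytest))
    test_errors.append(get_output(pe, Xtest, Ytest))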
Example #9
def simple_mnist(tensorboard_logdir=None):
    input_dim = 19
    num_output_classes = 2
    num_hidden_layers = 2
    hidden_layers_dim = 1024

    # Input variables denoting the features and label data
    feature = C.input_variable(input_dim, np.float32)
    label = C.input_variable(num_output_classes, np.float32)

    # Instantiate the feedforward classification model
    # scaled_input = element_times(constant(0.00390625), feature)

    z = Sequential([For(range(num_hidden_layers), lambda i: Dense(hidden_layers_dim, activation=relu)),
                    Dense(num_output_classes)])(feature)

    ce = cross_entropy_with_softmax(z, label)
    pe = classification_error(z, label)

    data_dir = r"."

    path = os.path.normpath(os.path.join(data_dir, "train.ctf"))
    check_path(path)

    reader_train = create_reader(path, True, input_dim, num_output_classes)

    input_map = {
        feature  : reader_train.streams.features,
        label  : reader_train.streams.labels
    }

    # Training config
    minibatch_size = 512
    num_samples_per_sweep = 1825000
    num_sweeps_to_train_with = 100

    # Instantiate progress writers.
    progress_writers = [ProgressPrinter(
        tag='Training',
        num_epochs=num_sweeps_to_train_with)]

    if tensorboard_logdir is not None:
        tensorboard_writer = TensorBoardProgressWriter(freq=10, log_dir=tensorboard_logdir, model=z)
        progress_writers.append(tensorboard_writer)

    # Instantiate the trainer object to drive the model training
    lr = learning_parameter_schedule_per_sample(0.001)
    learner = create_learner(model=z)
    trainer = Trainer(z, (ce, pe), learner, progress_writers)

    num_minibatches_to_train = int(num_samples_per_sweep / minibatch_size * num_sweeps_to_train_with)

    model_dir = "model"
    for i in range(num_minibatches_to_train):
        mb = reader_train.next_minibatch(minibatch_size, input_map=input_map)
        trainer.train_minibatch(mb)

        freq = int(num_samples_per_sweep / minibatch_size)
        if i > 0 and i % freq == 0:
            timestamp = datetime.datetime.now().strftime("%Y_%m_%d_%H_%M_%S_%f")
            current_trainer_cp = os.path.join(model_dir, timestamp + "_epoch_" + str(freq) + ".trainer")
            trainer.save_checkpoint(current_trainer_cp)

            train_error = get_error_rate(os.path.join(data_dir, "train_subset.ctf"), input_map, input_dim,
                                         num_output_classes, trainer)
            valid_error = get_error_rate(os.path.join(data_dir, "validation.ctf"), input_map, input_dim, num_output_classes,
                                         trainer)

            if tensorboard_logdir is not None:
                if train_error > 0:
                    tensorboard_writer.write_value("train_error", train_error, i)
                if valid_error > 0:
                    tensorboard_writer.write_value("valid_error", valid_error, i)

    feat_path = os.path.normpath(os.path.join(data_dir, "test.ctf"))
    return get_error_rate(feat_path, input_map, input_dim, num_output_classes, trainer)
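
Example #9 references two helpers that are not shown, create_learner and get_error_rate. Below is a hedged sketch of what get_error_rate might look like, following the reader/test_minibatch pattern used elsewhere on this page; the body is an assumption, only the signature is taken from the calls above.

def get_error_rate(path, input_map, input_dim, num_output_classes, trainer, minibatch_size=1024):
    # Hypothetical implementation: average classification error of `trainer` over one pass of a CTF file.
    reader = create_reader(path, False, input_dim, num_output_classes)
    feature_var, label_var = list(input_map.keys())   # reuse the model's input variables
    eval_map = {feature_var: reader.streams.features, label_var: reader.streams.labels}

    total_error, num_minibatches = 0.0, 0
    while True:
        mb = reader.next_minibatch(minibatch_size, input_map=eval_map)
        if not mb:
            break
        total_error += trainer.test_minibatch(mb)
        num_minibatches += 1
    return total_error / max(num_minibatches, 1)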
Example #10
def simple_mnist(tensorboard_logdir=None):
    input_dim = 784
    num_output_classes = 10
    num_hidden_layers = 2
    hidden_layers_dim = 200

    # Input variables denoting the features and label data
    feature = C.input_variable(input_dim, np.float32)
    label = C.input_variable(num_output_classes, np.float32)

    # Instantiate the feedforward classification model
    scaled_input = element_times(constant(0.00390625), feature)
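    # 0.00390625 == 1/256: scales raw 8-bit pixel intensities into [0, 1)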

    z = Sequential([
        For(range(num_hidden_layers),
            lambda i: Dense(hidden_layers_dim, activation=relu)),
        Dense(num_output_classes)
    ])(scaled_input)

    ce = cross_entropy_with_softmax(z, label)
    pe = classification_error(z, label)

    data_dir = os.path.dirname(os.path.abspath(__file__))
    path = os.path.join(data_dir, 'Train-28x28_cntk_text.txt')

    reader_train = create_reader(path, True, input_dim, num_output_classes)

    input_map = {
        feature: reader_train.streams.features,
        label: reader_train.streams.labels
    }

    # Training config
    minibatch_size = 64
    num_samples_per_sweep = 60000
    num_sweeps_to_train_with = 10

    # Instantiate progress writers.
    # training_progress_output_freq = 100
    progress_writers = [
        ProgressPrinter(
            # freq=training_progress_output_freq,
            tag='Training',
            num_epochs=num_sweeps_to_train_with)
    ]

    if tensorboard_logdir is not None:
        progress_writers.append(
            TensorBoardProgressWriter(freq=10,
                                      log_dir=tensorboard_logdir,
                                      model=z))

    # Instantiate the trainer object to drive the model training
    lr = 0.001
    trainer = Trainer(z, (ce, pe), sgd(z.parameters, lr), progress_writers)

    training_session(trainer=trainer,
                     mb_source=reader_train,
                     mb_size=minibatch_size,
                     model_inputs_to_streams=input_map,
                     max_samples=num_samples_per_sweep *
                     num_sweeps_to_train_with,
                     progress_frequency=num_samples_per_sweep).train()

    # Load test data
    path = os.path.normpath(os.path.join(data_dir, "Test-28x28_cntk_text.txt"))
    check_path(path)

    reader_test = create_reader(path, False, input_dim, num_output_classes)

    input_map = {
        feature: reader_test.streams.features,
        label: reader_test.streams.labels
    }

    # Test data for trained model
    C.debugging.start_profiler()
    C.debugging.enable_profiler()
    C.debugging.set_node_timing(True)

    test_minibatch_size = 1024
    num_samples = 10000
    num_minibatches_to_test = num_samples / test_minibatch_size
    test_result = 0.0
    for i in range(0, int(num_minibatches_to_test)):
        mb = reader_test.next_minibatch(test_minibatch_size,
                                        input_map=input_map)
        eval_error = trainer.test_minibatch(mb)
        test_result = test_result + eval_error

    C.debugging.stop_profiler()
    trainer.print_node_timing()

    # Average of evaluation errors of all test minibatches, as a percentage
    return test_result * 100 / num_minibatches_to_test
Example #11
    def __init__(self,
                 state_dim,
                 action_dim,
                 gamma=0.99,
                 learning_rate=1e-4,
                 momentum=0.95):
        self.state_dim = state_dim
        self.action_dim = action_dim
        self.gamma = gamma

        with default_options(activation=relu, init=he_uniform()):
            # Convolution filter counts were halved to save on memory, no gpu :(
            self.model = Sequential([
                Convolution2D((8, 8), 16, strides=4, name='conv1'),
                Convolution2D((4, 4), 32, strides=2, name='conv2'),
                Convolution2D((3, 3), 32, strides=1, name='conv3'),
                Dense(256, init=he_uniform(scale=0.01), name='dense1'),
                Dense(action_dim,
                      activation=None,
                      init=he_uniform(scale=0.01),
                      name='actions')
            ])
            self.model.update_signature(Tensor[state_dim])

        # Create the target model as a copy of the online model
        self.target_model = None
        self.update_target()

        self.pre_states = input_variable(state_dim, name='pre_states')
        self.actions = input_variable(action_dim, name='actions')
        self.post_states = input_variable(state_dim, name='post_states')
        self.rewards = input_variable((), name='rewards')
        self.terminals = input_variable((), name='terminals')
        self.is_weights = input_variable((), name='is_weights')

        predicted_q = reduce_sum(self.model(self.pre_states) * self.actions,
                                 axis=0)

        # DQN - calculate target q values
        # post_q = reduce_max(self.target_model(self.post_states), axis=0)

        # DDQN - calculate target q values
        online_selection = one_hot(
            argmax(self.model(self.post_states), axis=0), self.action_dim)
        post_q = reduce_sum(self.target_model(self.post_states) *
                            online_selection,
                            axis=0)

        post_q = (1.0 - self.terminals) * post_q
        target_q = stop_gradient(self.rewards + self.gamma * post_q)

        # Huber loss
        delta = 1.0
        self.td_error = minus(predicted_q, target_q, name='td_error')
        abs_error = abs(self.td_error)
        errors = element_select(less(abs_error, delta),
                                square(self.td_error) * 0.5,
                                delta * (abs_error - 0.5 * delta))
        loss = errors * self.is_weights

        # Adam based SGD
        lr_schedule = learning_rate_schedule(learning_rate, UnitType.minibatch)
        m_schedule = momentum_schedule(momentum)
        vm_schedule = momentum_schedule(0.999)

        self._learner = adam(self.model.parameters,
                             lr_schedule,
                             m_schedule,
                             variance_momentum=vm_schedule)
        self.writer = TensorBoardProgressWriter(log_dir='metrics',
                                                model=self.model)
        self.trainer = Trainer(self.model, (loss, None), [self._learner],
                               self.writer)
Example #12
def train_lm():
    data = DataReader(token_to_id_path, segment_sepparator)

    # Create model nodes for the source and target inputs
    input_sequence, label_sequence = create_inputs(data.vocab_dim)

    # Create the model. It has three output nodes:
    # z: the input to softmax, which provides the latent representation of the next token
    # cross_entropy: the training criterion
    # error: a binary indicator of whether the model predicts the correct token
    z, cross_entropy, error = create_model(input_sequence, label_sequence,
                                           data.vocab_dim, hidden_dim)

    # For measurement we use the (built-in) full softmax.
    full_ce = C.cross_entropy_with_softmax(z, label_sequence)

    # print out some useful training information
    log_number_of_parameters(z)
    print()

    # Run the training loop
    num_trained_samples = 0
    num_trained_samples_since_last_report = 0

    # Instantiate the trainer object to drive the model training
    lr_schedule = learning_rate_schedule(learning_rate, UnitType.sample)
    momentum_schedule = momentum_as_time_constant_schedule(
        momentum_as_time_constant)
    gradient_clipping_with_truncation = True
    learner = momentum_sgd(
        z.parameters,
        lr_schedule,
        momentum_schedule,
        gradient_clipping_threshold_per_sample=clipping_threshold_per_sample,
        gradient_clipping_with_truncation=gradient_clipping_with_truncation)
    trainer = Trainer(z, (cross_entropy, error), learner)

    for epoch_count in range(num_epochs):
        for features, labels, token_count in data.minibatch_generator(
                train_file_path, sequence_length, sequences_per_batch):
            arguments = ({input_sequence: features, label_sequence: labels})

            t_start = timeit.default_timer()
            trainer.train_minibatch(arguments)
            t_end = timeit.default_timer()

            samples_per_second = token_count / (t_end - t_start)

            # Print progress report every num_samples_between_progress_report samples

            if num_trained_samples_since_last_report >= num_samples_between_progress_report or num_trained_samples == 0:
                av_ce = average_cross_entropy(full_ce, input_sequence,
                                              label_sequence, data)
                print_progress(samples_per_second, av_ce, num_trained_samples,
                               t_start)
                num_trained_samples_since_last_report = 0

            num_trained_samples += token_count
            num_trained_samples_since_last_report += token_count

        # after each epoch save the model
        model_filename = "models/lm_epoch%d.dnn" % epoch_count
        z.save_model(model_filename)
        print("Saved model to '%s'" % model_filename)
Example #13
class DeepQAgent(object):
    """
    Implementation of a Deep Q Neural Network agent as in:
        Nature 518, "Human-level control through deep reinforcement learning" (Mnih et al., 2015)
    """
    def __init__(self, input_shape, nb_actions,
                 gamma=0.95, explorer=LinearEpsilonAnnealingExplorer(1, 0.1, 100000),
                 learning_rate=0.01, momentum=0.8, minibatch_size=16,
                 memory_size=15000, train_after=100, train_interval=100, target_update_interval=500,
                 monitor=True):
        self.input_shape = input_shape
        self.nb_actions = nb_actions
        self.gamma = gamma

        self._train_after = train_after
        self._train_interval = train_interval
        self._target_update_interval = target_update_interval

        self._explorer = explorer
        self._minibatch_size = minibatch_size
        self._history = History(input_shape)
        self._memory = ReplayMemory(memory_size, input_shape[1:], 4)
        self._num_actions_taken = 0
        self._num_trains = 0

        # Metrics accumulator
        self._episode_rewards, self._episode_q_means, self._episode_q_stddev = [], [], []

        '''
        # Action Value model (used by agent to interact with the environment)
        with default_options(activation=relu, init=he_uniform()):
            self._action_value_net = Sequential([
                Convolution2D((8, 8), 16, strides=4),
                Convolution2D((4, 4), 32, strides=2),
                Convolution2D((3, 3), 32, strides=1),
                Dense(256, init=he_uniform(scale=0.01)),
                Dense(nb_actions, activation=None, init=he_uniform(scale=0.01))
            ])
        ''' 
        with default_options(activation=relu, init=he_uniform()):
            self._action_value_net = Sequential([
                Dense(7, init=he_uniform(scale=0.01)),
                Dense(8, init=he_uniform(scale=0.01)),
                #Dense(16, init=he_uniform(scale=0.01)),
                #Dense(32, init=he_uniform(scale=0.01)),
                Dense(nb_actions, activation=None, init=he_uniform(scale=0.01))
            ])
        
        self._action_value_net.update_signature(Tensor[input_shape])

        # Target model used to compute the target Q-values in training, updated
        # less frequently for increased stability.
        self._target_net = self._action_value_net.clone(CloneMethod.freeze)

        # Function computing Q-values targets as part of the computation graph
        @Function
        @Signature(post_states=Tensor[input_shape], rewards=Tensor[()], terminals=Tensor[()])
        def compute_q_targets(post_states, rewards, terminals):
            return element_select(
                terminals,
                rewards,
                gamma * reduce_max(self._target_net(post_states), axis=0) + rewards,
            )

        # Define the loss, using Huber Loss (more robust to outliers)
        @Function
        @Signature(pre_states=Tensor[input_shape], actions=Tensor[nb_actions],
                   post_states=Tensor[input_shape], rewards=Tensor[()], terminals=Tensor[()])
        def criterion(pre_states, actions, post_states, rewards, terminals):
            # Compute the q_targets
            q_targets = compute_q_targets(post_states, rewards, terminals)

            # actions is a 1-hot encoding of the action done by the agent
            q_acted = reduce_sum(self._action_value_net(pre_states) * actions, axis=0)

            # Define training criterion as the Huber Loss function
            return huber_loss(q_targets, q_acted, 1.0)

        # Adam based SGD
        lr_schedule = learning_rate_schedule(learning_rate, UnitType.minibatch)
        m_schedule = momentum_schedule(momentum)
        vm_schedule = momentum_schedule(0.999)
        l_sgd = adam(self._action_value_net.parameters, lr_schedule,
                     momentum=m_schedule, variance_momentum=vm_schedule)

        self._metrics_writer = TensorBoardProgressWriter(freq=1, log_dir='metrics', model=criterion) if monitor else None
        self._learner = l_sgd
        self._trainer = Trainer(criterion, (criterion, None), l_sgd, self._metrics_writer)

        #self._trainer.restore_from_checkpoint('models/oldmodels/model800000')

    def act(self, state):
        """ This allows the agent to select the next action to perform in regard of the current state of the environment.
        It follows the terminology used in the Nature paper.

        Attributes:
            state (Tensor[input_shape]): The current environment state

        Returns: Int >= 0 : Next action to perform
        """
        # Append the state to the short term memory (ie. History)
        self._history.append(state)

        # If policy requires agent to explore, sample random action
        if self._explorer.is_exploring(self._num_actions_taken):
            action = self._explorer(self.nb_actions)
        else:
            # Use the network to output the best action
            env_with_history = self._history.value
            q_values = self._action_value_net.eval(
                # Append batch axis with only one sample to evaluate
                env_with_history.reshape((1,) + env_with_history.shape)
            )

            self._episode_q_means.append(np.mean(q_values))
            self._episode_q_stddev.append(np.std(q_values))

            # Return the value maximizing the expected reward
            action = q_values.argmax()

        # Keep track of interval action counter
        self._num_actions_taken += 1
        #print(self._num_actions_taken)
        return action

    def observe(self, old_state, action, reward, done):
        """ This allows the agent to observe the output of doing the action it selected through act() on the old_state

        Attributes:
            old_state (Tensor[input_shape]): Previous environment state
            action (int): Action done by the agent
            reward (float): Reward for doing this action in the old_state environment
            done (bool): Indicate if the action has terminated the environment
        """
        self._episode_rewards.append(reward)

        # If done, reset short term memory (ie. History)
        if done:
            # Plot the metrics through Tensorboard and reset buffers
            if self._metrics_writer is not None:
                self._plot_metrics()
            self._episode_rewards, self._episode_q_means, self._episode_q_stddev = [], [], []

            # Reset the short term memory
            self._history.reset()

        # Append to long term memory
        self._memory.append(old_state, action, reward, done)

    def train(self):
        """ This allows the agent to train itself to better understand the environment dynamics.
        The agent will compute the expected reward for the state(t+1)
        and update the expected reward at step t according to this.

        The target expectation is computed through the Target Network, which is a more stable version
        of the Action Value Network for increasing training stability.

        The Target Network is a frozen copy of the Action Value Network, updated at regular intervals.
        """

        agent_step = self._num_actions_taken

        if agent_step >= self._train_after:
            #if (agent_step % self._train_interval) == 0:
            print('\nTraining minibatch\n')
            client.setCarControls(zero_controls)
            pre_states, actions, post_states, rewards, terminals = self._memory.minibatch(self._minibatch_size)
            self._trainer.train_minibatch(
                self._trainer.loss_function.argument_map(
                    pre_states=pre_states,
                    actions=Value.one_hot(actions.reshape(-1, 1).tolist(), self.nb_actions),
                    post_states=post_states,
                    rewards=rewards,
                    terminals=terminals
                )
            )
            self._num_trains += 1
            # Update the Target Network if needed
            if self._num_trains % 20 == 0:
                print('updating network')
                self._target_net = self._action_value_net.clone(CloneMethod.freeze)
                filename = dirname + "\\model%d" % agent_step
                self._trainer.save_checkpoint(filename)
    
    def _plot_metrics(self):
        """Plot current buffers accumulated values to visualize agent learning
        """
        if len(self._episode_q_means) > 0:
            mean_q = np.asscalar(np.mean(self._episode_q_means))
            self._metrics_writer.write_value('Mean Q per ep.', mean_q, self._num_actions_taken)

        if len(self._episode_q_stddev) > 0:
            std_q = np.asscalar(np.mean(self._episode_q_stddev))
            self._metrics_writer.write_value('Mean Std Q per ep.', std_q, self._num_actions_taken)

        self._metrics_writer.write_value('Sum rewards per ep.', sum(self._episode_rewards), self._num_actions_taken)
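
A sketch of how this agent is typically driven from an environment loop, using only the methods defined above; `env`, its step/reset API, and the shapes are assumptions about the surrounding code, not part of the original listing.

# Hypothetical driver loop for DeepQAgent; `env` stands in for the surrounding environment wrapper.
agent = DeepQAgent(input_shape=(4, 7), nb_actions=5)     # placeholder shape and action count

state = env.reset()                                      # assumed environment API
while True:
    action = agent.act(state)                            # epsilon-greedy action selection
    next_state, reward, done = env.step(action)          # assumed environment API
    agent.observe(state, action, reward, done)           # store transition, reset history on episode end
    agent.train()                                        # may run a minibatch update and refresh the target net
    state = env.reset() if done else next_state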
Example #14
    def __init__(self, input_shape, nb_actions,
                 gamma=0.99, explorer=LinearEpsilonAnnealingExplorer(1, 0.1, 1000000),
                 learning_rate=0.00025, momentum=0.95, minibatch_size=32,
                 memory_size=500000, train_after=10000, train_interval=4, target_update_interval=10000,
                 monitor=True):
        self.input_shape = input_shape
        self.nb_actions = nb_actions
        self.gamma = gamma

        self._train_after = train_after
        self._train_interval = train_interval
        self._target_update_interval = target_update_interval

        self._explorer = explorer
        self._minibatch_size = minibatch_size
        self._history = History(input_shape)
        self._memory = ReplayMemory(memory_size, input_shape[1:], 4)
        self._num_actions_taken = 0

        # Metrics accumulator
        self._episode_rewards, self._episode_q_means, self._episode_q_stddev = [], [], []

        # Action Value model (used by agent to interact with the environment)
        with default_options(activation=relu, init=he_uniform()):
            self._action_value_net = Sequential([
                Convolution2D((8, 8), 16, strides=4),
                Convolution2D((4, 4), 32, strides=2),
                Convolution2D((3, 3), 32, strides=1),
                Dense(256, init=he_uniform(scale=0.01)),
                Dense(nb_actions, activation=None, init=he_uniform(scale=0.01))
            ])
        self._action_value_net.update_signature(Tensor[input_shape])

        # Target model used to compute the target Q-values in training, updated
        # less frequently for increased stability.
        self._target_net = self._action_value_net.clone(CloneMethod.freeze)

        # Function computing Q-values targets as part of the computation graph
        @Function
        @Signature(post_states=Tensor[input_shape], rewards=Tensor[()], terminals=Tensor[()])
        def compute_q_targets(post_states, rewards, terminals):
            return element_select(
                terminals,
                rewards,
                gamma * reduce_max(self._target_net(post_states), axis=0) + rewards,
            )

        # Define the loss, using Huber Loss (more robust to outliers)
        @Function
        @Signature(pre_states=Tensor[input_shape], actions=Tensor[nb_actions],
                   post_states=Tensor[input_shape], rewards=Tensor[()], terminals=Tensor[()])
        def criterion(pre_states, actions, post_states, rewards, terminals):
            # Compute the q_targets
            q_targets = compute_q_targets(post_states, rewards, terminals)

            # actions is a 1-hot encoding of the action done by the agent
            q_acted = reduce_sum(self._action_value_net(pre_states) * actions, axis=0)

            # Define training criterion as the Huber Loss function
            return huber_loss(q_targets, q_acted, 1.0)

        # Adam based SGD
        lr_schedule = learning_rate_schedule(learning_rate, UnitType.minibatch)
        m_schedule = momentum_schedule(momentum)
        vm_schedule = momentum_schedule(0.999)
        l_sgd = adam(self._action_value_net.parameters, lr_schedule,
                     momentum=m_schedule, variance_momentum=vm_schedule)

        self._metrics_writer = TensorBoardProgressWriter(freq=1, log_dir='metrics', model=criterion) if monitor else None
        self._learner = l_sgd
        self._trainer = Trainer(criterion, (criterion, None), l_sgd, self._metrics_writer)
Example #15
    def __init__(self,
                 in_shape,
                 output_shape,
                 device_id=None,
                 learning_rate=0.00025,
                 momentum=0.9,
                 minibatch_size=32,
                 update_interval=10000,
                 n_workers=1,
                 visualizer=None):
        """
        Q Neural Network following Mnih et al.'s implementation and default options.

        The network has the following topology:
        Convolution(32, (8, 8))
        Convolution(64, (4, 4))
        Convolution(64, (3, 3))
        Dense(512)

        :param in_shape: Shape of the observations perceived by the learner (the neural net input)
        :param output_shape: Size of the action space (mapped to the number of output neurons)

        :param device_id: Use None to let CNTK select the best available device,
                          -1 for CPU, >= 0 for GPU
                          (default: None)

        :param learning_rate: Learning rate
                              (default: 0.00025, as per Mnih et al.)

        :param momentum: Momentum, provided as momentum value for
                         averaging gradients without unit gain filter
                         Note that CNTK does not currently provide an implementation
                         of Graves' RmsProp with momentum.
                         It uses AdamSGD optimizer instead.
                         (default: 0, no momentum with RProp optimizer)

        :param minibatch_size: Minibatch size
                               (default: 32, as per Mnih et al.)

        :param n_workers: Number of concurrent workers for distributed training.
                          (default: 1, not distributed)

        :param visualizer: Optional visualizer allowing the model to save summary data
                           (default: None, no visualization)

        Ref: Mnih et al.: "Human-level control through deep reinforcement learning."
        Nature 518.7540 (2015): 529-533.
        """

        assert learning_rate > 0, 'learning_rate should be > 0'
        assert 0. <= momentum < 1, 'momentum should be 0 <= momentum < 1'

        QModel.__init__(self, in_shape, output_shape)
        CntkModel.__init__(self, device_id, False, n_workers, visualizer)

        self._nb_actions = output_shape
        self._steps = 0
        self._target_update_interval = update_interval
        self._target = None

        # Input vars
        self._environment = Input(in_shape,
                                  name='env',
                                  dynamic_axes=(Axis.default_batch_axis()))
        self._q_targets = Input(1,
                                name='q_targets',
                                dynamic_axes=(Axis.default_batch_axis()))
        self._actions = Input(output_shape,
                              name='actions',
                              dynamic_axes=(Axis.default_batch_axis()))

        # Define the neural network graph
        self._model = self._build_model()(self._environment)
        self._target = self._model.clone(
            CloneMethod.freeze, {self._environment: self._environment})

        # Define the learning rate
        lr_schedule = as_learning_rate_by_sample(learning_rate, minibatch_size,
                                                 momentum, True)
        lr_schedule = learning_rate_schedule(lr_schedule, UnitType.sample)

        # AdamSGD optimizer
        m_schedule = momentum_schedule(momentum)
        vm_schedule = momentum_schedule(0.999)
        l_sgd = adam(self._model.parameters,
                     lr_schedule,
                     momentum=m_schedule,
                     unit_gain=True,
                     variance_momentum=vm_schedule)

        if self.distributed_training:
            raise NotImplementedError('ASGD not implemented yet.')

        # _actions is a sparse 1-hot encoding of the actions done by the agent
        q_acted = reduce_sum(self._model * self._actions, axis=0)

        # Define the trainer with Huber Loss function
        criterion = huber_loss(q_acted, self._q_targets, 1.0)

        self._learner = l_sgd
        self._trainer = Trainer(self._model, (criterion, None), l_sgd)
Example #16
class QNeuralNetwork(CntkModel, QModel):
    """
    Represents a learning capable entity using CNTK
    """
    def __init__(self,
                 in_shape,
                 output_shape,
                 device_id=None,
                 learning_rate=0.00025,
                 momentum=0.9,
                 minibatch_size=32,
                 update_interval=10000,
                 n_workers=1,
                 visualizer=None):
        """
        Q Neural Network following Mnih et al.'s implementation and default options.

        The network has the following topology:
        Convolution(32, (8, 8))
        Convolution(64, (4, 4))
        Convolution(64, (3, 3))
        Dense(512)

        :param in_shape: Shape of the observations perceived by the learner (the neural net input)
        :param output_shape: Size of the action space (mapped to the number of output neurons)

        :param device_id: Use None to let CNTK select the best available device,
                          -1 for CPU, >= 0 for GPU
                          (default: None)

        :param learning_rate: Learning rate
                              (default: 0.00025, as per Mnih et al.)

        :param momentum: Momentum, provided as momentum value for
                         averaging gradients without unit gain filter
                         Note that CNTK does not currently provide an implementation
                         of Graves' RmsProp with momentum.
                         It uses AdamSGD optimizer instead.
                         (default: 0, no momentum with RProp optimizer)

        :param minibatch_size: Minibatch size
                               (default: 32, as per Mnih et al.)

        :param n_workers: Number of concurrent workers for distributed training.
                          (default: 1, not distributed)

        :param visualizer: Optional visualizer allowing the model to save summary data
                           (default: None, no visualization)

        Ref: Mnih et al.: "Human-level control through deep reinforcement learning."
        Nature 518.7540 (2015): 529-533.
        """

        assert learning_rate > 0, 'learning_rate should be > 0'
        assert 0. <= momentum < 1, 'momentum should be 0 <= momentum < 1'

        QModel.__init__(self, in_shape, output_shape)
        CntkModel.__init__(self, device_id, False, n_workers, visualizer)

        self._nb_actions = output_shape
        self._steps = 0
        self._target_update_interval = update_interval
        self._target = None

        # Input vars
        self._environment = Input(in_shape,
                                  name='env',
                                  dynamic_axes=(Axis.default_batch_axis()))
        self._q_targets = Input(1,
                                name='q_targets',
                                dynamic_axes=(Axis.default_batch_axis()))
        self._actions = Input(output_shape,
                              name='actions',
                              dynamic_axes=(Axis.default_batch_axis()))

        # Define the neural network graph
        self._model = self._build_model()(self._environment)
        self._target = self._model.clone(
            CloneMethod.freeze, {self._environment: self._environment})

        # Define the learning rate
        lr_schedule = as_learning_rate_by_sample(learning_rate, minibatch_size,
                                                 momentum, True)
        lr_schedule = learning_rate_schedule(lr_schedule, UnitType.sample)

        # AdamSGD optimizer
        m_schedule = momentum_schedule(momentum)
        vm_schedule = momentum_schedule(0.999)
        l_sgd = adam(self._model.parameters,
                     lr_schedule,
                     momentum=m_schedule,
                     unit_gain=True,
                     variance_momentum=vm_schedule)

        if self.distributed_training:
            raise NotImplementedError('ASGD not implemented yet.')

        # _actions is a sparse 1-hot encoding of the actions done by the agent
        q_acted = reduce_sum(self._model * self._actions, axis=0)

        # Define the trainer with Huber Loss function
        criterion = huber_loss(q_acted, self._q_targets, 1.0)

        self._learner = l_sgd
        self._trainer = Trainer(self._model, (criterion, None), l_sgd)

    @property
    def loss_val(self):
        return self._trainer.previous_minibatch_loss_average

    def _build_model(self):
        with default_options(init=he_uniform(), activation=relu, bias=True):
            model = Sequential([
                Convolution((8, 8), 32, strides=(4, 4)),
                Convolution((4, 4), 64, strides=(2, 2)),
                Convolution((3, 3), 64, strides=(1, 1)),
                Dense(512, init=he_uniform(0.01)),
                Dense(self._nb_actions, activation=None, init=he_uniform(0.01))
            ])
            return model

    def train(self, x, q_value_targets, actions=None):
        assert actions is not None, 'actions cannot be None'

        # We need to add an extra dimension: shape [N] => [N, 1]
        if check_rank(q_value_targets.shape, 1):
            q_value_targets = q_value_targets.reshape((-1, 1))

        # Add extra dimensions to match shape [N, 1] required by one_hot
        if check_rank(actions.shape, 1):
            actions = actions.reshape((-1, 1))

        # We need batch axis
        if check_rank(x.shape, len(self._environment.shape)):
            x = prepend_batch_axis(x)

        self._trainer.train_minibatch({
            self._environment:
            x,
            self._actions:
            Value.one_hot(actions, self._nb_actions),
            self._q_targets:
            q_value_targets
        })

        # Count the number of train calls
        self._steps += 1

        # Refresh the target network from the current model
        if (self._steps % self._target_update_interval) == 0:
            self._target = self._model.clone(
                CloneMethod.freeze, {self._environment: self._environment})

    def evaluate(self, data, model=QModel.ACTION_VALUE_NETWORK):
        # If evaluating a single sample, expand the minibatch axis
        # (minibatch = 1, input_shape...)
        if len(data.shape) == len(self.input_shape):
            data = prepend_batch_axis(data)  # Prepend minibatch dim

        if model == QModel.TARGET_NETWORK:
            predictions = self._target.eval({self._environment: data})
        else:
            predictions = self._model.eval({self._environment: data})
        return predictions.squeeze()
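
A usage sketch for QNeuralNetwork, driving train() and evaluate() with numpy arrays shaped the way those methods expect; all shapes and values below are placeholders, not taken from the original project.

import numpy as np

# Placeholder shapes: four stacked 84x84 frames in, six discrete actions out.
model = QNeuralNetwork(in_shape=(4, 84, 84), output_shape=6)

states = np.random.rand(32, 4, 84, 84).astype(np.float32)   # minibatch of observations
q_targets = np.random.rand(32).astype(np.float32)           # one scalar target per sample
actions = np.random.randint(0, 6, size=32)                  # action indices, one-hot encoded inside train()

model.train(states, q_targets, actions=actions)
q_values = model.evaluate(states[0])                        # Q-values for a single observation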
Example #18
def simple_mnist(tensorboard_logdir=None):
    input_dim = 784
    num_output_classes = 10
    num_hidden_layers = 1
    hidden_layers_dim = 200

    # Input variables denoting the features and label data
    feature = C.input_variable(input_dim, np.float32)
    label = C.input_variable(num_output_classes, np.float32)

    # Instantiate the feedforward classification model
    scaled_input = element_times(constant(0.00390625), feature)

    z = Sequential([For(range(num_hidden_layers), lambda i: Dense(hidden_layers_dim, activation=relu)),
                    Dense(num_output_classes)])(scaled_input)

    ce = cross_entropy_with_softmax(z, label)
    pe = classification_error(z, label)

    data_dir = os.path.join(abs_path, "..", "..", "..", "DataSets", "MNIST")

    path = os.path.normpath(os.path.join(data_dir, "Train-28x28_cntk_text.txt"))
    check_path(path)

    reader_train = create_reader(path, True, input_dim, num_output_classes)

    input_map = {
        feature  : reader_train.streams.features,
        label  : reader_train.streams.labels
    }

    # Training config
    minibatch_size = 64
    num_samples_per_sweep = 60000
    num_sweeps_to_train_with = 10

    # Instantiate progress writers.
    #training_progress_output_freq = 100
    progress_writers = [ProgressPrinter(
        #freq=training_progress_output_freq,
        tag='Training',
        num_epochs=num_sweeps_to_train_with)]

    if tensorboard_logdir is not None:
        progress_writers.append(TensorBoardProgressWriter(freq=10, log_dir=tensorboard_logdir, model=z))

    # Instantiate the trainer object to drive the model training
    lr = learning_parameter_schedule_per_sample(1)
    trainer = Trainer(z, (ce, pe), adadelta(z.parameters, lr), progress_writers)

    training_session(
        trainer=trainer,
        mb_source = reader_train,
        mb_size = minibatch_size,
        model_inputs_to_streams = input_map,
        max_samples = num_samples_per_sweep * num_sweeps_to_train_with,
        progress_frequency=num_samples_per_sweep
    ).train()

    # Load test data
    path = os.path.normpath(os.path.join(data_dir, "Test-28x28_cntk_text.txt"))
    check_path(path)

    reader_test = create_reader(path, False, input_dim, num_output_classes)

    input_map = {
        feature  : reader_test.streams.features,
        label  : reader_test.streams.labels
    }

    # Test data for trained model
    C.debugging.start_profiler()
    C.debugging.enable_profiler()
    C.debugging.set_node_timing(True)
    #C.cntk_py.disable_cpueval_optimization() # uncomment this to check CPU eval perf without optimization

    test_minibatch_size = 1024
    num_samples = 10000
    num_minibatches_to_test = num_samples / test_minibatch_size
    test_result = 0.0
    for i in range(0, int(num_minibatches_to_test)):
        mb = reader_test.next_minibatch(test_minibatch_size, input_map=input_map)
        eval_error = trainer.test_minibatch(mb)
        test_result = test_result + eval_error

    C.debugging.stop_profiler()
    trainer.print_node_timing()

    # Average of evaluation errors of all test minibatches
    return test_result / num_minibatches_to_test
def simple_mnist():
    input_dim = 784
    num_output_classes = 10
    num_hidden_layers = 2
    hidden_layers_dim = 200

    # Input variables denoting the features and label data
    feature = C.input_variable(input_dim)
    label = C.input_variable(num_output_classes)

    # Instantiate the feedforward classification model
    scaled_input = element_times(constant(0.00390625), feature)

    # z = Sequential([
    #     Dense(hidden_layers_dim, activation=relu),
    #     Dense(hidden_layers_dim, activation=relu),
    #     Dense(num_output_classes)])(scaled_input)

    with default_options(activation=relu, init=C.glorot_uniform()):
        z = Sequential([
            For(range(num_hidden_layers), lambda i: Dense(hidden_layers_dim)),
            Dense(num_output_classes, activation=None)
        ])(scaled_input)

    ce = cross_entropy_with_softmax(z, label)
    pe = classification_error(z, label)

    # setup the data
    path = abs_path + "\\Train-28x28_cntk_text.txt"

    reader_train = MinibatchSource(
        CTFDeserializer(
            path,
            StreamDefs(features=StreamDef(field='features', shape=input_dim),
                       labels=StreamDef(field='labels',
                                        shape=num_output_classes))))

    input_map = {
        feature: reader_train.streams.features,
        label: reader_train.streams.labels
    }

    # Training config
    minibatch_size = 64
    num_samples_per_sweep = 60000
    num_sweeps_to_train_with = 10

    # Instantiate progress writers.
    progress_writers = [
        ProgressPrinter(tag='Training', num_epochs=num_sweeps_to_train_with)
    ]

    # Instantiate the trainer object to drive the model training
    lr = learning_rate_schedule(1, UnitType.sample)
    trainer = Trainer(z, (ce, pe), [adadelta(z.parameters, lr)],
                      progress_writers)

    training_session(trainer=trainer,
                     mb_source=reader_train,
                     mb_size=minibatch_size,
                     model_inputs_to_streams=input_map,
                     max_samples=num_samples_per_sweep *
                     num_sweeps_to_train_with,
                     progress_frequency=num_samples_per_sweep).train()

    # Load test data
    path = abs_path + "\Test-28x28_cntk_text.txt"

    reader_test = MinibatchSource(
        CTFDeserializer(
            path,
            StreamDefs(features=StreamDef(field='features', shape=input_dim),
                       labels=StreamDef(field='labels',
                                        shape=num_output_classes))))

    input_map = {
        feature: reader_test.streams.features,
        label: reader_test.streams.labels
    }

    # Test data for trained model
    test_minibatch_size = 1024
    num_samples = 10000
    num_minibatches_to_test = num_samples / test_minibatch_size
    test_result = 0.0
    for i in range(0, int(num_minibatches_to_test)):
        mb = reader_test.next_minibatch(test_minibatch_size,
                                        input_map=input_map)
        eval_error = trainer.test_minibatch(mb)
        test_result = test_result + eval_error

    # Average of evaluation errors of all test minibatches
    return test_result / num_minibatches_to_test
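
A quick way to exercise this function is a small driver that calls it and reports the averaged test error. The sketch below is an assumption about how the surrounding script might be run (it presumes the module-level abs_path and the MNIST CTF files referenced above exist):

if __name__ == '__main__':
    # Hypothetical driver: simple_mnist() returns the mean classification error
    # over the test minibatches (0.0 = perfect, 1.0 = always wrong).
    error = simple_mnist()
    print("Average test error: {0:.2%}".format(error))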
Exemplo n.º 20
0
class DeepQAgent(object):
    """
    Implementation of a Deep Q-Network agent, as described in:
        Nature 518, "Human-level control through deep reinforcement learning" (Mnih et al., 2015)
    """
    def __init__(self, input_shape, nb_actions,
                 gamma=0.99, explorer=LinearEpsilonAnnealingExplorer(1, 0.1, 1000000),
                 learning_rate=0.00025, momentum=0.95, minibatch_size=32,
                 memory_size=500000, train_after=10000, train_interval=4, target_update_interval=10000,
                 monitor=True):
        self.input_shape = input_shape
        self.nb_actions = nb_actions
        self.gamma = gamma

        self._train_after = train_after
        self._train_interval = train_interval
        self._target_update_interval = target_update_interval

        self._explorer = explorer
        self._minibatch_size = minibatch_size
        self._history = History(input_shape)
        self._memory = ReplayMemory(memory_size, input_shape[1:], 4)
        self._num_actions_taken = 0

        # Metrics accumulator
        self._episode_rewards, self._episode_q_means, self._episode_q_stddev = [], [], []

        # Action Value model (used by agent to interact with the environment)
        with default_options(activation=relu, init=he_uniform()):
            self._action_value_net = Sequential([
                Convolution2D((8, 8), 16, strides=4),
                Convolution2D((4, 4), 32, strides=2),
                Convolution2D((3, 3), 32, strides=1),
                Dense(256, init=he_uniform(scale=0.01)),
                Dense(nb_actions, activation=None, init=he_uniform(scale=0.01))
            ])
        self._action_value_net.update_signature(Tensor[input_shape])

        # Target model used to compute the target Q-values in training, updated
        # less frequently for increased stability.
        self._target_net = self._action_value_net.clone(CloneMethod.freeze)

        # Function computing Q-values targets as part of the computation graph
        @Function
        @Signature(post_states=Tensor[input_shape], rewards=Tensor[()], terminals=Tensor[()])
        def compute_q_targets(post_states, rewards, terminals):
            return element_select(
                terminals,
                rewards,
                gamma * reduce_max(self._target_net(post_states), axis=0) + rewards,
            )

        # Define the loss, using Huber Loss (more robust to outliers)
        @Function
        @Signature(pre_states=Tensor[input_shape], actions=Tensor[nb_actions],
                   post_states=Tensor[input_shape], rewards=Tensor[()], terminals=Tensor[()])
        def criterion(pre_states, actions, post_states, rewards, terminals):
            # Compute the q_targets
            q_targets = compute_q_targets(post_states, rewards, terminals)

            # actions is a 1-hot encoding of the action done by the agent
            q_acted = reduce_sum(self._action_value_net(pre_states) * actions, axis=0)

            # Define training criterion as the Huber Loss function
            return huber_loss(q_targets, q_acted, 1.0)

        # Adam based SGD
        lr_schedule = learning_rate_schedule(learning_rate, UnitType.minibatch)
        m_schedule = momentum_schedule(momentum)
        vm_schedule = momentum_schedule(0.999)
        l_sgd = adam(self._action_value_net.parameters, lr_schedule,
                     momentum=m_schedule, variance_momentum=vm_schedule)

        self._metrics_writer = TensorBoardProgressWriter(freq=1, log_dir='metrics', model=criterion) if monitor else None
        self._learner = l_sgd
        self._trainer = Trainer(criterion, (criterion, None), l_sgd, self._metrics_writer)

    def act(self, state):
        """ This allows the agent to select the next action to perform in regard of the current state of the environment.
        It follows the terminology used in the Nature paper.

        Attributes:
            state (Tensor[input_shape]): The current environment state

        Returns: Int >= 0 : Next action to do
        """
        # Append the state to the short term memory (ie. History)
        self._history.append(state)

        # If policy requires agent to explore, sample random action
        if self._explorer.is_exploring(self._num_actions_taken):
            action = self._explorer(self.nb_actions)
        else:
            # Use the network to output the best action
            env_with_history = self._history.value
            q_values = self._action_value_net.eval(
                # Append batch axis with only one sample to evaluate
                env_with_history.reshape((1,) + env_with_history.shape)
            )

            self._episode_q_means.append(np.mean(q_values))
            self._episode_q_stddev.append(np.std(q_values))

            # Return the value maximizing the expected reward
            action = q_values.argmax()

        # Keep track of interval action counter
        self._num_actions_taken += 1
        return action

    def observe(self, old_state, action, reward, done):
        """ This allows the agent to observe the output of doing the action it selected through act() on the old_state

        Attributes:
            old_state (Tensor[input_shape]): Previous environment state
            action (int): Action done by the agent
            reward (float): Reward for doing this action in the old_state environment
            done (bool): Whether the action ended the episode
        """
        self._episode_rewards.append(reward)

        # If done, reset short term memory (ie. History)
        if done:
            # Plot the metrics through Tensorboard and reset buffers
            if self._metrics_writer is not None:
                self._plot_metrics()
            self._episode_rewards, self._episode_q_means, self._episode_q_stddev = [], [], []

            # Reset the short term memory
            self._history.reset()

        # Append to long term memory
        self._memory.append(old_state, action, reward, done)

    def train(self):
        """ This allows the agent to train itself to better understand the environment dynamics.
        The agent will compute the expected reward for the state(t+1)
        and update the expected reward at step t according to this.

        The target expectation is computed through the Target Network, a frozen copy of the
        Action Value Network that is refreshed at regular intervals to stabilize training.
        """

        agent_step = self._num_actions_taken

        if agent_step >= self._train_after:
            if (agent_step % self._train_interval) == 0:
                pre_states, actions, post_states, rewards, terminals = self._memory.minibatch(self._minibatch_size)

                self._trainer.train_minibatch(
                    self._trainer.loss_function.argument_map(
                        pre_states=pre_states,
                        actions=Value.one_hot(actions.reshape(-1, 1).tolist(), self.nb_actions),
                        post_states=post_states,
                        rewards=rewards,
                        terminals=terminals
                    )
                )

                # Update the Target Network if needed
                if (agent_step % self._target_update_interval) == 0:
                    self._target_net = self._action_value_net.clone(CloneMethod.freeze)
                    filename = "models\model%d" % agent_step
                    self._trainer.save_checkpoint(filename)

    def _plot_metrics(self):
        """Plot current buffers accumulated values to visualize agent learning
        """
        if len(self._episode_q_means) > 0:
            mean_q = np.asscalar(np.mean(self._episode_q_means))
            self._metrics_writer.write_value('Mean Q per ep.', mean_q, self._num_actions_taken)

        if len(self._episode_q_stddev) > 0:
            std_q = np.asscalar(np.mean(self._episode_q_stddev))
            self._metrics_writer.write_value('Mean Std Q per ep.', std_q, self._num_actions_taken)

        self._metrics_writer.write_value('Sum rewards per ep.', sum(self._episode_rewards), self._num_actions_taken)
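
The act/observe/train methods above are meant to be driven by an outer interaction loop. A minimal sketch of such a loop, assuming a Gym-style environment object env with reset()/step() and an already constructed agent (both names are assumptions, not part of the example above):

NUM_STEPS = 1000000  # assumed training budget

state = env.reset()
for step in range(NUM_STEPS):
    action = agent.act(state)                      # epsilon-greedy action selection
    new_state, reward, done, _ = env.step(action)  # advance the environment one step
    agent.observe(state, action, reward, done)     # store the transition in replay memory
    agent.train()                                  # trains every _train_interval steps after warm-up
    state = env.reset() if done else new_state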
Exemplo n.º 21
0
class DeepQAgent(object):
    """
    Implementation of a Deep Q-Network agent, as described in:
        Nature 518, "Human-level control through deep reinforcement learning" (Mnih et al., 2015)
    """
    def __init__(self,
                 input_shape,
                 nb_actions,
                 gamma=0.99,
                 explorer=LinearEpsilonAnnealingExplorer(1, 0.1, 1000000),
                 learning_rate=0.00025,
                 momentum=0.95,
                 minibatch_size=32,
                 memory_size=500000,
                 train_after=200000,
                 train_interval=4,
                 target_update_interval=10000,
                 monitor=True):
        self.input_shape = input_shape
        self.nb_actions = nb_actions
        self.gamma = gamma

        self._train_after = train_after
        self._train_interval = train_interval
        self._target_update_interval = target_update_interval

        self._explorer = explorer
        self._minibatch_size = minibatch_size
        self._history = History(input_shape)
        self._memory = ReplayMemory(memory_size, input_shape[1:], 4)
        self._num_actions_taken = 0

        # Metrics accumulator
        self._episode_rewards, self._episode_q_means, self._episode_q_stddev = [], [], []

        # Action Value model (used by agent to interact with the environment)
        with default_options(activation=relu, init=he_uniform()):
            self._action_value_net = Sequential([
                Convolution2D((8, 8), 16, strides=4),
                Convolution2D((4, 4), 32, strides=2),
                Convolution2D((3, 3), 32, strides=1),
                Dense(256, init=he_uniform(scale=0.01)),
                Dense(nb_actions, activation=None, init=he_uniform(scale=0.01))
            ])
        self._action_value_net.update_signature(Tensor[input_shape])

        # Target model used to compute the target Q-values in training, updated
        # less frequently for increased stability.
        self._target_net = self._action_value_net.clone(CloneMethod.freeze)

        # Function computing Q-values targets as part of the computation graph
        @Function
        @Signature(post_states=Tensor[input_shape],
                   rewards=Tensor[()],
                   terminals=Tensor[()])
        def compute_q_targets(post_states, rewards, terminals):
            return element_select(
                terminals,
                rewards,
                gamma * reduce_max(self._target_net(post_states), axis=0) +
                rewards,
            )

        # Define the loss, using Huber Loss (more robust to outliers)
        @Function
        @Signature(pre_states=Tensor[input_shape],
                   actions=Tensor[nb_actions],
                   post_states=Tensor[input_shape],
                   rewards=Tensor[()],
                   terminals=Tensor[()])
        def criterion(pre_states, actions, post_states, rewards, terminals):
            # Compute the q_targets
            q_targets = compute_q_targets(post_states, rewards, terminals)

            # actions is a 1-hot encoding of the action done by the agent
            q_acted = reduce_sum(self._action_value_net(pre_states) * actions,
                                 axis=0)

            # Define training criterion as the Huber Loss function
            return huber_loss(q_targets, q_acted, 1.0)

        # Adam based SGD
        lr_schedule = learning_rate_schedule(learning_rate, UnitType.minibatch)
        m_schedule = momentum_schedule(momentum)
        vm_schedule = momentum_schedule(0.999)
        l_sgd = adam(self._action_value_net.parameters,
                     lr_schedule,
                     momentum=m_schedule,
                     variance_momentum=vm_schedule)

        self._metrics_writer = TensorBoardProgressWriter(
            freq=1, log_dir='metrics', model=criterion) if monitor else None
        self._learner = l_sgd
        self._trainer = Trainer(criterion, (criterion, None), l_sgd,
                                self._metrics_writer)

    def load(self, model_path):

        self._trainer.restore_from_checkpoint(model_path)

    def act(self, state, eval=False):
        """ This allows the agent to select the next action to perform in regard of the current state of the environment.
        It follows the terminology used in the Nature paper.

        Attributes:
            state (Tensor[input_shape]): The current environment state

        Returns: Int >= 0 : Next action to do
        """
        # Append the state to the short term memory (ie. History)
        self._history.append(state)

        # If policy requires agent to explore, sample random action
        if self._explorer.is_exploring(self._num_actions_taken) and not eval:
            action = self._explorer(self.nb_actions)
            q_values = None
        else:
            # Use the network to output the best action
            env_with_history = self._history.value
            q_values = self._action_value_net.eval(
                # Append batch axis with only one sample to evaluate
                env_with_history.reshape((1, ) + env_with_history.shape))

            self._episode_q_means.append(np.mean(q_values))
            self._episode_q_stddev.append(np.std(q_values))

            # Return the value maximizing the expected reward
            action = q_values.argmax()

        # Keep track of interval action counter
        self._num_actions_taken += 1
        return action, q_values

    def observe(self, old_state, action, reward, done):
        """ This allows the agent to observe the output of doing the action it selected through act() on the old_state

        Attributes:
            old_state (Tensor[input_shape]): Previous environment state
            action (int): Action done by the agent
            reward (float): Reward for doing this action in the old_state environment
            done (bool): Whether the action ended the episode
        """
        self._episode_rewards.append(reward)

        # If done, reset short term memory (ie. History)
        if done:
            # Plot the metrics through Tensorboard and reset buffers
            if self._metrics_writer is not None:
                self._plot_metrics()
            self._episode_rewards, self._episode_q_means, self._episode_q_stddev = [], [], []

            # Reset the short term memory
            self._history.reset()

        # Append to long term memory
        self._memory.append(old_state, action, reward, done)

    def train(self, checkpoint_dir):
        """ This allows the agent to train itself to better understand the environment dynamics.
        The agent will compute the expected reward for the state(t+1)
        and update the expected reward at step t according to this.

        The target expectation is computed through the Target Network, a frozen copy of the
        Action Value Network that is refreshed at regular intervals to stabilize training.
        """

        agent_step = self._num_actions_taken

        if agent_step >= self._train_after:
            if (agent_step % self._train_interval) == 0:
                #print('training... number of steps: {}'.format(agent_step))

                pre_states, actions, post_states, rewards, terminals = self._memory.minibatch(
                    self._minibatch_size)
                self._trainer.train_minibatch(
                    self._trainer.loss_function.argument_map(
                        pre_states=pre_states,
                        actions=Value.one_hot(
                            actions.reshape(-1, 1).tolist(), self.nb_actions),
                        post_states=post_states,
                        rewards=rewards,
                        terminals=terminals))

                # Update the Target Network if needed
                if (agent_step % self._target_update_interval) == 0:
                    self._target_net = self._action_value_net.clone(
                        CloneMethod.freeze)
                    filename = os.path.join(checkpoint_dir, "models",
                                            "model%d" % agent_step)
                    self._trainer.save_checkpoint(filename)

    def _plot_metrics(self):
        """Plot current buffers accumulated values to visualize agent learning
        """
        if len(self._episode_q_means) > 0:
            mean_q = np.asscalar(np.mean(self._episode_q_means))
            self._metrics_writer.write_value('Mean Q per ep.', mean_q,
                                             self._num_actions_taken)

        if len(self._episode_q_stddev) > 0:
            std_q = np.asscalar(np.mean(self._episode_q_stddev))
            self._metrics_writer.write_value('Mean Std Q per ep.', std_q,
                                             self._num_actions_taken)

        self._metrics_writer.write_value('Sum rewards per ep.',
                                         sum(self._episode_rewards),
                                         self._num_actions_taken)

    def get_depth_image(self, client):
        # get depth image from airsim
        responses = client.simGetImages([
            airsim.ImageRequest("RCCamera", airsim.ImageType.DepthPerspective,
                                True, False)
        ])

        img1d = np.array(responses[0].image_data_float, dtype=np.float)
        img1d = 255 / np.maximum(np.ones(img1d.size), img1d)
        if img1d.size > 1:

            img2d = np.reshape(img1d,
                               (responses[0].height, responses[0].width))

            image = Image.fromarray(img2d)
            im_final = np.array(image.resize((84, 84)).convert('L'))
            im_final = im_final / 255.0

            return im_final

        return np.zeros((84, 84)).astype(float)

    # Gets a coverage image from AirSim
    def get_cov_image(self, coverage_map):

        state, cov_reward = coverage_map.get_state_from_pose()
        #state = self.coverage_map.get_map_scaled()

        # debug only
        #im = Image.fromarray(np.uint8(state))
        #im.save("DistributedRL\\debug\\{}.png".format(time.time()))

        # normalize state
        state = state / 255.0

        return state, cov_reward
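
The depth preprocessing in get_depth_image can be checked in isolation: closer obstacles map to brighter pixels (255 / depth, capped at 255), the image is resized to 84x84, and the result is normalized to [0, 1]. A small sketch with a synthetic depth buffer (no AirSim connection required; the buffer values and its 144x256 shape are made up for illustration):

import numpy as np
from PIL import Image

# Synthetic depth buffer in metres (values are illustrative only).
depth_m = np.random.uniform(0.5, 50.0, size=(144, 256)).astype(np.float32)

# Same transform as get_depth_image: closer obstacles -> brighter pixels, capped at 255.
img2d = 255.0 / np.maximum(np.ones_like(depth_m), depth_m)
im_final = np.array(Image.fromarray(img2d).resize((84, 84)).convert('L')) / 255.0
print(im_final.shape, im_final.min(), im_final.max())  # (84, 84), values within [0, 1]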
Exemplo n.º 22
0
def simple_mnist(tensorboard_logdir=None):
    input_dim = 784
    num_output_classes = 10
    num_hidden_layers = 1
    hidden_layers_dim = 200

    # Input variables denoting the features and label data
    feature = input(input_dim, np.float32)
    label = input(num_output_classes, np.float32)

    # Instantiate the feedforward classification model
    scaled_input = element_times(constant(0.00390625), feature)
    z = fully_connected_classifier_net(scaled_input, num_output_classes,
                                       hidden_layers_dim, num_hidden_layers,
                                       relu)

    ce = cross_entropy_with_softmax(z, label)
    pe = classification_error(z, label)

    data_dir = os.path.join(abs_path, "..", "..", "..", "DataSets", "MNIST")

    path = os.path.normpath(os.path.join(data_dir,
                                         "Train-28x28_cntk_text.txt"))
    check_path(path)

    reader_train = create_reader(path, True, input_dim, num_output_classes)

    input_map = {
        feature: reader_train.streams.features,
        label: reader_train.streams.labels
    }

    # Training config
    minibatch_size = 64
    num_samples_per_sweep = 60000
    num_sweeps_to_train_with = 10

    # Instantiate progress writers.
    #training_progress_output_freq = 100
    progress_writers = [
        ProgressPrinter(
            #freq=training_progress_output_freq,
            tag='Training',
            num_epochs=num_sweeps_to_train_with)
    ]

    if tensorboard_logdir is not None:
        progress_writers.append(
            TensorBoardProgressWriter(freq=10,
                                      log_dir=tensorboard_logdir,
                                      model=z))

    # Instantiate the trainer object to drive the model training
    trainer = Trainer(z, (ce, pe), adadelta(z.parameters), progress_writers)

    training_session(trainer=trainer,
                     mb_source=reader_train,
                     mb_size=minibatch_size,
                     var_to_stream=input_map,
                     max_samples=num_samples_per_sweep *
                     num_sweeps_to_train_with,
                     progress_frequency=num_samples_per_sweep).train()

    # Load test data
    path = os.path.normpath(os.path.join(data_dir, "Test-28x28_cntk_text.txt"))
    check_path(path)

    reader_test = create_reader(path, False, input_dim, num_output_classes)

    input_map = {
        feature: reader_test.streams.features,
        label: reader_test.streams.labels
    }

    # Test data for trained model
    test_minibatch_size = 1024
    num_samples = 10000
    num_minibatches_to_test = num_samples / test_minibatch_size
    test_result = 0.0
    for i in range(0, int(num_minibatches_to_test)):
        mb = reader_test.next_minibatch(test_minibatch_size,
                                        input_map=input_map)
        eval_error = trainer.test_minibatch(mb)
        test_result = test_result + eval_error

    # Average of evaluation errors of all test minibatches
    return test_result / num_minibatches_to_test
Exemplo n.º 23
0
class DeepQAgent(object):
    """
    Implementation of a Deep Q-Network agent, as described in:
        Nature 518, "Human-level control through deep reinforcement learning" (Mnih et al., 2015)
    """
    def __init__(self,
                 input_shape,
                 nb_actions,
                 gamma=0.99,
                 explorer=LinearEpsilonAnnealingExplorer(0.91, 0.1, 910000),
                 fixpolicy=LinearEpsilonAnnealingExplorer(0.5, 0.1, 100000),
                 learning_rate=0.00025,
                 momentum=0.95,
                 minibatch_size=32,
                 memory_size=500000,
                 train_after=10000,
                 train_interval=4,
                 target_update_interval=10000,
                 monitor=True):
        self.input_shape = input_shape
        self.nb_actions = nb_actions
        self.gamma = gamma

        self._train_after = train_after
        self._train_interval = train_interval
        self._target_update_interval = target_update_interval

        self._explorer = explorer
        self._fixpolicy = fixpolicy
        self._minibatch_size = minibatch_size
        self._history = History(input_shape)
        print("input_shape:", input_shape)
        print("input_shape[1:]", input_shape[1:])
        self._memory = ReplayMemory(memory_size, input_shape[1:], 4)
        self._num_actions_taken = 0

        # Metrics accumulator
        self._episode_rewards, self._episode_q_means, self._episode_q_stddev = [], [], []

        # Action Value model (used by agent to interact with the environment)
        with default_options(activation=relu, init=he_uniform()):
            self._action_value_net = Sequential([
                #Convolution2D((8, 8), 16, strides=4),
                #Convolution2D((4, 4), 32, strides=2),
                #Convolution2D((1, 1), 16, strides=1),
                Dense(25, init=he_uniform(scale=0.01)),
                Dense(nb_actions, activation=None, init=he_uniform(scale=0.01))
            ])
        self._action_value_net.update_signature(Tensor[input_shape])

        # Target model used to compute the target Q-values in training, updated
        # less frequently for increased stability.
        self._target_net = self._action_value_net.clone(CloneMethod.freeze)

        # Function computing Q-values targets as part of the computation graph
        @Function
        @Signature(post_states=Tensor[input_shape],
                   rewards=Tensor[()],
                   terminals=Tensor[()])
        def compute_q_targets(post_states, rewards, terminals):
            return element_select(
                terminals,
                rewards,
                gamma * reduce_max(self._target_net(post_states), axis=0) +
                rewards,
            )

        # Define the loss, using Huber Loss (more robust to outliers)
        @Function
        @Signature(pre_states=Tensor[input_shape],
                   actions=Tensor[nb_actions],
                   post_states=Tensor[input_shape],
                   rewards=Tensor[()],
                   terminals=Tensor[()])
        def criterion(pre_states, actions, post_states, rewards, terminals):
            # Compute the q_targets
            q_targets = compute_q_targets(post_states, rewards, terminals)

            # actions is a 1-hot encoding of the action done by the agent
            q_acted = reduce_sum(self._action_value_net(pre_states) * actions,
                                 axis=0)

            # Define training criterion as the Huber Loss function
            return huber_loss(q_targets, q_acted, 1.0)

        # Adam based SGD
        lr_schedule = learning_rate_schedule(learning_rate, UnitType.minibatch)
        m_schedule = momentum_schedule(momentum)
        vm_schedule = momentum_schedule(0.999)
        l_sgd = adam(self._action_value_net.parameters,
                     lr_schedule,
                     momentum=m_schedule,
                     variance_momentum=vm_schedule)

        self._metrics_writer = TensorBoardProgressWriter(
            freq=1, log_dir='metrics', model=criterion) if monitor else None
        self._learner = l_sgd
        self._trainer = Trainer(criterion, (criterion, None), l_sgd,
                                self._metrics_writer)
        #self._trainer.restore_from_checkpoint("models_heuristic_no_image\model")
    def act(self, state):
        """ This allows the agent to select the next action to perform in regard of the current state of the environment.
        It follows the terminology used in the Nature paper.

        Attributes:
            state (Tensor[input_shape]): The current environment state

        Returns: Int >= 0 : Next action to do
        """
        # Append the state to the short term memory (ie. History)
        self._history.append(state)
        #if True:
        if self._fixpolicy.is_exploring(self._num_actions_taken):
            diff_x = state[3] - state[0]
            diff_y = state[4] - state[1]
            diff_z = state[5] - state[2]
            diff_arr = np.array([diff_x, diff_y, diff_z])
            direction = np.argmax(np.absolute(diff_arr))
            '''
            abs_x = math.fabs(diff_x)
            abs_y = math.fabs(diff_y)
            abs_z = math.fabs(diff_z)
            diff = [diff_x, diff_y, diff_z]
            abs_diff = [abs_x, abs_y, abs_z]
            print(diff, abs_diff)
            m = max(abs_diff)
            direction = diff.index(m)'''
            print(diff_arr)
            if diff_arr[direction] < 0:
                fixaction = direction + 4
            else:
                fixaction = direction + 1

            self._num_actions_taken += 1
            return fixaction

        # If policy requires agent to explore, sample random action
        if self._explorer.is_exploring(self._num_actions_taken):
            action = self._explorer(self.nb_actions)
        else:
            # Use the network to output the best action
            env_with_history = self._history.value
            q_values = self._action_value_net.eval(
                # Append batch axis with only one sample to evaluate
                env_with_history.reshape((1, ) + env_with_history.shape))

            self._episode_q_means.append(np.mean(q_values))
            self._episode_q_stddev.append(np.std(q_values))

            # Return the value maximizing the expected reward
            action = q_values.argmax()

        # Keep track of interval action counter
        self._num_actions_taken += 1
        return action

    def observe(self, old_state, action, reward, done):
        """ This allows the agent to observe the output of doing the action it selected through act() on the old_state

        Attributes:
            old_state (Tensor[input_shape]): Previous environment state
            action (int): Action done by the agent
            reward (float): Reward for doing this action in the old_state environment
            done (bool): Whether the action ended the episode
        """
        self._episode_rewards.append(reward)

        # If done, reset short term memory (ie. History)
        if done:
            # Plot the metrics through Tensorboard and reset buffers
            if self._metrics_writer is not None:
                self._plot_metrics()
            self._episode_rewards, self._episode_q_means, self._episode_q_stddev = [], [], []

            # Reset the short term memory
            self._history.reset()

        # Append to long term memory
        self._memory.append(old_state, action, reward, done)

    def train(self):
        """ This allows the agent to train itself to better understand the environment dynamics.
        The agent will compute the expected reward for the state(t+1)
        and update the expected reward at step t according to this.

        The target expectation is computed through the Target Network, a frozen copy of the
        Action Value Network that is refreshed at regular intervals to stabilize training.
        """

        agent_step = self._num_actions_taken
        print("agent_step = ", agent_step)
        #time.sleep(1)
        if agent_step >= self._train_after:
            if (agent_step % self._train_interval) == 0:
                pre_states, actions, post_states, rewards, terminals = self._memory.minibatch(
                    self._minibatch_size)

                self._trainer.train_minibatch(
                    self._trainer.loss_function.argument_map(
                        pre_states=pre_states,
                        actions=Value.one_hot(
                            actions.reshape(-1, 1).tolist(), self.nb_actions),
                        post_states=post_states,
                        rewards=rewards,
                        terminals=terminals))

                # Update the Target Network if needed
                if (agent_step % self._target_update_interval) == 0:
                    self._target_net = self._action_value_net.clone(
                        CloneMethod.freeze)
                    filename = "models_heuristic_no_image_less_exploration\model%d" % agent_step
                    print(
                        "$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$filename=",
                        filename)
                    self._trainer.save_checkpoint(filename)
                    #time.sleep(100)

    def _plot_metrics(self):
        """Plot the values accumulated in the metric buffers to visualize agent learning.
        """
        global landing_count, episode_count
        f = open('log__heuristic_no_image_less_exploration2', 'a+')
        f.write('episode:' + str(episode_count) + ': exploration rate= ' +
                str(self._explorer._rate) + ' heuristic fix rate= ' +
                str(self._fixpolicy._rate) + '\n')
        if len(self._episode_q_means) > 0:
            mean_q = np.asscalar(np.mean(self._episode_q_means))
            self._metrics_writer.write_value('Mean Q per ep.', mean_q,
                                             self._num_actions_taken)
            print('Mean Q per ep.', mean_q, self._num_actions_taken)
            f.write('Mean Q per ep. ' + str(mean_q) + ' ' +
                    str(self._num_actions_taken) + '\n')

        if len(self._episode_q_stddev) > 0:
            std_q = np.asscalar(np.mean(self._episode_q_stddev))
            self._metrics_writer.write_value('Mean Std Q per ep.', std_q,
                                             self._num_actions_taken)
            print('Mean Std Q per ep.', std_q, self._num_actions_taken)
            f.write('Mean Std Q per ep. ' + str(std_q) + ' ' +
                    str(self._num_actions_taken) + '\n')

        self._metrics_writer.write_value('Sum rewards per ep.',
                                         sum(self._episode_rewards),
                                         self._num_actions_taken)
        print('Sum rewards per ep.', sum(self._episode_rewards),
              self._num_actions_taken)
        f.write('Sum rewards per ep. ' + str(sum(self._episode_rewards)) +
                ' ' + str(self._num_actions_taken) + '\n')
        if landing_count > 0:
            f.write('****************Success landing**********' +
                    str(landing_count) + '\n')
        landing_count = 0
        episode_count = 0

        f.write('\n')
        f.close()
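
The heuristic branch in act() picks the axis with the largest remaining distance to the target and maps it to one of six discrete move actions (direction + 1 for a positive difference, direction + 4 for a negative one). A small standalone sketch of that mapping (the state layout [x, y, z, target_x, target_y, target_z] is an assumption inferred from how the indices are used above):

import numpy as np

def heuristic_action(state):
    # state[0:3] = current position, state[3:6] = target position (assumed layout)
    diff = np.array([state[3] - state[0], state[4] - state[1], state[5] - state[2]])
    direction = int(np.argmax(np.absolute(diff)))   # dominant axis: 0=x, 1=y, 2=z
    return direction + 4 if diff[direction] < 0 else direction + 1

print(heuristic_action([0, 0, 10, 5, 0, 10]))  # -> 1 (move along +x)
print(heuristic_action([0, 0, 10, 0, 0, 2]))   # -> 6 (move along -z)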
Exemplo n.º 24
0
    def __init__(self, input_shape, nb_actions,
                 gamma=0.95, explorer=LinearEpsilonAnnealingExplorer(1, 0.1, 100000),
                 learning_rate=0.01, momentum=0.8, minibatch_size=16,
                 memory_size=15000, train_after=100, train_interval=100, target_update_interval=500,
                 monitor=True):
        self.input_shape = input_shape
        self.nb_actions = nb_actions
        self.gamma = gamma

        self._train_after = train_after
        self._train_interval = train_interval
        self._target_update_interval = target_update_interval

        self._explorer = explorer
        self._minibatch_size = minibatch_size
        self._history = History(input_shape)
        self._memory = ReplayMemory(memory_size, input_shape[1:], 4)
        self._num_actions_taken = 0
        self._num_trains = 0

        # Metrics accumulator
        self._episode_rewards, self._episode_q_means, self._episode_q_stddev = [], [], []

        '''
        # Action Value model (used by agent to interact with the environment)
        with default_options(activation=relu, init=he_uniform()):
            self._action_value_net = Sequential([
                Convolution2D((8, 8), 16, strides=4),
                Convolution2D((4, 4), 32, strides=2),
                Convolution2D((3, 3), 32, strides=1),
                Dense(256, init=he_uniform(scale=0.01)),
                Dense(nb_actions, activation=None, init=he_uniform(scale=0.01))
            ])
        ''' 
        with default_options(activation=relu, init=he_uniform()):
            self._action_value_net = Sequential([
                Dense(7, init=he_uniform(scale=0.01)),
                Dense(8, init=he_uniform(scale=0.01)),
                #Dense(16, init=he_uniform(scale=0.01)),
                #Dense(32, init=he_uniform(scale=0.01)),
                Dense(nb_actions, activation=None, init=he_uniform(scale=0.01))
            ])
        
        self._action_value_net.update_signature(Tensor[input_shape])

        # Target model used to compute the target Q-values in training, updated
        # less frequently for increased stability.
        self._target_net = self._action_value_net.clone(CloneMethod.freeze)

        # Function computing Q-values targets as part of the computation graph
        @Function
        @Signature(post_states=Tensor[input_shape], rewards=Tensor[()], terminals=Tensor[()])
        def compute_q_targets(post_states, rewards, terminals):
            return element_select(
                terminals,
                rewards,
                gamma * reduce_max(self._target_net(post_states), axis=0) + rewards,
            )

        # Define the loss, using Huber Loss (more robust to outliers)
        @Function
        @Signature(pre_states=Tensor[input_shape], actions=Tensor[nb_actions],
                   post_states=Tensor[input_shape], rewards=Tensor[()], terminals=Tensor[()])
        def criterion(pre_states, actions, post_states, rewards, terminals):
            # Compute the q_targets
            q_targets = compute_q_targets(post_states, rewards, terminals)

            # actions is a 1-hot encoding of the action done by the agent
            q_acted = reduce_sum(self._action_value_net(pre_states) * actions, axis=0)

            # Define training criterion as the Huber Loss function
            return huber_loss(q_targets, q_acted, 1.0)

        # Adam based SGD
        lr_schedule = learning_rate_schedule(learning_rate, UnitType.minibatch)
        m_schedule = momentum_schedule(momentum)
        vm_schedule = momentum_schedule(0.999)
        l_sgd = adam(self._action_value_net.parameters, lr_schedule,
                     momentum=m_schedule, variance_momentum=vm_schedule)

        self._metrics_writer = TensorBoardProgressWriter(freq=1, log_dir='metrics', model=criterion) if monitor else None
        self._learner = l_sgd
        self._trainer = Trainer(criterion, (criterion, None), l_sgd, self._metrics_writer)
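
compute_q_targets above implements the standard one-step Q-learning target: the bare reward for terminal transitions, and r + gamma * max_a Q_target(s', a) otherwise. A numpy sketch of the same rule (q_next stands in for the frozen target network's outputs; all values are illustrative):

import numpy as np

gamma = 0.95
rewards   = np.array([1.0, 0.0, -1.0])
terminals = np.array([0.0, 0.0, 1.0])   # 1.0 marks the last transition of an episode
q_next    = np.array([[0.2, 0.8],       # Q_target(s', a) for each transition in the batch
                      [0.5, 0.1],
                      [0.9, 0.3]])

# Equivalent of element_select(terminals, rewards, gamma * reduce_max(...) + rewards)
targets = np.where(terminals > 0, rewards, gamma * q_next.max(axis=1) + rewards)
print(targets)  # [1.76, 0.475, -1.0]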
Exemplo n.º 25
0
class DQAgent(object):
    """docstring for DQAgent"""                                                             ############should modify @!@!
    def __init__(self, input_shape, nb_actions,
                 gamma=0.99, explorer=LinearEpsilonAnnealingExplorer(1, 0.1, 1000000),
                 learning_rate=0.00025, momentum=0.95, minibatch_size=32,
                 memory_size=500000, train_after=10000, train_interval=4,
                 target_update_interval=10000, monitor=True):
        self.input_shape = input_shape
        self.nb_actions = nb_actions
        self.gamma = gamma
        self._train_after = train_after
        self._train_interval = train_interval
        self._target_update_interval = target_update_interval
        self._explorer = explorer
        self._minibatch_size = minibatch_size
        self._history = History(input_shape)
        self._memory = RepMem(memory_size, input_shape[1:], 4)
        self._num_actions_taken = 0
        self._episode_rewards, self._episode_q_means, self._episode_q_stddev = [], [], []

        with default_options(activation=relu, init=he_uniform()):
            self._action_value_net = Sequential([
                Dense(input_shape, init=he_uniform(scale=0.01)),
                Dense(input_shape),
                Dense(nb_actions, activation=None, init=he_uniform(scale=0.01))])

        self._action_value_net.update_signature(Tensor[input_shape])

        self._target_net = self._action_value_net.clone(CloneMethod.freeze)


        @Function
        @Signature(post_states=Tensor[input_shape], rewards=Tensor[()], terminals=Tensor[()])
        def compute_q_targets(post_states, rewards, terminals):
            return element_select(
                terminals,
                rewards,
                gamma * reduce_max(self._target_net(post_states), axis=0) + rewards,
            )

        @Function
        @Signature(pre_states=Tensor[input_shape], actions=Tensor[nb_actions],
                   post_states=Tensor[input_shape], rewards=Tensor[()], terminals=Tensor[()])
        def criterion(pre_states, actions, post_states, rewards, terminals):
            q_targets = compute_q_targets(post_states, rewards, terminals)

            q_acted = reduce_sum(self._action_value_net(pre_states) * actions, axis=0)

            return huber_loss(q_targets, q_acted, 1.0)

        lr_schedule = learning_rate_schedule(learning_rate, UnitType.minibatch)
        m_schedule = momentum_schedule(momentum)
        vm_schedule = momentum_schedule(0.999)
        l_sgd = adam(self._action_value_net.parameters, lr_schedule,
                     momentum=m_schedule, variance_momentum=vm_schedule)

        self._metrics_writer = TensorBoardProgressWriter(freq=1, log_dir='metrics', model=criterion) if monitor else None
        self._learner = l_sgd
        self._trainer = Trainer(criterion, (criterion, None), l_sgd, self._metrics_writer)

    def act(self, state):
        self._history.append(state)

        if self._explorer.is_exploring(self._num_actions_taken):
            action = self._explorer(self.nb_actions)
        else:
            env_with_history = self._history.value
            q_values = self._action_value_net.eval(
                env_with_history.reshape((1,) + env_with_history.shape)
            )

            self._episode_q_means.append(np.mean(q_values))
            self._episode_q_stddev.append(np.std(q_values))
            action = q_values.argmax()

        self._num_actions_taken += 1
        return action

    def observe(self, old_state, action, reward, done):
        self._episode_rewards.append(reward)

        if done:
            if self._metrics_writer is not None:
                self._plot_metrics()
            self._episode_rewards, self._episode_q_means, self._episode_q_stddev = [], [], []

            self._history.reset()

        self._memory.append(old_state, action, reward, done)

    def train(self):

        agent_step = self._num_actions_taken

        if agent_step >= self._train_after:
            if (agent_step % self._train_interval) == 0:
                pre_states, actions, post_states, rewards, terminals = self._memory.minibatch(self._minibatch_size)

                self._trainer.train_minibatch(
                    self._trainer.loss_function.argument_map(
                        pre_states=pre_states,
                        actions=Value.one_hot(actions.reshape(-1,1).tolist(), self.nb_actions),
                        post_states=post_states,
                        rewards=rewards,
                        terminals=terminals
                    )
                )

            if (agent_step % self._target_update_interval) == 0:
                self._target_net = self._action_value_net.clone(CloneMethod.freeze)
                filename = "model\model%d" % agent_step # save ???? not good at using %d
                self._trainer.save_checkpoint(filename)

    def _plot_metrics(self):

        if len(self._episode_q_means) > 0:
            mean_q = np.asscalar(np.mean(self._episode_q_means))
            self._metrics_writer.write_value('Mean Q per ep.', mean_q, self._num_actions_taken)

        if len(self._episode_q_stddev) > 0:
            std_q = np.asscalar(np.mean(self._episode_q_stddev))
            self._metrics_writer.write_value('Mean Std Q per ep.', std_q, self._num_actions_taken)

        self._metrics_writer.write_value('Sum rewards per ep', sum(self._episode_rewards), self._num_actions_taken)
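
In train(), the integer actions drawn from replay memory are converted to one-hot rows before being passed to the criterion, because q_acted multiplies the network's Q-value vector by that encoding and sums over the action axis. A numpy sketch of the encoding that Value.one_hot produces on the CNTK side (the action values below are illustrative):

import numpy as np

nb_actions = 4
actions = np.array([2, 0, 3])   # integer actions sampled from the replay memory

one_hot = np.zeros((len(actions), nb_actions), dtype=np.float32)
one_hot[np.arange(len(actions)), actions] = 1.0
print(one_hot)
# [[0. 0. 1. 0.]
#  [1. 0. 0. 0.]
#  [0. 0. 0. 1.]]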
Exemplo n.º 26
0
def simple_mnist():
    input_dim = 784
    num_output_classes = 10
    num_hidden_layers = 2
    hidden_layers_dim = 200

    # Input variables denoting the features and label data
    feature = C.input_variable(input_dim)
    label = C.input_variable(num_output_classes)

    # Instantiate the feedforward classification model
    scaled_input = element_times(constant(0.00390625), feature)

    # z = Sequential([
    #     Dense(hidden_layers_dim, activation=relu),
    #     Dense(hidden_layers_dim, activation=relu),
    #     Dense(num_output_classes)])(scaled_input)

    with default_options(activation=relu, init=C.glorot_uniform()):
        z = Sequential([For(range(num_hidden_layers),
            lambda i: Dense(hidden_layers_dim)),
            Dense(num_output_classes, activation=None)])(scaled_input)

    ce = cross_entropy_with_softmax(z, label)
    pe = classification_error(z, label)

    # setup the data
    path = abs_path + "\Train-28x28_cntk_text.txt"

    reader_train = MinibatchSource(CTFDeserializer(path, StreamDefs(
        features=StreamDef(field='features', shape=input_dim),
        labels=StreamDef(field='labels', shape=num_output_classes))))

    input_map = {
        feature: reader_train.streams.features,
        label: reader_train.streams.labels
    }

    # Training config
    minibatch_size = 64
    num_samples_per_sweep = 60000
    num_sweeps_to_train_with = 10

    # Instantiate progress writers.
    progress_writers = [ProgressPrinter(
        tag='Training',
        num_epochs=num_sweeps_to_train_with)]

    # Instantiate the trainer object to drive the model training
    lr = learning_rate_schedule(1, UnitType.sample)
    trainer = Trainer(z, (ce, pe), [adadelta(z.parameters, lr)], progress_writers)

    training_session(
        trainer=trainer,
        mb_source=reader_train,
        mb_size=minibatch_size,
        model_inputs_to_streams=input_map,
        max_samples=num_samples_per_sweep * num_sweeps_to_train_with,
        progress_frequency=num_samples_per_sweep
    ).train()

    # Load test data
    path = abs_path + "\Test-28x28_cntk_text.txt"

    reader_test = MinibatchSource(CTFDeserializer(path, StreamDefs(
        features=StreamDef(field='features', shape=input_dim),
        labels=StreamDef(field='labels', shape=num_output_classes))))

    input_map = {
        feature: reader_test.streams.features,
        label: reader_test.streams.labels
    }

    # Test data for trained model
    test_minibatch_size = 1024
    num_samples = 10000
    num_minibatches_to_test = num_samples / test_minibatch_size
    test_result = 0.0
    for i in range(0, int(num_minibatches_to_test)):
        mb = reader_test.next_minibatch(test_minibatch_size, input_map=input_map)
        eval_error = trainer.test_minibatch(mb)
        test_result = test_result + eval_error

    # Average of evaluation errors of all test minibatches
    return test_result / num_minibatches_to_test
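
The constant 0.00390625 used to scale the input above is exactly 1/256, so raw 0-255 pixel intensities are mapped into [0, 1) before reaching the first Dense layer; a quick check:

print(1 / 256)            # 0.00390625
print(255 * 0.00390625)   # 0.99609375 -> pixel values land in [0, 1)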
Exemplo n.º 27
0
class LearningAgent(object):
    def __init__(self,
                 state_dim,
                 action_dim,
                 gamma=0.99,
                 learning_rate=1e-4,
                 momentum=0.95):
        self.state_dim = state_dim
        self.action_dim = action_dim
        self.gamma = gamma

        with default_options(activation=relu, init=he_uniform()):
            # Convolution filter counts were halved to save on memory, no gpu :(
            self.model = Sequential([
                Convolution2D((8, 8), 16, strides=4, name='conv1'),
                Convolution2D((4, 4), 32, strides=2, name='conv2'),
                Convolution2D((3, 3), 32, strides=1, name='conv3'),
                Dense(256, init=he_uniform(scale=0.01), name='dense1'),
                Dense(action_dim,
                      activation=None,
                      init=he_uniform(scale=0.01),
                      name='actions')
            ])
            self.model.update_signature(Tensor[state_dim])

        # Create the target model as a copy of the online model
        self.target_model = None
        self.update_target()

        self.pre_states = input_variable(state_dim, name='pre_states')
        self.actions = input_variable(action_dim, name='actions')
        self.post_states = input_variable(state_dim, name='post_states')
        self.rewards = input_variable((), name='rewards')
        self.terminals = input_variable((), name='terminals')
        self.is_weights = input_variable((), name='is_weights')

        predicted_q = reduce_sum(self.model(self.pre_states) * self.actions,
                                 axis=0)

        # DQN - calculate target q values
        # post_q = reduce_max(self.target_model(self.post_states), axis=0)

        # DDQN - calculate target q values
        online_selection = one_hot(
            argmax(self.model(self.post_states), axis=0), self.action_dim)
        post_q = reduce_sum(self.target_model(self.post_states) *
                            online_selection,
                            axis=0)

        post_q = (1.0 - self.terminals) * post_q
        target_q = stop_gradient(self.rewards + self.gamma * post_q)

        # Huber loss
        delta = 1.0
        self.td_error = minus(predicted_q, target_q, name='td_error')
        abs_error = abs(self.td_error)
        errors = element_select(less(abs_error, delta),
                                square(self.td_error) * 0.5,
                                delta * (abs_error - 0.5 * delta))
        loss = errors * self.is_weights

        # Adam based SGD
        lr_schedule = learning_rate_schedule(learning_rate, UnitType.minibatch)
        m_schedule = momentum_schedule(momentum)
        vm_schedule = momentum_schedule(0.999)

        self._learner = adam(self.model.parameters,
                             lr_schedule,
                             m_schedule,
                             variance_momentum=vm_schedule)
        self.writer = TensorBoardProgressWriter(log_dir='metrics',
                                                model=self.model)
        self.trainer = Trainer(self.model, (loss, None), [self._learner],
                               self.writer)

    def act(self, state, epsilon):
        """
        Selects an action to take based on the epsilon greedy method
        :param state: The current state
        :param epsilon: Determines the amount of exploration. (1 - full exploration, 0 - no exploration)
        """
        if np.random.rand() < epsilon:  # uniform draw in [0, 1) for epsilon-greedy
            # Explore (random action)
            return np.random.choice(self.action_dim)
        else:
            # Exploit (greedy action based on knowledge)
            return self.model.eval(state).argmax()

    def train(self, s, a, r, s_, t, w):
        """
        Updates the network weights using the given minibatch data
        :param s: Tensor[state_dim] Current state
        :param a: Tensor[int] Action taken at state s
        :param r: Tensor[float] Reward received for taking action a at state s
        :param s_: Tensor[state_dim] State resulting from taking action a at state s
        :param t: Tensor[boolean] True if s_ was a terminal state and false otherwise
        :param w: Tensor[float] Importance sampling weights
        """
        a = Value.one_hot(a.tolist(), self.action_dim)
        td_error = self.trainer.train_minibatch(
            {
                self.pre_states: s,
                self.actions: a,
                self.rewards: r,
                self.post_states: s_,
                self.terminals: t,
                self.is_weights: w
            },
            outputs=[self.td_error])
        return td_error[0]

    def update_target(self):
        """
        Update the target network using the online network weights
        """
        self.target_model = self.model.clone(CloneMethod.freeze)

    def checkpoint(self, filename):
        self.trainer.save_checkpoint(filename)

    def save_model(self, filename):
        self.model.save(filename)
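
The DDQN block above decouples action selection from action evaluation: the online model picks the argmax action for each post-state, while the frozen target model supplies that action's value. A numpy sketch of the same computation (q_online and q_target stand in for the two networks' outputs on a small batch; all numbers are illustrative):

import numpy as np

gamma = 0.99
q_online = np.array([[1.0, 2.0, 0.5],   # online net Q(s', .)
                     [0.3, 0.1, 0.2]])
q_target = np.array([[0.9, 1.5, 0.7],   # frozen target net Q(s', .)
                     [0.4, 0.2, 0.6]])
rewards   = np.array([1.0, 0.0])
terminals = np.array([0.0, 1.0])

best = q_online.argmax(axis=1)                  # selection by the online network
post_q = q_target[np.arange(len(best)), best]   # evaluation by the target network
target_q = rewards + gamma * (1.0 - terminals) * post_q
print(target_q)  # [2.485, 0.0]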