Example #1
def train(seed, save_dir):
    set_global_seeds(seed)
    save_dir_0 = os.path.join(save_dir, 'seed_%d' % seed)
    os.makedirs(save_dir_0)

    env = envs.make(args.env, 'classic_control')
    with tf.device(args.device):
        with tf.compat.v1.variable_scope('seed_%d' % seed):
            model = models.mlp([args.num_units] * args.num_layers,
                               init_mean=args.init_mean,
                               init_sd=args.init_sd)
            act = deepadfq.learn(
                env,
                q_func=model,
                lr=args.learning_rate,
                lr_decay_factor=args.learning_rate_decay_factor,
                lr_growth_factor=args.learning_rate_growth_factor,
                max_timesteps=args.nb_train_steps,
                buffer_size=args.buffer_size,
                batch_size=args.batch_size,
                exploration_fraction=args.eps_fraction,
                exploration_final_eps=args.eps_min,
                target_network_update_freq=args.target_update_freq,
                print_freq=args.nb_epoch_steps,
                checkpoint_freq=int(args.nb_train_steps / 5),
                learning_starts=args.nb_warmup_steps,
                gamma=args.gamma,
                prioritized_replay=bool(args.prioritized),
                prioritized_replay_alpha=args.prioritized_replay_alpha,
                callback=None,  #callback,
                alg=args.alg,
                scope=args.scope,
                sdMin=np.sqrt(args.varth),
                noise=args.noise,
                act_policy=args.act_policy,
                epoch_steps=args.nb_epoch_steps,
                eval_logger=Logger(args.env,
                                   'classic_control',
                                   save_dir=save_dir_0,
                                   render=bool(args.render)),
                save_dir=save_dir_0,
                test_eps=args.test_eps,
                gpu_memory=args.gpu_memory,
                render=bool(args.render),
            )
    if args.record == 1:
        env.moviewriter.finish()
Example #2
def main():
    env = envs.create_env(None)
    model = models.mlp([64])
    act = simple.learn(
        env,
        q_func=model,
        lr=1e-3,
        max_timesteps=100000,
        buffer_size=50000,
        exploration_fraction=0.01,
        exploration_final_eps=0.0,
        print_freq=10,
        callback=callback,
        prioritized_replay=True
    )
    print("Saving model to {}_model.pkl".format(envs.VSTR))
    act.save("{}_model.pkl".format(envs.VSTR))
Example #3
def main():

    env = gym.make("CartPole-v0")
    #env = gym.make("MountainCar-v0")
    model = models.mlp([256, 20])
    act = learn(env,
                q_func=model,
                lr=1e-2,
                max_timesteps=100000,
                buffer_size=90000,
                exploration_fraction=0.1,
                exploration_final_eps=0.1,
                print_freq=25,
                checkpoint_path='model_chkpoints/cart_model',
                callback=callback,
                param_noise=True)
    print("Saving model to cartpole_model.pkl")
    act.save("cartpole_model.pkl")
Example #4
def train(hparams):
    #wandb.init(project="ebm-gaussians")

    seed_everything(hparams.seed)
    model = mlp(sizes=[2, 100, 100, 1], activation=nn.ReLU)
    optimizer = Adam(model.parameters(), lr=hparams.lr)

    # load dataset
    N_train = 5000

    X_train = sample_data(N_train)

    train_dl = DataLoader(X_train, batch_size=100, shuffle=True, num_workers=8)
    losses = []

    for _ in range(hparams.n_epochs):
        for x in train_dl:

            neg_x = torch.randn_like(x)
            neg_x = sample_langevin(neg_x, model, hparams.stepsize,
                                    hparams.n_steps)

            optimizer.zero_grad()

            pos_out = model(x)
            neg_out = model(neg_x)

            loss = (pos_out -
                    neg_out) + hparams.alpha * (pos_out**2 + neg_out**2)

            loss = loss.mean()
            loss.backward()

            #torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=0.1)
            optimizer.step()

            losses.append(loss.item())
            # wandb.log({'loss': loss.item()})

    print('saving a trained model')
    torch.save(model, hparams.model_path)
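The training loop above relies on a `sample_langevin` helper that is not shown in the snippet. A minimal sketch of such a sampler is given below; the function name, the noise scale, and the sign convention (the model output is treated as an energy that the negative samples descend) are assumptions, not the original implementation.

import torch

def sample_langevin_sketch(x, model, stepsize, n_steps):
    # Noise scale taken as sqrt(2 * stepsize); an assumed choice, not from the snippet above.
    noise_scale = (2 * stepsize) ** 0.5
    x = x.clone().detach().requires_grad_(True)
    for _ in range(n_steps):
        energy = model(x).sum()
        grad, = torch.autograd.grad(energy, x)
        # Step toward lower energy, plus Gaussian exploration noise.
        x = (x - stepsize * grad + noise_scale * torch.randn_like(x)).detach().requires_grad_(True)
    return x.detach()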
Example #5
def main():
    # Prepare data
    x_list, y_list = generate_training_data_lists()
    steps = len(x_list) // batch_size
    train_sequence = TrainSequence(x_list, y_list, batch_size)

    # Prepare model
    model = mlp(10000)
    model.compile(loss='binary_crossentropy', optimizer='sgd', metrics=['acc'])

    # Plot model    
    plot_model(model, to_file='./{}.png'.format(model_name))

    # Fit model
    history = model.fit_generator(train_sequence, epochs=epochs, steps_per_epoch=steps, verbose=1).history

    # Plot loss vs accuracy
    plot_performance(history, model_name, epochs, batch_size)

    # Write loss and acc to file
    dump_pickle('./history.pkl', history)

    # Save model
    model.save_weights('./{}_weights.h5'.format(model_name))
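`TrainSequence` is not defined in this snippet; it is presumably a Keras `Sequence` that yields `(x_batch, y_batch)` pairs. A minimal sketch of such a class is shown below; the class body is an assumption (the real implementation likely loads and preprocesses files referenced by `x_list`), and with `tf.keras` the import would be `tensorflow.keras.utils.Sequence`.

import numpy as np
from keras.utils import Sequence

class TrainSequenceSketch(Sequence):
    def __init__(self, x_list, y_list, batch_size):
        self.x_list, self.y_list, self.batch_size = x_list, y_list, batch_size

    def __len__(self):
        # Number of whole batches per epoch.
        return len(self.x_list) // self.batch_size

    def __getitem__(self, idx):
        batch = slice(idx * self.batch_size, (idx + 1) * self.batch_size)
        return np.asarray(self.x_list[batch]), np.asarray(self.y_list[batch])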
Example #6
def main():
    run = True
    state = 2
    env_name = 'HumanoidFlagrunBulletEnv-v0'
    if state == 0:
        env = atari_wrappers.wrap_deepmind(atari_wrappers.make_atari(env_name),
                                           episode_life=True,
                                           clip_rewards=True,
                                           frame_stack=True,
                                           scale=True)
    else:
        env = gym.make(env_name)
    if isinstance(env.action_space, Box):
        output_size = env.action_space.shape[0]
    else:
        output_size = env.action_space.n

    with tf.Session() as sess:
        name = 'flag_rnd3'
        with tf.variable_scope(name):
            input = tf.placeholder(tf.float32,
                                   [None, *env.observation_space.shape])
            state_rms = RunningMeanStd(sess, shape=env.observation_space.shape)
            norm_input = tf.clip_by_value(
                (input - state_rms._mean) / tf.sqrt(state_rms._var), -5, 5)

            if state == 0:
                with tf.variable_scope('policy'):
                    network = models.nature_cnn(input)
                norm_input = norm_input[:, :, :, 0]
                with tf.variable_scope('target'):
                    target_net = models.add_dense(
                        models.nature_cnn(norm_input), 256, name='dense1')
                with tf.variable_scope('predict'):
                    predict_net = models.add_dense(
                        models.nature_cnn(norm_input), 256, name='dense1')
                with tf.variable_scope('value'):
                    value_net = models.nature_cnn(input)
                with tf.variable_scope('value_in'):
                    value_in_net = models.nature_cnn(input)
                model = RND(sess, input, state_rms, network, actiontype.Discrete, output_size, target_net, predict_net, value_in_net,\
                     value_network=value_net, gamma=0.999, learning_rate=lambda f : 0.0001, epochs=4, minibatch_size=4, beta2=0.01, name=name)
            else:
                if state == 1:
                    with tf.variable_scope('policy'):
                        network, seq_len, init_state, last_state = models.lstm(
                            models.mlp(input), 64)
                    with tf.variable_scope('target'):
                        target_net = models.add_dense(models.mlp(norm_input),
                                                      256,
                                                      name='dense2')
                    with tf.variable_scope('predict'):
                        predict_net = models.add_dense(models.mlp(norm_input),
                                                       256,
                                                       name='dense2')
                    with tf.variable_scope('value_in'):
                        value_in_net = models.mlp(input)
                    model = RND(sess, input, state_rms, network, actiontype.Discrete, output_size, target_net, predict_net, value_in_net, epochs=4, minibatch_size=8, gamma=0.99, beta2=0.01, epsilon=0.1,\
                        coef_in=1., learning_rate=lambda f : 2.5e-4*(1-f), name=name, )
                elif state == 2:
                    with tf.variable_scope('policy'):
                        network = models.mlp(norm_input)
                    with tf.variable_scope('target'):
                        target_net = models.add_dense(models.mlp(norm_input),
                                                      256,
                                                      name='dense2')
                    with tf.variable_scope('predict'):
                        predict_net = models.add_dense(models.mlp(norm_input),
                                                       256,
                                                       name='dense2')
                    with tf.variable_scope('value'):
                        value_net = models.mlp(norm_input)
                    with tf.variable_scope('value_in'):
                        value_in_net = models.mlp(norm_input)
                    model = RND(sess, input, state_rms, network, actiontype.Continuous, output_size, target_net, predict_net, value_in_net, value_network=value_net, epochs=10, minibatch_size=32, gamma=0.99, beta2=0.000, epsilon=0.2, \
                        coef_in=.5, learning_rate=lambda f : 3e-4*(1-f), name=name)
        if run:
            run_only(sess, model, env, render=True)
        else:
            if state == 0:
                train(sess,
                      model,
                      env_name,
                      10000000,
                      256,
                      num_envs=16,
                      atari=True)
            elif state == 1:
                train(sess, model, env_name, 5e6, 128, num_envs=16)
            elif state == 2:
                train(sess,
                      model,
                      env_name,
                      100e6,
                      2048,
                      num_envs=24,
                      log_interval=5)
        env.close()
Example #7
def train():
    set_global_seeds(args.seed)
    directory = os.path.join(
        args.log_dir,
        '_'.join([args.env,
                  datetime.datetime.now().strftime("%m%d%H%M")]))
    if not os.path.exists(directory):
        os.makedirs(directory)
    else:
        raise ValueError("The directory already exists...", directory)
    json.dump(vars(args),
              open(os.path.join(directory, 'learning_prop.json'), 'w'))

    env = envs.make(
        args.env,
        render=bool(args.render),
        record=bool(args.record),
        ros=bool(args.ros),
        dirname=directory,
        map_name=args.map,
        num_targets=args.nb_targets,
        im_size=args.im_size,
    )
    hiddens = args.hiddens.split(':')
    hiddens = [int(h) for h in hiddens]
    with tf.device(args.device):
        if args.env == 'TargetTracking-v5':
            import simple_imtracking as simple
            model = models.cnn_plus_mlp(
                convs=[(8, 4, 2), (16, 3, 1)],
                hiddens=hiddens,
                dueling=bool(args.dueling),
                init_mean=args.init_mean,
                init_sd=args.init_sd,
            )
        else:
            import simple_tracking as simple
            model = models.mlp(hiddens,
                               init_mean=args.init_mean,
                               init_sd=args.init_sd)

        act, records = simple.learn(
            env,
            q_func=model,
            lr=args.learning_rate,
            lr_decay_factor=args.learning_rate_decay_factor,
            lr_growth_factor=args.learning_rate_growth_factor,
            max_timesteps=args.nb_train_steps,
            buffer_size=args.buffer_size,
            batch_size=args.batch_size,
            exploration_fraction=args.eps_fraction,
            exploration_final_eps=args.eps_min,
            target_network_update_freq=args.target_update_freq,
            print_freq=10,
            checkpoint_freq=int(args.nb_train_steps / 10),
            learning_starts=args.nb_warmup_steps,
            gamma=args.gamma,
            prioritized_replay=bool(args.prioritized),
            prioritized_replay_alpha=args.prioritized_replay_alpha,
            callback=None,  #callback,
            epoch_steps=args.nb_epoch_steps,
            noise=args.noise,
            varTH=args.varth,
            alg=args.alg,
            gpu_memory=args.gpu_memory,
            act_policy=args.act_policy,
            save_dir=directory,
            nb_test_steps=args.nb_test_steps,
            scope=args.scope,
            test_eps=args.test_eps,
            render=(bool(args.render) or bool(args.ros)),
            map_name=args.map,
            num_targets=args.nb_targets,
            im_size=args.im_size,
        )
        print("Saving model to model.pkl")
        act.save(os.path.join(directory, "model.pkl"))
        plot(records, directory)
    memo = input("Memo for this experiment?: ")
    f = open(os.path.join(directory, "memo.txt"), 'w')
    f.write(memo)
    f.close()
    if args.record == 1:
        env.moviewriter.finish()
Example #8
                                     ccp_alpha=0.005,
                                     min_samples_split=2)
    if option == "Perceptron":
        model = models.single_layer_perceptron(X,
                                               Y,
                                               labels,
                                               dataset_name,
                                               eta0=0.1,
                                               random_state=0,
                                               max_iter=100)
    if option == "MLP":
        model = models.mlp(X,
                           Y,
                           labels,
                           dataset_name,
                           random_state=0,
                           learning_rate=0.05,
                           activation='logistic',
                           hidden_layer_sizes=(6, ),
                           max_iter=500)
    if option == "XGBoost":
        model = models.xgboost_model(X, Y, labels, dataset_name)
    else:
        pass

menu = st.sidebar.checkbox("About Info")
if menu:
    st.write(
        "Supervised ML for Airbnb dataset. Using Streamlit for visualisation and applying Naive Bayes, Decision Tree, Single and Multi-layer Perceptron, XGBoost"
    )
    st.write(
Example #9
    train_losses, test_losses = [], []
    for iteration in range(args.batch_size * args.num_batches + 1):
        if iteration % args.batch_size == 0:
            params = get_params(opt_state)
            train_loss = loss(params, (data['x'], data['dx']))
            train_losses.append(train_loss)
            test_loss = loss(params, (data['test_x'], data['test_dx']))
            test_losses.append(test_loss)
            if iteration % (args.batch_size * args.test_every) == 0:
                print(
                    f"iteration={iteration}, train_loss={train_loss:.6f}, test_loss={test_loss:.6f}"
                )
        opt_state = update_derivative(iteration, opt_state,
                                      (data['x'], data['dx']))

    params = get_params(opt_state)
    return params, train_losses, test_losses


if __name__ == "__main__":
    args = ObjectView(get_args())
    dblpend.get_dataset(t_span=[0, args.dataset_size], fps=1, samples=1)

    mlp = lagrangian_nns.mlp
    rng = jax.random.PRNGKey(args.seed)
    init_random_params, nn_forward_fn = mlp(args)
    _, init_params = init_random_params(rng, (-1, 4))
    model = (nn_forward_fn, init_params)
    data = dblpend.get_dataset(t_span=[0, args.dataset_size], fps=1, samples=1)

    result = train(args, model, data)
Example #10
def train_VGG_classifier(use_validation=False, use_val_for_training = False, num_features=4096,
          learning_rate=0.0001, epochs=3000, threshold=0.5, exp='', batch_norm=True,
          mini_batch_size=64, save_plots=True, save_features=False, classification_method='MLP',
          val_size=10, weight_0=1, dataset_name='', features_file='', labels_file=''):
    # ========================================================================
    # FETCH FEATURE EXTRACTOR
    # ========================================================================
    model = VGG16(num_features)

    # ========================================================================
    # WEIGHT INITIALIZATION
    # ========================================================================
    layerscaffe = ['conv1_1', 'conv1_2', 'conv2_1', 'conv2_2', 'conv3_1',
                   'conv3_2', 'conv3_3', 'conv4_1', 'conv4_2', 'conv4_3',
                   'conv5_1', 'conv5_2', 'conv5_3', 'fc6', 'fc7', 'fc8']
    h5 = h5py.File(vgg_16_weights, 'r')

    layer_dict = dict([(layer.name, layer) for layer in model.layers])

    # Copy the weights stored in the 'vgg_16_weights' file to the
    # feature extractor part of the VGG16
    for layer in layerscaffe[:-3]:
        w2, b2 = h5['data'][layer]['0'], h5['data'][layer]['1']
        w2 = np.transpose(np.asarray(w2), (2,3,1,0))
        w2 = w2[::-1, ::-1, :, :]
        b2 = np.asarray(b2)
        layer_dict[layer].set_weights((w2, b2))

    # Copy the weights of the first fully-connected layer (fc6)
    layer = layerscaffe[-3]
    w2, b2 = h5['data'][layer]['0'], h5['data'][layer]['1']
    w2 = np.transpose(np.asarray(w2), (1,0))
    b2 = np.asarray(b2)
    layer_dict[layer].set_weights((w2, b2))

    # ========================================================================
    # FEATURE EXTRACTION
    # ========================================================================
    if save_features:
        saveFeatures(model, features_file, labels_file, features_key, labels_key, num_features)

    # ========================================================================
    # TRAINING
    # ========================================================================  

    adam = Adam(lr=learning_rate, beta_1=0.9, beta_2=0.999, epsilon=1e-08)
    model.compile(optimizer=adam, loss='categorical_crossentropy', metrics=['accuracy'])
  
    h5features = h5py.File(features_file, 'r')
    h5labels = h5py.File(labels_file, 'r')
    
    # X_full will contain all the feature vectors extracted
    # from optical flow images
    X_full = h5features[features_key]
    _y_full = np.asarray(h5labels[labels_key])

    zeroes_full = np.asarray(np.where(_y_full==0)[0])
    ones_full = np.asarray(np.where(_y_full==1)[0])
    zeroes_full.sort()
    ones_full.sort()
    
    # Use a 5 fold cross-validation
    kf_falls = KFold(n_splits=5, shuffle=True)
    kf_falls.get_n_splits(X_full[zeroes_full, ...])
    
    kf_nofalls = KFold(n_splits=5, shuffle=True)
    kf_nofalls.get_n_splits(X_full[ones_full, ...])        

    sensitivities = []
    specificities = []
    fars = []
    mdrs = []
    accuracies = []
        
    fold_number = 1
    # CROSS-VALIDATION: Stratified partition of the dataset into
    # train/test sets
    for ((train_index_falls, test_index_falls),
    (train_index_nofalls, test_index_nofalls)) in zip(
        kf_falls.split(X_full[zeroes_full, ...]),
        kf_nofalls.split(X_full[ones_full, ...])
    ):

        train_index_falls = np.asarray(train_index_falls)
        test_index_falls = np.asarray(test_index_falls)
        train_index_nofalls = np.asarray(train_index_nofalls)
        test_index_nofalls = np.asarray(test_index_nofalls)

        X = np.concatenate((
            X_full[zeroes_full, ...][train_index_falls, ...],
            X_full[ones_full, ...][train_index_nofalls, ...]
        ))
        _y = np.concatenate((
            _y_full[zeroes_full, ...][train_index_falls, ...],
            _y_full[ones_full, ...][train_index_nofalls, ...]
        ))
        X_test = np.concatenate((
            X_full[zeroes_full, ...][test_index_falls, ...],
            X_full[ones_full, ...][test_index_nofalls, ...]
        ))
        y_test = np.concatenate((
            _y_full[zeroes_full, ...][test_index_falls, ...],
            _y_full[ones_full, ...][test_index_nofalls, ...]
        ))

        if use_validation:
            # Create a validation subset from the training set
            zeroes = np.asarray(np.where(_y==0)[0])
            ones = np.asarray(np.where(_y==1)[0])
            
            zeroes.sort()
            ones.sort()

            trainval_split_0 = StratifiedShuffleSplit(n_splits=1,
                            test_size=int(val_size/2),
                            random_state=7)
            indices_0 = trainval_split_0.split(X[zeroes,...],
                            np.argmax(_y[zeroes,...], 1))
            trainval_split_1 = StratifiedShuffleSplit(n_splits=1,
                            test_size=int(val_size/2),
                            random_state=7)
            indices_1 = trainval_split_1.split(X[ones,...],
                            np.argmax(_y[ones,...], 1))
            train_indices_0, val_indices_0 = indices_0.__next__()
            train_indices_1, val_indices_1 = indices_1.__next__()

            X_train = np.concatenate([X[zeroes,...][train_indices_0,...],
                        X[ones,...][train_indices_1,...]],axis=0)
            y_train = np.concatenate([_y[zeroes,...][train_indices_0,...],
                        _y[ones,...][train_indices_1,...]],axis=0)
            X_val = np.concatenate([X[zeroes,...][val_indices_0,...],
                        X[ones,...][val_indices_1,...]],axis=0)
            y_val = np.concatenate([_y[zeroes,...][val_indices_0,...],
                        _y[ones,...][val_indices_1,...]],axis=0)
        else:
            X_train = X
            y_train = _y

        # Balance the number of positive and negative samples so that
        # there is the same amount of each of them
        all0 = np.asarray(np.where(y_train==0)[0])
        all1 = np.asarray(np.where(y_train==1)[0])  

        if len(all0) < len(all1):
            all1 = np.random.choice(all1, len(all0), replace=False)
        else:
            all0 = np.random.choice(all0, len(all1), replace=False)
        allin = np.concatenate((all0.flatten(),all1.flatten()))
        allin.sort()
        X_train = X_train[allin,...]
        y_train = y_train[allin]
    
        # ==================== CLASSIFIER ========================
        if classification_method == 'MLP':
            classifier = mlp(num_features, batch_norm)
        else:
            # TODO: handle case where validation is not done
            new_feature_length = int(num_features / 4)
            data = sample_data([X_train, X_test, X_val], new_feature_length)
            X_train = data[0]
            X_train = np.reshape(X_train, (X_train.shape[0], 1, X_train.shape[1]))
            X_test = data[1]
            X_test = np.reshape(X_test, (X_test.shape[0], 1, X_test.shape[1]))
            X_val = data[2]
            X_val = np.reshape(X_val, (X_val.shape[0], 1, X_val.shape[1]))
            classifier = lstm(seq_length=1, feature_length=new_feature_length, nb_classes=1)

        fold_best_model_path = best_model_path + '{}_fold_{}.h5'.format(
            dataset_name,
            fold_number
        )
        classifier.compile(optimizer=adam, loss='binary_crossentropy', metrics=['accuracy'])

        if not use_checkpoint:
            # ==================== TRAINING ========================     
            # weighting of each class: only the fall class gets
            # a different weight
            class_weight = {0: weight_0, 1: 1}

            callbacks = None
            if use_validation:
                # callback definition
                metric = 'val_loss'
                e = EarlyStopping(monitor=metric, min_delta=0, patience=2,
                        mode='auto')
                c = ModelCheckpoint(fold_best_model_path, monitor=metric,
                            save_best_only=True,
                            save_weights_only=False, mode='auto')
                callbacks = [e, c]
            validation_data = None
            if use_validation:
                validation_data = (X_val,y_val)
            _mini_batch_size = mini_batch_size
            if mini_batch_size == 0:
                _mini_batch_size = X_train.shape[0]

            history = classifier.fit(
                X_train, y_train, 
                validation_data=validation_data,
                batch_size=_mini_batch_size,
                epochs=epochs,
                shuffle=True,
                class_weight=class_weight,
                callbacks=callbacks
            )

            #if not use_validation:
            #   classifier.save(fold_best_model_path)

            plot_training_info(plots_folder + exp, ['accuracy', 'loss'],
                    save_plots, history.history)

            if use_validation and use_val_for_training:
                #classifier = load_model(fold_best_model_path)

                # Use full training set (training+validation)
                X_train = np.concatenate((X_train, X_val), axis=0)
                y_train = np.concatenate((y_train, y_val), axis=0)

                history = classifier.fit(
                    X_train, y_train, 
                    validation_data=validation_data,
                    batch_size=_mini_batch_size,
                    epochs=epochs,
                    shuffle='batch',
                    class_weight=class_weight,
                    callbacks=callbacks
                )

                classifier.save(fold_best_model_path)

        # ==================== EVALUATION ========================     
        # TODO: Load model as required
        # Load best model
        #print('Model loaded from checkpoint')
        #classifier = load_model(fold_best_model_path)

        predicted = classifier.predict(np.asarray(X_test))
        for i in range(len(predicted)):
            if predicted[i] < threshold:
                predicted[i] = 0
            else:
                predicted[i] = 1
        # Array of predictions 0/1
        predicted = np.asarray(predicted).astype(int)   
        # Compute metrics and print them
        cm = confusion_matrix(y_test, predicted,labels=[0,1])
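        # Note: with labels=[0, 1], row/column 0 of the confusion matrix corresponds to
        # class 0 (the fall class), so the tp/fn/fp/tn names below treat "fall" as the
        # positive class.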
        tp = cm[0][0]
        fn = cm[0][1]
        fp = cm[1][0]
        tn = cm[1][1]
        tpr = tp/float(tp+fn)
        fpr = fp/float(fp+tn)
        fnr = fn/float(fn+tp)
        tnr = tn/float(tn+fp)
        precision = tp/float(tp+fp)
        recall = tp/float(tp+fn)
        specificity = tn/float(tn+fp)
        f1 = 2*float(precision*recall)/float(precision+recall)
        accuracy = accuracy_score(y_test, predicted)
        
        print('FOLD {} results:'.format(fold_number))
        print('TP: {}, TN: {}, FP: {}, FN: {}'.format(tp,tn,fp,fn))
        print('TPR: {}, TNR: {}, FPR: {}, FNR: {}'.format(
                        tpr,tnr,fpr,fnr))   
        print('Sensitivity/Recall: {}'.format(recall))
        print('Specificity: {}'.format(specificity))
        print('Precision: {}'.format(precision))
        print('F1-measure: {}'.format(f1))
        print('Accuracy: {}'.format(accuracy))
        
        # Store the metrics for this epoch
        sensitivities.append(tp/float(tp+fn))
        specificities.append(tn/float(tn+fp))
        fars.append(fpr)
        mdrs.append(fnr)
        accuracies.append(accuracy)
        fold_number += 1

    print('5-FOLD CROSS-VALIDATION RESULTS ===================')
    print("Sensitivity: %.2f%% (+/- %.2f%%)" % (np.mean(sensitivities)*100.,
                        np.std(sensitivities)*100.))
    print("Specificity: %.2f%% (+/- %.2f%%)" % (np.mean(specificities)*100.,
                        np.std(specificities)*100.))
    print("FAR: %.2f%% (+/- %.2f%%)" % (np.mean(fars)*100.,
                    np.std(fars)*100.))
    print("MDR: %.2f%% (+/- %.2f%%)" % (np.mean(mdrs)*100.,
                    np.std(mdrs)*100.))
    print("Accuracy: %.2f%% (+/- %.2f%%)" % (np.mean(accuracies)*100.,
                        np.std(accuracies)*100.))
Example #11
def main():

    env = envstandalone.BallCatch()

    max_timesteps = 40000
    learning_starts = 1000
    buffer_size = 50000
    #    buffer_size=1
    exploration_fraction = 0.2
    exploration_final_eps = 0.02
    print_freq = 10
    gamma = .98
    target_network_update_freq = 500
    learning_alpha = 0.2

    batch_size = 32
    train_freq = 1

    obsShape = (8, 8, 1)
    #    deicticShape = (3,3,1)
    #    deicticShape = (3,3,2)
    #    deicticShape = (4,4,1)
    #    deicticShape = (4,4,2)
    deicticShape = (4, 4, 3)
    #    deicticShape = (3,3,4)
    num_deictic_patches = 25

    #    num_actions = 4
    num_actions = 3
    episode_rewards = [0.0]
    num_cpu = 16
    num_cascade = 5

    # Create the schedule for exploration starting from 1.
    exploration = LinearSchedule(schedule_timesteps=int(exploration_fraction *
                                                        max_timesteps),
                                 initial_p=1.0,
                                 final_p=exploration_final_eps)

    # Extract deictic patches for an input obs. Each deictic patch has a low level
    # and a foveated view.
    # input: n x n x 1
    # output: dn x dn x 4
    def getDeicticObs(obs):
        windowLen = deicticShape[0]
        obsShape = np.shape(obs)
        obsPadded = np.zeros(
            (obsShape[0] + 2 * windowLen, obsShape[1] + 2 * windowLen))
        obsPadded[windowLen:windowLen + obsShape[0],
                  windowLen:windowLen + obsShape[1]] = obs[:, :, 0]
        deicticObsThis = np.zeros(
            (windowLen, windowLen, 4)
        )  # channel1: zoomin window; channel2: agent in zoomout window; channel3: ball in zoomout window
        deicticObs = []
        for i in range(obsShape[0] - windowLen + 1):
            for j in range(obsShape[1] - windowLen + 1):
                deicticObsThis[:, :, 0] = obs[i:i + windowLen, j:j + windowLen,
                                              0] == 1  # agent zoomin
                deicticObsThis[:, :, 1] = obs[i:i + windowLen, j:j + windowLen,
                                              0] == 2  # ball zoomin
                patch = obsPadded[i:i + 3 * windowLen, j:j + 3 * windowLen]
                for k in range(1, 3):
                    # THE VERSION BELOW USES A FIXED VIEW
                    #                    deicticObsThis[:,:,k+1] = [[(k in obs[0:3,0:3,0]), (k in obs[0:3,3:5]), (k in obs[0:3,5:8,0])],
                    #                                 [(k in obs[3:5,0:3,0]), (k in obs[3:5,3:5,0]), (k in obs[3:5,5:8,0])],
                    #                                 [(k in obs[5:8,0:3,0]), (k in obs[5:8,3:5,0]), (k in obs[5:8,5:8,0])]]
                    # THE VERSION BELOW USES A WIDE VIEW W/ 2 UNITS IN EACH CELL
                    #                    deicticObsThis[:,:,k+1] = [[(k in patch[1:3,1:3]), (k in patch[1:3,3:5]), (k in patch[1:3,5:7])],
                    #                                 [(k in patch[3:5,1:3]), (k in patch[3:5,3:5]), (k in patch[3:5,5:7])],
                    #                                 [(k in patch[5:7,1:3]), (k in patch[5:7,3:5]), (k in patch[5:7,5:7])]]
                    # THE VERSION BELOW USES A WIDE VIEW W/ 3 UNITS IN EACH CELL
                    deicticObsThis[:, :, k + 1] = [[(k in patch[0:3, 0:3]),
                                                    (k in patch[0:3, 3:6]),
                                                    (k in patch[0:3, 6:9])],
                                                   [(k in patch[3:6, 0:3]),
                                                    (k in patch[3:6, 3:6]),
                                                    (k in patch[3:6, 6:9])],
                                                   [(k in patch[6:9, 0:3]),
                                                    (k in patch[6:9, 3:6]),
                                                    (k in patch[6:9, 6:9])]]
                deicticObs.append(
                    deicticObsThis.copy()
                )  # CAREFUL WITH APPENDING REFERENCES VS APPENDING COPIES!!! THIS WAS A BUG BEFORE I CORRECTED IT...

        return np.array(deicticObs)

    # Same as getDeicticObs, but it operates on a batch rather than a single obs
    # input: obs -> batches x glances x 3 x 3 x 4
    def getDeicticObsBatch(obs):
        obsShape = np.shape(obs)
        deicticObsBatch = []
        for batch in range(obsShape[0]):
            deicticObsBatch.append(getDeicticObs(obs[batch]))
        shape = np.shape(deicticObsBatch)
        return (np.reshape(
            np.array(deicticObsBatch),
            [shape[0] * shape[1], shape[2], shape[3], shape[4]]))

    # CNN version
    # conv model parameters: (num_outputs, kernel_size, stride)
#    model = models.cnn_to_mlp(
#        convs=[(16,4,1)],
#        hiddens=[16],
#        dueling=True
#    )

# MLP version
    model = models.mlp([16, 32])

    q_func = model
    lr = 0.001

    def make_obs_ph(name):
        return U.BatchInput(obsShape, name=name)

    def make_obsDeic_ph(name):

        # CNN version
        #        return U.BatchInput(deicticShape, name=name)

        # MLP version
        return U.BatchInput(
            [deicticShape[0] * deicticShape[1] * deicticShape[2]], name=name)

    def make_target_ph(name):
        #        return U.BatchInput([num_actions], name=name)
        return U.BatchInput([num_cascade, num_actions], name=name)

    sess = U.make_session(num_cpu)
    sess.__enter__()

    getq = build_getq(make_obsDeic_ph=make_obsDeic_ph,
                      q_func=q_func,
                      num_actions=num_actions,
                      num_cascade=num_cascade)

    targetTrain = build_targetTrain(
        make_obsDeic_ph=make_obsDeic_ph,
        make_target_ph=make_target_ph,
        q_func=q_func,
        num_actions=env.action_space.n,
        num_cascade=num_cascade,
        optimizer=tf.train.AdamOptimizer(learning_rate=lr))

    getDeic = build_getDeic(make_obs_ph=make_obs_ph, deicticShape=deicticShape)

    # Initialize the parameters and copy them to the target network.
    U.initialize()

    replay_buffer = ReplayBuffer(buffer_size)
    obs = env.reset()

    timerStart = time.time()
    for t in range(max_timesteps):

        #        obsDeictic = getDeicticObs(obs)
        obsDeictic = getDeic([obs])
        #        obsDeictic, patchesTiledStacked2 = getDeic([obs])

        #        # CNN version
        #        qCurr = getq(np.array(obsDeictic))

        # MLP version
        qCurr = getq(
            np.reshape(
                obsDeictic,
                [-1, deicticShape[0] * deicticShape[1] * deicticShape[2]]))

        # select action
        qCurrNoise = qCurr + np.random.random(np.shape(
            qCurr)) * 0.01  # add small amount of noise to break ties randomly
        action = np.argmax(np.max(qCurrNoise[:, -1, :], 0))
        selPatch = np.argmax(np.max(qCurrNoise[:, -1, :], 1))
        if np.random.rand() < exploration.value(t):
            action = np.random.randint(env.action_space.n)

        # take action
        new_obs, rew, done, _ = env.step(action)
        replay_buffer.add(obs, action, rew, new_obs, float(done))

        # sample from replay buffer and train
        if t > learning_starts and t % train_freq == 0:

            # Sample from replay buffer
            obses_t, actions, rewards, obses_tp1, dones = replay_buffer.sample(
                batch_size)

            # Put observations in deictic form
            obses_t_deic = getDeic(obses_t)
            obses_tp1_deic = getDeic(obses_tp1)
            #            obses_t_deic = getDeicticObsBatch(obses_t)
            #            obses_tp1_deic = getDeicticObsBatch(obses_tp1)

            # Reshape everything to (1152,) form
            donesTiled = np.repeat(dones, num_deictic_patches)
            rewardsTiled = np.repeat(rewards, num_deictic_patches)
            actionsTiled = np.repeat(actions, num_deictic_patches)

            #            # Get curr, next values: CNN version
            #            qNext = getq(obses_tp1_deic)
            #            qCurr = getq(obses_t_deic)

            # Get curr, next values: MLP version
            qNext = getq(
                np.reshape(
                    obses_tp1_deic,
                    [-1, deicticShape[0] * deicticShape[1] * deicticShape[2]]))
            qCurr = getq(
                np.reshape(
                    obses_t_deic,
                    [-1, deicticShape[0] * deicticShape[1] * deicticShape[2]]))

            # This version pairs a glimpse with the same glimpse on the next time step
            qNextmax = np.max(qNext[:, -1, :], 1)

            #            # This version takes the max over all glimpses
            #            qNextTiled = np.reshape(qNext[:,-1,:],[batch_size,num_deictic_patches,num_actions])
            #            qNextmax = np.repeat(np.max(np.max(qNextTiled,2),1),num_deictic_patches)

            # Compute Bellman estimate
            targets = rewardsTiled + (1 - donesTiled) * gamma * qNextmax

            #            targetsTiled = np.tile(np.reshape(targets,[-1,1]),[1,num_cascade])

            qCurrTargets = np.copy(qCurr)

            #            # Copy into cascade without pruning
            #            for i in range(num_cascade):
            #                qCurrTargets[range(batch_size*num_deictic_patches),i,actionsTiled] = targets

            # Copy into cascade with pruning.
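            # Level 0 always receives the Bellman target for the taken action; each deeper
            # level i+1 is overwritten with the target only where it is smaller than the value
            # stored at level i, and keeps its current estimate elsewhere.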
            qCurrTargets[range(batch_size * num_deictic_patches), 0,
                         actionsTiled] = targets
            for i in range(num_cascade - 1):
                mask = targets < qCurrTargets[range(batch_size *
                                                    num_deictic_patches), i,
                                              actionsTiled]
                qCurrTargets[range(batch_size*num_deictic_patches),i+1,actionsTiled] = \
                    mask*targets + \
                    (1-mask)*qCurrTargets[range(batch_size*num_deictic_patches),i+1,actionsTiled]


#            # CNN version
#            td_error_out, obses_deic_out, targets_out = targetTrain(
#                    obses_t_deic,
#                    qCurrTargets
#                    )

# MLP version
            td_error_out, obses_deic_out, targets_out = targetTrain(
                np.reshape(
                    obses_t_deic,
                    [-1, deicticShape[0] * deicticShape[1] * deicticShape[2]]),
                qCurrTargets)

        # bookkeeping for storing episode rewards
        episode_rewards[-1] += rew
        if done:
            new_obs = env.reset()
            episode_rewards.append(0.0)
        mean_100ep_reward = round(np.mean(episode_rewards[-101:-1]), 1)
        num_episodes = len(episode_rewards)
        if done and print_freq is not None and len(
                episode_rewards) % print_freq == 0:
            timerFinal = time.time()
            print("steps: " + str(t) + ", episodes: " + str(num_episodes) +
                  ", mean 100 episode reward: " + str(mean_100ep_reward) +
                  ", % time spent exploring: " +
                  str(int(100 * exploration.value(t))) + ", time elapsed: " +
                  str(timerFinal - timerStart))
            timerStart = timerFinal

        obs = new_obs
Example #12
def dense_concat_net(*args, **varargs):
    return models.mlp(out_size = 16, output_activation = tf.tanh, scope = "concat_net",
                      flatten = False,
                      *args, **varargs)
Example #13
#                                 nomal=True,
#                                  fill_mode='constant')
# generator = train_datagen.flow_from_directory(file_path=train_file_path,
#             data_dir=data_dir, data_suffix=data_suffix,
#             label_dir=label_dir, label_suffix=label_suffix,
#             target_size=target_shape, color_mode='grayscale',
#             batch_size=batch_size, shuffle=True,
#             loss_shape=None)

scheduler = LearningRateScheduler(lr_scheduler)
callbacks = [scheduler]

# ################### checkpoint saver#######################
checkpoint = ModelCheckpoint(filepath=os.path.join(save_path,
                                                   'checkpoint_weights.h5'),
                             save_weights_only=True)  # .{epoch:d}
callbacks.append(checkpoint)

# model = srcnn(input_shape=input_shape, kernel_size=[3, 3])
model = mlp()
# model.load_weights('unet_optics_l2.h5')
model.compile(loss=mean_squared_error, optimizer='adadelta')
model.summary()
history = model.fit(input_data,
                    input_label,
                    batch_size=batch_size,
                    nb_epoch=epochs,
                    callbacks=callbacks,
                    verbose=1)

model.save_weights('mlp_noise100_64.h5')
Example #14
def run_with_random_hyperparameters(_):
    # Loading within the process to save time
    from rnn_ppo import RNN_PPO
    import tensorflow as tf
    import loggy
    import random
    import schedules
    import models
    import discrete_maze.maze

    tf.reset_default_graph()
    log = loggy.Log("maze-hyperparam-search", autosave_freq = 15.0, autosave_vars_freq = 180.0, continuing = False)

    lr = 10 ** random.uniform(-5.5, -2.5)
    value_prop = 10 ** random.uniform(-1.5, 1.5)

    if random.random() > 0.8:
        separate_value_network = (lambda *args, **varargs:
            tf.squeeze(models.mlp(scope = "value_network", out_size = 1, hiddens = [64, 64],
                                  flatten = False, *args, **varargs), axis = 2))
    else:
        separate_value_network = None

    history_size = 1 if random.random() > 0.15 else random.randint(2, 5)
    id_size = 1 if random.random() > 0.15 else random.randint(2, 8)
    reward_type = random.choice(discrete_maze.maze.ExploreTask.reward_types)
    scale_reward_by_difficulty = random.random() > 0.5
    place_agent_far_from_dest = random.random() > 0.2
    agent_placement_prop = random.uniform(0.2, 0.9)
    time_penalty = 10 ** random.uniform(-2.3, -.8)
    invalid_move_penalty = 10 ** random.uniform(-1, 0.5)


    def dense_concat_net(*args, **varargs):
        return models.mlp(out_size = 16, output_activation = tf.tanh, scope = "concat_net",
                          flatten = False,
                          *args, **varargs)
    concat_net = dense_concat_net if random.random() > 0.5 else None

    params = {
        # 'env_creator': schedules.GridMazeSchedule(),
        'env_creator': schedules.ExploreCreatorSchedule(is_tree = False, history_size = history_size,
                                        id_size = id_size, reward_type = reward_type,
                                        scale_reward_by_difficulty = scale_reward_by_difficulty,
                                        place_agent_far_from_dest = place_agent_far_from_dest,
                                        agent_placement_prop = agent_placement_prop,
                                        time_penalty = time_penalty,
                                        invalid_move_penalty = invalid_move_penalty),
        'clip_ratio': random.uniform(0.18, 0.22), # this seems to be set well
        'max_policy_steps': random.randint(50, 100),
        'max_kl': random.uniform(0.01, 0.02),
        'lambda_gae': random.uniform(0.95, 1.0),
        'lr_schedule': (lambda t: lr),
        'value_prop_schedule': (lambda t: value_prop),
        'log': log,
        'gamma': random.uniform(0.95, 1.0),
        'min_observations_per_step': 4000,
        'render': False,
        'rnn_stacks': random.randint(1, 3),
        'hidden_units': 2 ** random.randint(4, 8),
        'separate_value_network': separate_value_network,
        'concat_net': concat_net
    }

    params = log.process_params(params)
    log.add_hyperparams(params)

    print("Running with parameters:", params)

    ppo = RNN_PPO(**params)
    ppo.initialize_variables()

    def early_stop(policy):
        maze_size = policy.log.get_last('current maze size', 4)
        steps = policy.log.get_last('simulation steps', 0)
        return maze_size == 4 and steps >= 50000

    ppo.optimize(500000, early_stop = early_stop)
    log.close()
Example #15
from __future__ import division, print_function, absolute_import

import tflearn
import tflearn.datasets.mnist as mnist

from models import mlp


def train(net, trainX, trainY, testX, testY):

    model = tflearn.DNN(net, tensorboard_verbose=0)
    model.fit(trainX, trainY, n_epoch=20,
                              validation_set=(testX, testY),
                              show_metric=True,
                              run_id="dense_model")


if __name__ == '__main__':

    trainX, trainY, testX, testY = mnist.load_data(one_hot=True)

    net = mlp()
    train(net, trainX, trainY, testX, testY)  
    
Example #16
    drop_constant_features(epigenomes)
    robust_zscoring(epigenomes)
    run_correlation_tests(epigenomes, labels)
    scores = extremely_correlated(epigenomes)
    seaborn_plot_most_correlated(epigenomes, labels, scores, cell_line)
    seaborn_plot_least_correlated(epigenomes, labels, scores, cell_line)
    get_top_most_different(epigenomes, labels, cell_line)
    get_top_most_different_tuples(epigenomes, cell_line)
    pca_plot(epigenomes, labels, cell_line)
    tsne_plot(epigenomes, labels, cell_line)

    for region in ['enhancers', 'promoters']:
        set_shape(epigenomes, region)
        modelperc, kwargsperc = perceptron(500, 1024)
        modeltree, kwargstree = decision_tree(500)
        modelmlp, kwargsmlp = mlp(500, 1024)
        modelffnn, kwargsffnn = ffnn(500, 1024)
        models.extend([modeltree, modelperc, modelmlp, modelffnn])
        kwargs.extend([kwargstree, kwargsperc, kwargsmlp, kwargsffnn])
        train_result = train(epigenomes, labels, models, kwargs, region,
                             cell_line)
        barplot(train_result, cell_line, region)
        models.clear()
        kwargs.clear()
        print('Wilcoxon ' + region + ':')
        wilcoxon_test(train_result, 'FFNN', 'DecisionTreeClassifier')
        wilcoxon_test(train_result, 'FFNN', 'Perceptron')
        wilcoxon_test(train_result, 'FFNN', 'MLP')
        wilcoxon_test(train_result, 'Perceptron', 'DecisionTreeClassifier')
        wilcoxon_test(train_result, 'Perceptron', 'MLP')
        wilcoxon_test(train_result, 'MLP', 'DecisionTreeClassifier')
Example #17
def train(seed, save_dir):
    set_global_seeds(seed)
    save_dir_0 = os.path.join(save_dir, 'seed_%d'%seed)
    os.makedirs(save_dir_0)
    env = envs.make(args.env,
                    'target_tracking',
                    render=bool(args.render),
                    record=bool(args.record),
                    directory=save_dir_0,
                    ros=bool(args.ros),
                    map_name=args.map,
                    num_targets=args.nb_targets,
                    im_size=args.im_size,
                    )
    with tf.device(args.device):
        with tf.compat.v1.variable_scope('seed_%d'%seed):
            hiddens = args.hiddens.split(':')
            hiddens = [int(h) for h in hiddens]
            if args.env == 'TargetTracking-v5':
                model = models.cnn_plus_mlp(
                                convs=[(4, 8, 4), (8, 4, 2)],
                                hiddens= hiddens,
                                dueling=bool(args.dueling),
                                init_mean = args.init_mean,
                                init_sd = args.init_sd,
                                inpt_dim = (args.im_size, args.im_size),
                )
            else:
                model = models.mlp(hiddens, init_mean=args.init_mean, init_sd=args.init_sd)
            act = deepadfq.learn(
                env,
                q_func=model,
                lr=args.learning_rate,
                lr_decay_factor=args.learning_rate_decay_factor,
                lr_growth_factor=args.learning_rate_growth_factor,
                max_timesteps=args.nb_train_steps,
                buffer_size=args.buffer_size,
                batch_size=args.batch_size,
                exploration_fraction=args.eps_fraction,
                exploration_final_eps=args.eps_min,
                target_network_update_freq=args.target_update_freq,
                checkpoint_freq=args.checkpoint_freq,
                learning_starts=args.nb_warmup_steps,
                gamma=args.gamma,
                prioritized_replay=bool(args.prioritized),
                prioritized_replay_alpha=args.prioritized_replay_alpha,
                callback=None,#callback,
                alg=args.alg,
                scope=args.scope,
                sdMin=np.sqrt(args.varth),
                noise=args.noise,
                act_policy=args.act_policy,
                epoch_steps=args.nb_epoch_steps,
                eval_logger=Logger(args.env,
                                env_type='target_tracking',
                                save_dir=save_dir_0,
                                render=bool(args.render),
                                figID=1,
                                ros=bool(args.ros),
                                map_name=args.map,
                                num_targets=args.nb_targets,
                                im_size=args.im_size,
                                eval_type=args.eval_type,
                                init_file_path=args.init_file_path,
                                ),
                save_dir=save_dir_0,
                test_eps=args.test_eps,
                gpu_memory=args.gpu_memory,
                render=(bool(args.render) or bool(args.ros)),
            )
            print("Saving model to model.pkl")
            act.save(os.path.join(save_dir_0, "model.pkl"))
    if args.record == 1:
        env.moviewriter.finish()
Example #18
def main():

#    env = gym.make("CartPoleRob-v0")
#    env = gym.make("CartPole-v0")
#    env = gym.make("CartPole-v1")
#    env = gym.make("Acrobot-v1")
#    env = gym.make("MountainCarRob-v0")
#    env = gym.make("FrozenLake-v0")
#    env = gym.make("FrozenLake8x8-v0")
    env = gym.make("FrozenLake8x8nohole-v0")
    
#    robShape = (2,)
#    robShape = (3,)
#    robShape = (200,)
#    robShape = (16,)
    robShape = (64,)
    def make_obs_ph(name):
#        return U.BatchInput(env.observation_space.shape, name=name)
        return U.BatchInput(robShape, name=name)

#    # these params are specific to mountaincar
#    def getOneHotObs(obs):
#        obsFraction = (obs[0] + 1.2) / 1.8
#        idx1 = np.int32(np.trunc(obsFraction*100))
#        obsFraction = (obs[1] + 0.07) / 0.14
#        idx2 = np.int32(np.trunc(obsFraction*100))
#        ident = np.identity(100)
#        return np.r_[ident[idx1,:],ident[idx2,:]]

    # these params are specific to frozenlake
    def getOneHotObs(obs):
#        ident = np.identity(16)
        ident = np.identity(64)
        return ident[obs,:]

    model = models.mlp([32])
#    model = models.mlp([64])
#    model = models.mlp([64], layer_norm=True)
#    model = models.mlp([16, 16])

    # parameters
    q_func=model
    lr=1e-3
#    max_timesteps=100000
    max_timesteps=50000
#    max_timesteps=10000
    buffer_size=50000
    exploration_fraction=0.1
#    exploration_fraction=0.3
    exploration_final_eps=0.02
#    exploration_final_eps=0.1
    train_freq=1
    batch_size=32
    print_freq=10
    checkpoint_freq=10000
    learning_starts=1000
    gamma=1.0
    target_network_update_freq=500
#    prioritized_replay=False
    prioritized_replay=True
    prioritized_replay_alpha=0.6
    prioritized_replay_beta0=0.4
    prioritized_replay_beta_iters=None
    prioritized_replay_eps=1e-6
    num_cpu=16

#    # try mountaincar w/ different input dimensions
#    inputDims = [50,2]
    
    sess = U.make_session(num_cpu)
    sess.__enter__()

    act, train, update_target, debug = build_graph.build_train(
        make_obs_ph=make_obs_ph,
        q_func=q_func,
        num_actions=env.action_space.n,
        optimizer=tf.train.AdamOptimizer(learning_rate=lr),
        gamma=gamma,
        grad_norm_clipping=10
    )

    act_params = {
        'make_obs_ph': make_obs_ph,
        'q_func': q_func,
        'num_actions': env.action_space.n,
    }

    # Create the replay buffer
    if prioritized_replay:
        replay_buffer = PrioritizedReplayBuffer(buffer_size, alpha=prioritized_replay_alpha)
        if prioritized_replay_beta_iters is None:
            prioritized_replay_beta_iters = max_timesteps
        beta_schedule = LinearSchedule(prioritized_replay_beta_iters,
                                       initial_p=prioritized_replay_beta0,
                                       final_p=1.0)
    else:
        replay_buffer = ReplayBuffer(buffer_size)
        beta_schedule = None

    # Create the schedule for exploration starting from 1.
    exploration = LinearSchedule(schedule_timesteps=int(exploration_fraction * max_timesteps),
                                 initial_p=1.0,
                                 final_p=exploration_final_eps)

    # Initialize the parameters and copy them to the target network.
    U.initialize()
    update_target()


    episode_rewards = [0.0]
    saved_mean_reward = None
    obs = env.reset()
    obs = getOneHotObs(obs)
    
#    with tempfile.TemporaryDirectory() as td:
    model_saved = False
#        model_file = os.path.join(td, "model")
    for t in range(max_timesteps):

        # Take action and update exploration to the newest value
        action = act(np.array(obs)[None], update_eps=exploration.value(t))[0]
        new_obs, rew, done, _ = env.step(action)
        new_obs = getOneHotObs(new_obs)
        
        # Store transition in the replay buffer.
        replay_buffer.add(obs, action, rew, new_obs, float(done))
        obs = new_obs
        
        episode_rewards[-1] += rew
        if done:
            obs = env.reset()
            obs = getOneHotObs(obs)
            episode_rewards.append(0.0)

        if t > learning_starts and t % train_freq == 0:
            
            # Minimize the error in Bellman's equation on a batch sampled from replay buffer.
            if prioritized_replay:
                experience = replay_buffer.sample(batch_size, beta=beta_schedule.value(t))
                (obses_t, actions, rewards, obses_tp1, dones, weights, batch_idxes) = experience
            else:
                obses_t, actions, rewards, obses_tp1, dones = replay_buffer.sample(batch_size)
                weights, batch_idxes = np.ones_like(rewards), None
            
            td_errors = train(obses_t, actions, rewards, obses_tp1, dones, weights)

            if prioritized_replay:
                new_priorities = np.abs(td_errors) + prioritized_replay_eps
                replay_buffer.update_priorities(batch_idxes, new_priorities)

        if t > learning_starts and t % target_network_update_freq == 0:
            
            # Update target network periodically.
            update_target()

        mean_100ep_reward = round(np.mean(episode_rewards[-101:-1]), 1)
        num_episodes = len(episode_rewards)
        
        if done and print_freq is not None and len(episode_rewards) % print_freq == 0:
#        if done:
            print("steps: " + str(t) + ", episodes: " + str(num_episodes) + ", mean 100 episode reward: " + str(mean_100ep_reward) + ", % time spent exploring: " + str(int(100 * exploration.value(t))))
#            if done and print_freq is not None and len(episode_rewards) % print_freq == 0:
#                logger.record_tabular("steps", t)
#                logger.record_tabular("episodes", num_episodes)
#                logger.record_tabular("mean 100 episode reward", mean_100ep_reward)
#                logger.record_tabular("% time spent exploring", int(100 * exploration.value(t)))
#                logger.dump_tabular()
#        sess
            

    num2avg = 20
    rListAvg = np.convolve(episode_rewards,np.ones(num2avg))/num2avg
    plt.plot(rListAvg)
#    plt.plot(episode_rewards)
    plt.show()

    sess
Example #19
def main():

    env = envstandalone.MultiGhostEvade()
#    env = envstandalone.GhostEvade()
#    env = envstandalone.BallCatch()
    
    max_timesteps=40000
#    max_timesteps=80000
    learning_starts=1000
#    buffer_size=50000
    buffer_size=1000
#    exploration_fraction=0.2
    exploration_fraction=0.4
    exploration_final_eps=0.02
    print_freq=10
    gamma=.98
#    target_network_update_freq=500
#    target_network_update_freq=100
#    target_network_update_freq=10
    target_network_update_freq=1
    learning_alpha = 0.2
    
#    batch_size=32
#    batch_size=64
    batch_size=512
#    batch_size=1024
    train_freq=1

    obsShape = (8,8,1)
#    deicticShape = (3,3,2)
#    deicticShape = (3,3,4)
#    deicticShape = (4,4,2)
#    deicticShape = (4,4,4)
    deicticShape = (5,5,2)
#    deicticShape = (6,6,2)
#    deicticShape = (8,8,2)
#    num_deictic_patches = 36
#    num_deictic_patches = 25
    num_deictic_patches = 16
#    num_deictic_patches = 9
#    num_deictic_patches = 1

#    num_actions = 4
#    num_actions = 3
    num_actions = env.action_space.n

    episode_rewards = [0.0]
    num_cpu=16
    num_cascade = 5
    
    # Create the schedule for exploration starting from 1.
    exploration = LinearSchedule(schedule_timesteps=int(exploration_fraction * max_timesteps),
                                 initial_p=1.0,
                                 final_p=exploration_final_eps)


#    # CNN version
#    # conv model parameters: (num_outputs, kernel_size, stride)
#    model = models.cnn_to_mlp(
###    model = models.cnn_to_mlp_2pathways(
###        convs=[(16,3,1)],
#        convs=[(32,3,1)],
###        convs=[(32,4,1)],
###        convs=[(16,4,1)],
##        hiddens=[16],
#        hiddens=[32],
#        dueling=True
#    )
    
    # MLP version
#    model = models.mlp([8, 16])
#    model = models.mlp([16, 16])
#    model = models.mlp([16, 32])
#    model = models.mlp([16, 16])
#    model = models.mlp([32, 32])
#    model = models.mlp([32])
    model = models.mlp([])
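    # An empty hiddens list means no hidden layers, so (assuming models.mlp follows the usual
    # baselines convention) the resulting Q-function is essentially a linear map of the
    # flattened deictic patch.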

    q_func=model
#    lr=0.01
    lr=0.001
#    lr=0.0005
    
    def make_obs_ph(name):
        return U.BatchInput(obsShape, name=name)
    
    def make_obsDeic_ph(name):

#        # CNN version
#        return U.BatchInput(deicticShape, name=name)
        
        # MLP version
        return U.BatchInput([deicticShape[0]*deicticShape[1]*deicticShape[2]], name=name)

    def make_target_ph(name):
#        return U.BatchInput([num_actions], name=name)
        return U.BatchInput([num_cascade,num_actions], name=name)

    sess = U.make_session(num_cpu)
    sess.__enter__()

    getq = build_getq(
            make_obsDeic_ph=make_obsDeic_ph,
            q_func=q_func,
            num_actions=num_actions,
            num_cascade=num_cascade,
            scope="deepq",
            qscope="q_func"
            )
    
    getqTarget = build_getq(
            make_obsDeic_ph=make_obsDeic_ph,
            q_func=q_func,
            num_actions=num_actions,
            num_cascade=num_cascade,
            scope="deepq",
            qscope="q_func_target"
            )

    update_target = build_update_target(scope="deepq", 
                                        qscope="q_func",
                                        qscopeTarget="q_func_target")
                      
    targetTrain = build_targetTrain(
        make_obsDeic_ph=make_obsDeic_ph,
        make_target_ph=make_target_ph,
        q_func=q_func,
        num_actions=env.action_space.n,
        num_cascade=num_cascade,
        optimizer=tf.train.AdamOptimizer(learning_rate=lr),
#        optimizer=tf.train.GradientDescentOptimizer(learning_rate=lr),
        scope="deepq", 
        qscope="q_func",
        grad_norm_clipping=1.
#        grad_norm_clipping=0.1
    )
    
    getDeic = build_getDeic_Foc(make_obs_ph=make_obs_ph,deicticShape=deicticShape)
#    getDeic = build_getDeic_FocCoarse(make_obs_ph=make_obs_ph,deicticShape=deicticShape)
    
    # Initialize the parameters and copy them to the target network.
    U.initialize()
    update_target()
    
    replay_buffer = ReplayBuffer(buffer_size)
    obs = env.reset()

    timerStart = time.time()
    for t in range(max_timesteps):

        obsDeictic = getDeic([obs])

##       CNN version
#        qCurr = getq(np.array(obsDeictic))
        
        # MLP version
        qCurr = getq(np.reshape(obsDeictic,[-1,deicticShape[0]*deicticShape[1]*deicticShape[2]]))


        # select action
        qCurrNoise = qCurr + np.random.random(np.shape(qCurr))*0.01 # add small amount of noise to break ties randomly
        action = np.argmax(np.max(qCurrNoise[:,-1,:],0)) # USE CASCADE
#        action = np.argmax(np.max(qCurrNoise[:,0,:],0)) # DO NOT USE CASCADE
        if np.random.rand() < exploration.value(t):
            action = np.random.randint(env.action_space.n)

        # take action
        new_obs, rew, done, _ = env.step(action)
        replay_buffer.add(obs, action, rew, new_obs, float(done))

        # MONTE CARLO VERSION
        # update rewards to actual monte carlo experiences
        if done:
            replay_buffer.update_montecarlo(gamma)
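            # update_montecarlo is assumed (from its name and the Monte Carlo target used
            # below) to replace the stored rewards of the finished episode with discounted
            # returns, so "targets = rewardsTiled" in the training step becomes a Monte Carlo
            # regression target rather than a one-step reward.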
            
        # sample from replay buffer and train
        if t > learning_starts and t % train_freq == 0:

            # Sample from replay buffer
            obses_t, actions, rewards, obses_tp1, dones = replay_buffer.sample(batch_size)

            # Put observations in deictic form
            obses_t_deic = getDeic(obses_t)
            obses_tp1_deic = getDeic(obses_tp1)
#            obses_t_deic = getDeic(obses_t)[:,:,:,0:2]
#            obses_tp1_deic = getDeic(obses_tp1)[:,:,:,0:2]
            
            # Tile the per-transition quantities so there is one row per (batch item, deictic patch) pair
            donesTiled = np.repeat(dones,num_deictic_patches)
            rewardsTiled = np.repeat(rewards,num_deictic_patches)
            actionsTiled = np.repeat(actions,num_deictic_patches)
            
#            # Get curr, next values: CNN version: NO ROTATION-AUGMENTATION 
#            qNextTarget = getqTarget(obses_tp1_deic)
#            qNext = getq(obses_tp1_deic)
#            qCurr = getq(obses_t_deic)
            
            # Get curr, next values: MLP version
            qNext = getq(np.reshape(obses_tp1_deic,[-1,deicticShape[0]*deicticShape[1]*deicticShape[2]]))
            qCurr = getq(np.reshape(obses_t_deic,[-1,deicticShape[0]*deicticShape[1]*deicticShape[2]]))

#            # ROTATION-AUGMENTATION: AUGMENT EXPERIENCES WITH FOUR ROTATIONS
#            obses_t_deicRot1 = np.rot90(obses_t_deic,k=3,axes=(1,2))
#            obses_t_deicRot2 = np.rot90(obses_t_deic,k=2,axes=(1,2))
#            obses_t_deicRot3 = np.rot90(obses_t_deic,k=1,axes=(1,2))
#            obses_t_deic = np.r_[obses_t_deic, obses_t_deicRot1, obses_t_deicRot2, obses_t_deicRot3]
#            obses_tp1_deicRot1 = np.rot90(obses_tp1_deic,k=3,axes=(1,2))
#            obses_tp1_deicRot2 = np.rot90(obses_tp1_deic,k=2,axes=(1,2))
#            obses_tp1_deicRot3 = np.rot90(obses_tp1_deic,k=1,axes=(1,2))
#            obses_tp1_deic = np.r_[obses_tp1_deic, obses_tp1_deicRot1, obses_tp1_deicRot2, obses_tp1_deicRot3]
#            qCurr = getq(np.array(obses_t_deic))
#            qNext = getq(np.array(obses_tp1_deic))
#            actionsTiled = np.r_[actionsTiled, actionsTiled+1, actionsTiled+2, actionsTiled+3]
#            actionsTiled = actionsTiled - 4 * (actionsTiled>3)
#            rewardsTiled = np.r_[rewardsTiled,rewardsTiled,rewardsTiled,rewardsTiled]
#            donesTiled = np.r_[donesTiled,donesTiled,donesTiled,donesTiled]            
            
            # This version pairs a glimpse with the same glimpse on the next time step
            qNextmax = np.max(qNext[:,-1,:],1) # standard
#            actionsNext = np.argmax(qNextTarget[:,-1,:],1) # double-q
#            qNextmax = qNext[range(num_deictic_patches*batch_size),-1,actionsNext]
            
#            # This version takes the max over all glimpses
#            qNextTiled = np.reshape(qNext[:,-1,:],[batch_size,num_deictic_patches,num_actions])
#            qNextmax = np.repeat(np.max(np.max(qNextTiled,2),1),num_deictic_patches)

            # BELLMAN VERSION
            targets = rewardsTiled + (1-donesTiled) * gamma * qNextmax

            # MONTE CARLO VERSION
            # (this assignment overwrites the Bellman targets above, so the Monte Carlo
            # returns are what the network is actually trained on in this example)
            targets = rewardsTiled

#            # Take min over targets in same group
#            obses_t_deic_reshape = np.reshape(obses_t_deic,[-1,deicticShape[0]*deicticShape[1]*deicticShape[2]])
#            unique_deic, uniqueIdx, uniqueCounts= np.unique(obses_t_deic_reshape,return_inverse=True,return_counts=True,axis=0)
#            for i in range(np.shape(uniqueCounts)[0]):
#                targets[uniqueIdx==i] = np.min(targets[uniqueIdx==i])
            
            
            qCurrTargets = np.copy(qCurr)
            
            # Copy into cascade with pruning.
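            # num_cascade Q-heads are used to estimate a minimum value for each (s, a) pair:
            # head 0 always takes the new target, and head i+1 is pulled down to the target
            # only where the target falls below head i's entry in qCurrTargets; otherwise it
            # keeps its previous estimate.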
            expLen = np.shape(qCurr)[0]
            qCurrTargets[range(expLen),0,actionsTiled] = targets
            for i in range(num_cascade-1):
                mask = targets < qCurrTargets[range(expLen),i,actionsTiled]
                qCurrTargets[range(expLen),i+1,actionsTiled] = \
                    mask*targets + \
                    (1-mask)*qCurrTargets[range(expLen),i+1,actionsTiled]
            
#            # CNN version
#            td_error_out, obses_deic_out, targets_out = targetTrain(
#                    obses_t_deic,
#                    qCurrTargets
#                    )
            
            # MLP version
            td_error_out, obses_deic_out, targets_out = targetTrain(
                    np.reshape(obses_t_deic,[-1,deicticShape[0]*deicticShape[1]*deicticShape[2]]),
                    qCurrTargets
                    )
                
        # Update target network periodically.
        if t > learning_starts and t % target_network_update_freq == 0:
            update_target()

        # bookkeeping for storing episode rewards
        episode_rewards[-1] += rew
        if done:
            new_obs = env.reset()
            episode_rewards.append(0.0)
        mean_100ep_reward = round(np.mean(episode_rewards[-101:-1]), 1)
        num_episodes = len(episode_rewards)
        if done and print_freq is not None and len(episode_rewards) % print_freq == 0:
            timerFinal = time.time()
            print("steps: " + str(t) + ", episodes: " + str(num_episodes) + ", mean 100 episode reward: " + str(mean_100ep_reward) + ", % time spent exploring: " + str(int(100 * exploration.value(t))) + ", time elapsed: " + str(timerFinal - timerStart))
            timerStart = timerFinal
        
        obs = new_obs
Example #20
0
    # log = loggy.Log("miniworld-ppo", autosave_freq = 30.0)
    log = loggy.Log("dense-gerem8",
                    autosave_freq=15.0,
                    autosave_vars_freq=60.0,
                    continuing=False)

    params = {
        'clip_ratio': 0.2,
        'max_policy_steps': 80,
        'max_val_steps': 80,
        'max_kl': 0.015,
        'model': (lambda *args, **varargs: models.mlp(*args, **varargs)),
        # 'model': (lambda *args, **varargs: models.mlp(models.miniworld_preprocess(*args, time = False), **varargs)),
        'value_model': (lambda *args, **varargs: tf.squeeze(
            models.mlp(out_size=1, *args, **varargs), axis=1)),
        # 'env_creator': schedules.GridMazeSchedule(),
        # 'env_creator': schedules.ExploreCreatorSchedule(is_tree = False, history_size = 1,
        #                                 id_size = 1, reward_type = 'penalty+finished', scale_reward_by_difficulty = False),
        # 'env_creator': schedules.DummyGymSchedule('LunarLander-v2'),
        # 'env_creator': schedules.DummyGymSchedule('MiniWorld-Hallway-v0'),
        'env_creator': schedules.ConstantMazeSchedule('saved_mazes/gerem8.dill'),
        'lr_schedule': (lambda t: 3e-4),
        'min_observations_per_step': 5000,
        'log': log,
Example #21
0
def main():

    # ******* Deictic parameters ********

    # deicticShape is the shape of the patch that is used. For example, a 3,3,2 patch
    # is a 2-channel 3x3 patch. num_deictic_patches must be set to the number of deicticShape
    # patches in an entire image.
    # For example, there are 36 3x3 patches that are contained in an 8x8 observation space
    # (assuming no zero padding). You must set this number to correspond to deicticShape.

    #    deicticShape = (3,3,2)
    #    deicticShape = (3,3,4)
    deicticShape = (4, 4, 2)
    #    deicticShape = (4,4,4)

    #    num_deictic_patches = 36
    num_deictic_patches = 25
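    # Sanity check on the patch count (assuming an 8x8 observation and no zero padding, as
    # described above): a 4x4 window fits (8 - 4 + 1)**2 = 25 times, and a 3x3 window fits
    # (8 - 3 + 1)**2 = 36 times, matching the num_deictic_patches values above.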

    # Desired network type. So far, I've done better w/ CNN
    WHICH_Q = "CNN"
    #    WHICH_Q = "MLP"

    # Method used to evaluate the value of the next state. So far, I've found that PAIRED_NEXT
    # works much better than MAX_NEXT. MAX_NEXT only works if you also set MIN_OVER_BATCH to
    # True; otherwise, it doesn't converge.
    # PAIRED_NEXT -> use value of corresponding patch on the next step
    # MAX_NEXT -> use max value over all next-step patches
    NEXT_PATCH = "PAIRED_NEXT"
    #    NEXT_PATCH = "MAX_NEXT"
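    # (In the training loop below, PAIRED_NEXT uses max_a Q(next_patch_i, a) separately for
    # each patch i, while MAX_NEXT uses a single max over all next-step patches and actions,
    # repeated for every patch of the batch item.)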

    # If MIN_OVER_BATCH is true, then we find the min value over all targets that have
    # the same corresponding patch. In principle, this should always help. The larger
    # the batch size, the more it should help. However, in practice, I find that
    # it seems to cap the maximum achievable performance. On the other hand, it can
    # help convergence when using NEXT_PATCH = "MAX_NEXT".
    #    MIN_OVER_BATCH = True
    MIN_OVER_BATCH = False
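    # (Implemented below by grouping identical flattened patches with
    # np.unique(..., return_inverse=True) and taking the minimum target within each group.)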

    # If MIN_OR_AVG_Q is "MIN", then we use the minimum Q value as calculated via the cascade.
    # Otherwise (if "AVG"), we use the standard expected-value Q estimate. "MIN" should work
    # best here; "AVG" is equivalent to the standard DQN backup applied to the patches.
    MIN_OR_AVG_Q = "MIN"
    #    MIN_OR_AVG_Q = "AVG"
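    # ("MIN" reads the last head of the cascade, index -1, while "AVG" reads head 0;
    # see the minoravg setting below.)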

    # If true, ROTATION_AUGMENTATION augments the agent's experience with
    # rotated versions of the patches. I typically turn this off.
    #    ROTATION_AUGMENTATION = True
    ROTATION_AUGMENTATION = False

    # ******* Load the environment ********

    env = envstandalone.StandaloneEnv()
    obsShape = env.observation_space.shape
    num_actions = env.action_space.n

    # ******* Standard DQN parameters ********

    max_timesteps = 40000
    learning_starts = 1000
    buffer_size = 50000
    exploration_fraction = 0.4
    exploration_final_eps = 0.02
    print_freq = 10
    gamma = .98
    target_network_update_freq = 1
    lr = 0.001
    batch_size = 32
    train_freq = 1
    num_cascade = 5  # number of Q-functions in the cascade used to estimate a minimum value for each s,a pair
    num_cpu = 16
    replay_buffer = ReplayBuffer(buffer_size)
    exploration = LinearSchedule(schedule_timesteps=int(exploration_fraction *
                                                        max_timesteps),
                                 initial_p=1.0,
                                 final_p=exploration_final_eps)

    if MIN_OR_AVG_Q == "MIN":
        minoravg = -1
    elif MIN_OR_AVG_Q == "AVG":
        minoravg = 0
    else:
        print("error")

    # ******* Create neural network model ********

    if WHICH_Q == "CNN":
        # conv model parameters: (num_outputs, kernel_size, stride)
        model = models.cnn_to_mlp(convs=[(32, 3, 1)],
                                  hiddens=[32],
                                  dueling=True)
        networkShapeOfObservation = [
            -1, deicticShape[0], deicticShape[1], deicticShape[2]
        ]
    elif WHICH_Q == "MLP":
        # MLP version
        #        model = models.mlp([8, 16])
        model = models.mlp([16, 32])
        #        model = models.mlp([32])
        #        model = models.mlp([])
        networkShapeOfObservation = [
            -1, deicticShape[0] * deicticShape[1] * deicticShape[2]
        ]
    else:
        print("WHICH_Q error: must select valid q-function")
    q_func = model

    # ******* Build tensorflow functions ********

    def make_obs_ph(name):
        return U.BatchInput(obsShape, name=name)

    def make_obsDeic_ph(name):

        if WHICH_Q == "CNN":
            return U.BatchInput(deicticShape, name=name)
        elif WHICH_Q == "MLP":
            return U.BatchInput(
                [deicticShape[0] * deicticShape[1] * deicticShape[2]],
                name=name)
        else:
            print("WHICH_Q error: must select valid q-function")

    def make_target_ph(name):
        #        return U.BatchInput([num_actions], name=name)
        return U.BatchInput([num_cascade, num_actions], name=name)

    getq = build_getq(make_obsDeic_ph=make_obsDeic_ph,
                      q_func=q_func,
                      num_actions=num_actions,
                      num_cascade=num_cascade,
                      scope="deepq",
                      qscope="q_func")

    targetTrain = build_targetTrain(
        make_obsDeic_ph=make_obsDeic_ph,
        make_target_ph=make_target_ph,
        q_func=q_func,
        num_actions=env.action_space.n,
        num_cascade=num_cascade,
        optimizer=tf.train.AdamOptimizer(learning_rate=lr),
        #        optimizer=tf.train.GradientDescentOptimizer(learning_rate=lr),
        scope="deepq",
        qscope="q_func",
        grad_norm_clipping=1.)

    getDeic = build_getDeic_Foc(make_obs_ph=make_obs_ph,
                                deicticShape=deicticShape)
    #    getDeic = build_getDeic_FocCoarse(make_obs_ph=make_obs_ph,deicticShape=deicticShape)

    sess = U.make_session(num_cpu)
    sess.__enter__()

    obs = env.reset()
    U.initialize()
    episode_rewards = [0.0]
    timerStart = time.time()
    for t in range(max_timesteps):

        # get q-values for current deictic patches
        obsDeictic = getDeic([obs])
        qCurr = getq(np.reshape(obsDeictic, networkShapeOfObservation))

        # select action
        qCurrNoise = qCurr + np.random.random(np.shape(
            qCurr)) * 0.01  # add small amount of noise to break ties randomly
        action = np.argmax(np.max(qCurrNoise[:, minoravg, :],
                                  0))  # USE CASCADE
        #        action = np.argmax(np.max(qCurrNoise[:,0,:],0)) # DO NOT USE CASCADE
        if np.random.rand() < exploration.value(t):
            action = np.random.randint(env.action_space.n)

        # take action
        new_obs, rew, done, _ = env.step(action)
        replay_buffer.add(obs, action, rew, new_obs, float(done))

        # sample from replay buffer and train
        if t > learning_starts and t % train_freq == 0:

            # Sample from replay buffer
            obses_t, actions, rewards, obses_tp1, dones = replay_buffer.sample(
                batch_size)

            # Put observations in deictic form
            obses_t_deic = getDeic(obses_t)
            obses_tp1_deic = getDeic(obses_tp1)

            # Reshape such that patches and batches are interleaved in the same column
            donesTiled = np.repeat(dones, num_deictic_patches)
            rewardsTiled = np.repeat(rewards, num_deictic_patches)
            actionsTiled = np.repeat(actions, num_deictic_patches)

            #            # Get curr, next values: NO ROTATION-AUGMENTATION
            qNext = getq(np.reshape(obses_tp1_deic, networkShapeOfObservation))
            qCurr = getq(np.reshape(obses_t_deic, networkShapeOfObservation))

            #            # ROTATION-AUGMENTATION: AUGMENT EXPERIENCES WITH FOUR ROTATIONS
            if ROTATION_AUGMENTATION:
                obses_t_deicRot1 = np.rot90(obses_t_deic, k=3, axes=(1, 2))
                obses_t_deicRot2 = np.rot90(obses_t_deic, k=2, axes=(1, 2))
                obses_t_deicRot3 = np.rot90(obses_t_deic, k=1, axes=(1, 2))
                obses_t_deic = np.r_[obses_t_deic, obses_t_deicRot1,
                                     obses_t_deicRot2, obses_t_deicRot3]
                obses_tp1_deicRot1 = np.rot90(obses_tp1_deic, k=3, axes=(1, 2))
                obses_tp1_deicRot2 = np.rot90(obses_tp1_deic, k=2, axes=(1, 2))
                obses_tp1_deicRot3 = np.rot90(obses_tp1_deic, k=1, axes=(1, 2))
                obses_tp1_deic = np.r_[obses_tp1_deic, obses_tp1_deicRot1,
                                       obses_tp1_deicRot2, obses_tp1_deicRot3]
                qCurr = getq(np.array(obses_t_deic))
                qNext = getq(np.array(obses_tp1_deic))
                actionsTiled = np.r_[actionsTiled, actionsTiled + 1,
                                     actionsTiled + 2, actionsTiled + 3]
                actionsTiled = actionsTiled - 4 * (actionsTiled > 3)
                rewardsTiled = np.r_[rewardsTiled, rewardsTiled, rewardsTiled,
                                     rewardsTiled]
                donesTiled = np.r_[donesTiled, donesTiled, donesTiled,
                                   donesTiled]

            # Get value of next state
            if NEXT_PATCH == "PAIRED_NEXT":
                qNextmax = np.max(qNext[:, minoravg, :], 1)  # standard
            elif NEXT_PATCH == "MAX_NEXT":
                qNextTiled = np.reshape(qNext[:, minoravg, :],
                                        [-1, num_deictic_patches, num_actions])
                qNextmax = np.repeat(np.max(np.max(qNextTiled, 2), 1),
                                     num_deictic_patches)
            else:
                print("error")

            # Compute Bellman estimate
            targets = rewardsTiled + (1 - donesTiled) * gamma * qNextmax

            # Take min over targets in same group
            if MIN_OVER_BATCH:
                obses_t_deic_reshape = np.reshape(
                    obses_t_deic,
                    [-1, deicticShape[0] * deicticShape[1] * deicticShape[2]])
                unique_deic, uniqueIdx, uniqueCounts = np.unique(
                    obses_t_deic_reshape,
                    return_inverse=True,
                    return_counts=True,
                    axis=0)
                for i in range(np.shape(uniqueCounts)[0]):
                    targets[uniqueIdx == i] = np.min(targets[uniqueIdx == i])

            # Copy into cascade with pruning.
            qCurrTargets = np.copy(qCurr)
            expLen = np.shape(qCurr)[0]
            qCurrTargets[range(expLen), 0, actionsTiled] = targets
            for i in range(num_cascade - 1):
                mask = targets < qCurrTargets[range(expLen), i, actionsTiled]
                qCurrTargets[range(expLen),i+1,actionsTiled] = \
                    mask*targets + \
                    (1-mask)*qCurrTargets[range(expLen),i+1,actionsTiled]

            td_error_out, obses_deic_out, targets_out = targetTrain(
                np.reshape(obses_t_deic, networkShapeOfObservation),
                qCurrTargets)

        # bookkeeping for storing episode rewards
        episode_rewards[-1] += rew
        if done:
            new_obs = env.reset()
            episode_rewards.append(0.0)
        mean_100ep_reward = round(np.mean(episode_rewards[-101:-1]), 1)
        num_episodes = len(episode_rewards)
        if done and print_freq is not None and len(
                episode_rewards) % print_freq == 0:
            timerFinal = time.time()
            print("steps: " + str(t) + ", episodes: " + str(num_episodes) +
                  ", mean 100 episode reward: " + str(mean_100ep_reward) +
                  ", % time spent exploring: " +
                  str(int(100 * exploration.value(t))) + ", time elapsed: " +
                  str(timerFinal - timerStart))
            timerStart = timerFinal

        obs = new_obs
Example #22
0
def mlp_mnist_sgd_experiment(rng, sample_size, hidden_size, depth, initializer,
                             learning_rate, momentum, nesterov, epochs,
                             batch_size):
    X, Y = mix_datasets(*mnist_mlp_connector(load_mnist()))
    x_train, y_train, inds = sample_train((X, Y), sample_size, rng)
    assert len(x_train) == len(y_train)
    print(f"Sampled {len(x_train)} datapoints iid")

    model = mlp(Y.shape[1],
                depth=depth,
                hidden=hidden_size,
                initializer=initializer)
    opt = tf.keras.optimizers.SGD(learning_rate=learning_rate,
                                  momentum=momentum,
                                  nesterov=nesterov)
    loss = tf.keras.losses.CategoricalCrossentropy(from_logits=True,
                                                   label_smoothing=0)
    model.compile(optimizer=opt,
                  loss=loss,
                  metrics=[
                      tf.keras.metrics.CategoricalAccuracy(name="accuracy",
                                                           dtype=None)
                  ],
                  loss_weights=None,
                  weighted_metrics=None,
                  run_eagerly=None,
                  steps_per_execution=None)

    model_extra_summary(model)

    print(
        f"Training model for {epochs} epochs and with {batch_size} batch size."
    )
    model.fit(x=x_train,
              y=y_train,
              batch_size=batch_size,
              epochs=epochs,
              verbose=0,
              callbacks=None,
              validation_split=0.0,
              validation_data=None,
              shuffle=True,
              class_weight=None,
              sample_weight=None,
              initial_epoch=0,
              steps_per_epoch=None,
              validation_steps=None,
              validation_freq=1,
              max_queue_size=10,
              workers=1,
              use_multiprocessing=False)
    # DEBUG
    model.summary()

    # measure generalization error
    train_results = model.evaluate(x=x_train,
                                   y=y_train,
                                   batch_size=None,
                                   verbose=0,
                                   sample_weight=None,
                                   steps=None,
                                   callbacks=None,
                                   max_queue_size=10,
                                   workers=1,
                                   use_multiprocessing=False,
                                   return_dict=True)
    expected_results = model.evaluate(x=X,
                                      y=Y,
                                      batch_size=None,
                                      verbose=0,
                                      sample_weight=None,
                                      steps=None,
                                      callbacks=None,
                                      max_queue_size=10,
                                      workers=1,
                                      use_multiprocessing=False,
                                      return_dict=True)

    (Xtr_uniq, Ytr_uniq), (Xtest, Ytest) = retrieve_split((X, Y), inds)
    train_unique_results = model.evaluate(x=Xtr_uniq,
                                          y=Ytr_uniq,
                                          batch_size=None,
                                          verbose=0,
                                          sample_weight=None,
                                          steps=None,
                                          callbacks=None,
                                          max_queue_size=10,
                                          workers=1,
                                          use_multiprocessing=False,
                                          return_dict=True)

    train_risk = 1 - train_results["accuracy"]
    expected_risk = 1 - expected_results["accuracy"]
    train_unique_risk = 1 - train_unique_results["accuracy"]
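    # Assuming the unique training points and the held-out points partition (X, Y), the
    # expected risk decomposes as a sample-size-weighted average:
    #   expected_risk = (len(Ytr_uniq) * train_unique_risk + len(Ytest) * test_risk) / len(Y)
    # Solving that identity for test_risk gives the expression below.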
    test_risk = 1. / len(Ytest) * (len(Y) * expected_risk -
                                   len(Ytr_uniq) * train_unique_risk)
    generalization = expected_risk - train_risk

    return {
        "train_risk": train_risk,
        "expected_risk": expected_risk,
        "generalization": generalization,
        "test_risk": test_risk,
        "train_unique_risk": train_unique_risk
    }
Example #23
0
                'average reward': np.mean(path['reward_totals']),
                'std of reward': np.std(path['reward_totals']),
                'approximate action entropy': approximate_entropy,
                'simulation steps': steps,
                'value loss': value_loss
            }
            self.env_creator.add_logging_data(log_data)

            self.log.step(log_data)
            self.log.print_step()


if __name__ == '__main__':
    log = loggy.Log("maze-h1-pggae")
    vpgae = VanillaPolicyGAE(
        model = (lambda *args, **varargs: models.mlp(*args, **varargs)),
        value_model = (lambda *args, **varargs: tf.squeeze(models.mlp(out_size = 1,
                                                                *args, **varargs), axis = 1)),
        env_creator = schedules.ExploreCreatorSchedule(is_tree = False, history_size = 1,
                                        id_size = 1, reward_type = 'penalty+finished', scale_reward_by_difficulty = False),
        # env_creator = schedules.DummyGymSchedule('Acrobot-v1'),
        lr_schedule = (lambda t: 2e-4),
        value_lr_schedule = (lambda t: 2.4e-3),
        lambda_gae = .97,
        min_observations_per_step = 4000,
        log = log,
        gamma = 0.999,
        render = False,
        render_mod = 128
    )
    vpgae.initialize_variables()
Example #24
0
            self.log.step(log_data)
            self.log.print_step()


if __name__ == '__main__':
    argument_parser = argparse.ArgumentParser(
        description="Train a network to navigate a discrete maze.")
    argument_parser.add_argument(
        "--history-size",
        default=1,
        type=int,
        help="Number of previous frames to give the network.")
    options = argument_parser.parse_args()

    log = loggy.Log("pg")
    vp = VanillaPolicy(
        model=(lambda *args, **varargs: models.mlp(
            hiddens=[64, 64], *args, **varargs)),
        # env_creator = schedules.ExploreCreatorSchedule(is_tree = False, history_size = options.history_size),
        env_creator=schedules.DummyGymSchedule('CartPole-v1'),
        lr_schedule=lambda t: 5e-3,
        min_observations_per_step=1000,
        log=log,
        gamma=1.0,
        render=True,
        render_mod=256)
    vp.initialize_variables()
    vp.optimize(100000)
    log.close()