Example No. 1
def _get_operands_from(field):
    if isinstance(field, list):
        return [flat(map(_get_operands_from, field))]
    if not isinstance(field, dict):
        return [[]]
    if 'input' in field:
        return [[field['input']]]
    if 'agg' in field:
        return [field['operands']]
    ops = [x for op in field['operands'] for x in _get_operands_from(op) if x]
    return [flat(ops)] if field['op'] not in ('AND', 'OR') else ops
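
Every example on this page calls a flat helper that the snippets themselves do not define, and its exact behaviour differs by project: some flatten a single level of nesting, some flatten recursively, and the TensorFlow examples below use a utils.flat that concatenates tensors instead. As a point of reference only, a minimal sketch of the list-flattening variants might look like this (hypothetical code, not taken from any of the projects shown here):

from collections.abc import Iterable
from itertools import chain


def flat_one_level(items):
    # Hypothetical: flatten exactly one level of nesting into a list.
    return list(chain.from_iterable(items))


def flat(items):
    # Hypothetical: flatten arbitrarily nested iterables, treating
    # strings and bytes as atomic values.
    result = []
    for item in items:
        if isinstance(item, Iterable) and not isinstance(item, (str, bytes)):
            result.extend(flat(item))
        else:
            result.append(item)
    return result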
Example No. 2
def encoder(h, noise_std, update_BN):
    # Perform encoding for each layer
    h += tf.random_normal(tf.shape(h)) * noise_std
    h = tf.identity(h, "h0")

    for i, layer_spec in enumerate(layers):
        with tf.variable_scope("encoder_bloc_" + str(i + 1),
                               reuse=tf.AUTO_REUSE):
            # Flatten and pooling layers pass through unchanged; dense/conv2d layers get an encoder bloc
            if layer_spec["type"] == "flat":
                h = flat(h, output_name="h")
            elif layer_spec["type"] == "max_pool_2x2":
                h = max_pool_2x2(h, output_name="h")
            else:
                if i == L - 1:
                    activation = tf.nn.softmax  # Only for the last layer
                else:
                    activation = tf.nn.relu
                h = encoder_bloc(h,
                                 layer_spec,
                                 noise_std,
                                 update_BN=update_BN,
                                 activation=activation)

    y = tf.identity(h, name="y")
    return y
Example No. 3
    def _clipped_minimize(self, optimizer, loss, vars, grad_name=None):
        grads, _ = zip(*optimizer.compute_gradients(loss, vars))
        grads, _ = tf.clip_by_global_norm(grads,
                                          clip_norm=self.dconfig.clip_gradient)
        if grad_name is not None:
            tf.summary.histogram(grad_name, utils.flat(grads))
        return optimizer.apply_gradients(zip(grads, vars))
Example No. 4
def process_dataset(dataset, n_clusters, pca_enabled=False):
    X = np.load("./data/" + dataset + '.npy')
    if pca_enabled:
        pca = utils.load_model(dataset + '_pca')
        X = pca.transform(utils.flat(X)).reshape(X.shape[0], X.shape[1], -1)
    segments = segmentation(X, n_clusters)
    np.save("./results/" + dataset + "_segments.npy", segments)
    color_segments = color_true_map(segments, back_color=[1, 1, 1])
    save_image(color_segments, dataset + "_segments")
    print("Segments:", len(np.bincount(segments.reshape(-1))) - 1)
Example No. 5
def generate_work_data(dataset, labels, colors, parameters, pca_enabled=False):
    X_img = np.load('./data/' + dataset + '.npy')
    y_img = np.load('./data/' + dataset + '_labels.npy')
    save_image(color_true_map(y_img, labels_colors=colors),
               dataset + "_labels")

    X = utils.flat(X_img)
    y = utils.flat(y_img)
    train_ratio, val_ratio = 0.1, 0.1
    test_ratio = 1 - (train_ratio + val_ratio)
    tv_mask, test_mask = utils.balanced_train_test_mask(
        y, np.isin(y, labels), test_ratio)
    train_mask, val_mask = utils.balanced_train_test_mask(
        y, tv_mask, val_ratio / (val_ratio + train_ratio))

    np.save("./data/" + dataset + "_train_mask.npy", train_mask)
    np.save("./data/" + dataset + "_val_mask.npy", val_mask)
    np.save("./data/" + dataset + "_test_mask.npy", test_mask)

    if pca_enabled:
        pca = utils.pca(X[tv_mask, :], 0.99)
        utils.save_model(pca, dataset + '_pca')
        train = pca.transform(X[train_mask, :])
        test = pca.transform(X[test_mask])
        flat = pca.transform(X)
    else:
        train = X[train_mask, :]
        test = X[test_mask, :]
        flat = X

    svc = utils.svc(train, y[train_mask], parameters["C"], parameters["gamma"])
    utils.save_model(svc, dataset + '_svc')
    test_pred = svc.predict(test)
    np.save("./data/" + dataset + "_test_pred.npy", test_pred)
    classification = svc.predict(flat).reshape(y_img.shape).astype(np.uint8)
    np.save("./data/" + dataset + "_clasification.npy", classification)
    save_image(color_true_map(classification, labels_colors=colors),
               dataset + "_clasification")

    score = utils.balanced_score(y[test_mask], test_pred)
    utils.save_json({"original": score}, dataset + "_original_score")
    print("Test Score:", score)
Example No. 6
def cpu_p_v2(*F, N=(500, 5000, 500)):
    '''Shows a graph of the CPU-time consumption of the functions *F, for
       inputs whose size varies over the interval N=(500, 5000, 500).
       A parameter E = g defines the function that generates the inputs'''
    X = [x for x in range(N[0], N[1] + 1, N[2])]
    Y = list(map(lambda f: [(f, sample(range(0, i), i)) for i in X], F))
    Y = flat(Y)
    inicio = time()
    pool = multiprocessing.Pool()
    Y = pool.map(executar_teste, Y)
    print('\nTotal time: %.1fs' % (time() - inicio))
    Y = [[Y[j + i * (len(Y) // len(F))] for j in range(len(Y) // len(F))]
         for i in range(len(F))]
    mostrar_gráfico(X, Y, F)
Example No. 7
def action_loss(logits, action, criterion, log=None):
    """
        Sum of losses of one hot vectors encoding an action
        :param logits: network output vector of [action, [[type_i, ent_i], for i in ents]]
        :param action: target vector size [7]
        :param criterion: loss function
        :return:
        """
    losses = []
    for idx, action_part in enumerate(flat(action)):
        tgt = _variable(torch.LongTensor([action_part]))
        losses.append(criterion(logits[idx], tgt))
    loss = torch.stack(losses, 0).mean()
    if log is not None:
        sl.log_loss(losses, loss)
    return loss
Example No. 8
    def __init__(self, config, init_vars):
        import tensorflow as tf
        dconfig = utils.DotDict(config)

        plasma.load_plasma_tensorflow_op()

        store_socket = utils.get_store_socket()
        self.var_oid = None

        self.obj_vars = [
            tf.Variable(init_var, name='obj_var', dtype=tf.float32)
            for init_var in init_vars
        ]
        self.plasma_grads_oids = tf.placeholder(shape=[dconfig.agent_count],
                                                dtype=tf.string,
                                                name="plasma_grads_oids")
        self.plasma_vars_oid = tf.placeholder(shape=[],
                                              dtype=tf.string,
                                              name="plasma_vars_oids")

        shapes = [v.shape for v in self.obj_vars]
        grads = utils.reverse_flat(
            tf.reduce_mean([
                plasma.tf_plasma_op.plasma_to_tensor(
                    self.plasma_grads_oids[a],
                    dtype=tf.float32,
                    plasma_store_socket_name=store_socket)
                for a in range(dconfig.agent_count)
            ],
                           axis=0), shapes)

        obj_optimizer = tf.train.AdamOptimizer(
            learning_rate=dconfig.obj_func_learning_rate)
        self.train_obj_op = obj_optimizer.apply_gradients(
            zip(grads, self.obj_vars))
        with tf.control_dependencies([self.train_obj_op]):
            self.update_vars = plasma.tf_plasma_op.tensor_to_plasma(
                [utils.flat(self.obj_vars)],
                self.plasma_vars_oid,
                plasma_store_socket_name=store_socket)

        config = tf.ConfigProto()
        config.gpu_options.allow_growth = True
        self.sess = tf.Session(config=config)
        self.sess.run(tf.global_variables_initializer())
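
Example No. 8 (and Example No. 18 further down) pairs utils.flat with utils.reverse_flat to move a whole set of gradients or variables through the plasma object store as a single tensor. A minimal sketch of how such a pair could be written for TensorFlow 1.x, assuming flat concatenates the flattened tensors and reverse_flat splits them back according to their static shapes (hypothetical code, not the project's actual utils module):

import numpy as np
import tensorflow as tf


def flat(tensors):
    # Hypothetical: reshape each tensor to 1-D and concatenate into one tensor.
    return tf.concat([tf.reshape(t, [-1]) for t in tensors], axis=0)


def reverse_flat(flat_tensor, shapes):
    # Hypothetical inverse: split by the number of elements in each original
    # (fully defined) shape and reshape every piece back.
    sizes = [int(np.prod(shape)) for shape in shapes]
    pieces = tf.split(flat_tensor, sizes, axis=0)
    return [tf.reshape(piece, shape) for piece, shape in zip(pieces, shapes)]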
Example No. 9
def segmentation(img, n_clusters, sigma=0.3):
    fcm = FCM(n_clusters=n_clusters, max_iter=10000, m=2)
    fcm.fit(utils.flat(img))
    abun = fcm.u.reshape((img.shape[0], img.shape[1], n_clusters))
    masks = np.empty(abun.shape, dtype=bool)
    for i in range(n_clusters):
        thresh = filters.threshold_otsu(abun[:, :, i])
        filters.gaussian(abun[:, :, i], sigma=sigma, output=abun[:, :, i])
        masks[:, :, i] = abun[:, :, i] > thresh
    masks[masks.sum(axis=2) > 1, :] = 0
    label_imgs = [np.zeros(img.shape, dtype=np.uint8)]
    for i in range(n_clusters):
        binary_opening(masks[:, :, i], out=masks[:, :, i])
        label_img = label(masks[:, :, i])
        label_img[label_img > 0] += np.max(label_imgs[-1])
        label_imgs.append(label_img)
    return np.dstack(label_imgs).sum(axis=2)
Example No. 10
    def vec_to_ix(self, _vec):
        """
        Input target vec representing cross entropy loss target  [1 0 2 0 0 0 0]
            Returns a one hot version of it as training input       [01 00, 100, 000, 000, 000]
        :param _vec:
        :return:
        """
        merged = flat(_vec)
        action = self.expr_o_h[merged[0]].unsqueeze(0)
        ix_ent = []
        merged = merged[1:]
        if self.mapping is not None:
            expr_str = EXPRESSIONS[_vec[0]]
            # print(expr_str)
            mp = self.mapping[expr_str]
            permute = mp['permute'] if 'permute' in mp else None
            insert_ = mp['insert'] if 'insert' in mp else None
            test = mp['tst'] if 'tst' in mp else None
            idx = mp['idx'] if 'idx' in mp else None
            if insert_ is not None and test is not None and idx is not None:
                text_ix = _vec[1:][test[0]]
                for i, insrt_ent in zip(idx, insert_):
                    if test[1] in self.lookup_ix_to_expr(text_ix):
                        merged.insert(i, insrt_ent)
                    else:
                        merged.append(insrt_ent)
            elif insert_ is not None and idx is None:
                for insrt_ent in insert_:
                    merged.append(insrt_ent)

            if permute is not None:
                nperm = np.argsort(permute)
                merged = np.asarray(merged)[nperm]
        else:
            # print(len(self.ents_o_h))
            for idx in range(len(self.ents_o_h) - len(merged)):
                merged.append(0)

        for idx, value in enumerate(merged):
            ix_ent.append(self.ents_o_h[idx][value].unsqueeze(0))
        return torch.cat([action, torch.cat(ix_ent, 1)], 1)
Example No. 11
def combined_ent_loss(logits, action, criterion, log=None):
    """
        some hand tuning of penalties for illegal actions...
            trying to force learning of types.

        action type => type_e...
        :param logits: network output vector of one_hot distributions
            [action, [type_i, ent_i], for i in ents]
        :param action: target vector size [7]
        :param criterion: loss function
        :return:
        """
    losses = []
    for idx, action_part in enumerate(flat(action)):
        tgt = _variable(torch.Tensor([action_part]).float())
        losses.append(criterion(logits[idx], tgt))
    lfs = [losses[0]]
    n = 2
    for pair in (losses[i:i + n] for i in range(1, len(losses), n)):
        lfs.append(torch.stack(pair, 0).sum())
    loss = torch.stack(lfs, 0).mean()
    if log is not None:
        sl.log_loss(losses, loss)
    return loss
Example No. 12
def merge_clauses(clauses):
    return Or(*flat(clause.sub_formulas for clause in clauses))
Example No. 13
    def visitConnector(self, obj):
        return flat(self.visit(form) for form in obj.sub_formulas)
Example No. 14
def train_plan(args, data, DNC, lstm_state, optimizer):
    """
        Things to test after some iterations:
         - on planning phase and on

         with goals - choose a goal and work toward that
        :param args:
        :return:
        """
    criterion = nn.CrossEntropyLoss().cuda(
    ) if args.cuda is True else nn.CrossEntropyLoss()
    cum_correct, cum_total, prob_times, n_success = [], [], [], 0
    penalty = 1.1

    for trial in range(args.iters):
        start_prob = time.time()
        phase_masks = data.make_new_problem()
        n_total, n_correct, prev_action, loss, stats = 0, 0, None, 0, []
        dnc_state = DNC.init_state(grad=False)
        lstm_state = DNC.init_rnn(grad=False)  # lstm_state,
        optimizer.zero_grad()

        for phase_idx in phase_masks:

            if phase_idx == 0 or phase_idx == 1:
                inputs = _variable(data.getitem_combined())
                logits, dnc_state, lstm_state = DNC(inputs, lstm_state,
                                                    dnc_state)
                _, prev_action = data.strip_ix_mask(logits)

            elif phase_idx == 2:
                mask = _variable(data.getmask())
                inputs = torch.cat([mask, prev_action], 1)
                logits, dnc_state, lstm_state = DNC(inputs, lstm_state,
                                                    dnc_state)
                _, prev_action = data.strip_ix_mask(logits)

            else:
                # sample from best moves
                actions_star, all_actions = data.get_actions(mode='both')
                if not actions_star:
                    break
                if args.zero_at == 'step':
                    optimizer.zero_grad()

                mask = data.getmask()
                prev_action = prev_action.cuda(
                ) if args.cuda is True else prev_action
                pr = u.depackage(prev_action)

                final_inputs = _variable(torch.cat([mask, pr], 1))
                logits, dnc_state, lstm_state = DNC(final_inputs, lstm_state,
                                                    dnc_state)
                exp_logits = data.ix_input_to_ixs(logits)

                guided = random.random() < args.beta
                # thing 1
                if guided:  # guided loss
                    final_action, lstep = L.naive_loss(exp_logits,
                                                       actions_star,
                                                       criterion,
                                                       log=True)
                else:  # pick own move
                    final_action, lstep = L.naive_loss(exp_logits,
                                                       all_actions,
                                                       criterion,
                                                       log=True)

                # penalty when the predicted action is not a legal move (TODO: test this!)
                action_own = u.get_prediction(exp_logits)
                if args.penalty and action_own not in [tuple(flat(t)) for t in all_actions]:
                    final_loss = lstep * _variable([args.penalty])
                else:
                    final_loss = lstep

                if args.opt_at == 'problem':
                    loss += final_loss
                else:

                    final_loss.backward(retain_graph=args.ret_graph)
                    if args.clip:
                        torch.nn.utils.clip_grad_norm(DNC.parameters(),
                                                      args.clip)
                    optimizer.step()
                    loss = lstep

                data.send_action(final_action)

                if (trial + 1) % args.show_details == 0:
                    action_accs = u.human_readable_res(data, all_actions,
                                                       actions_star,
                                                       action_own, guided,
                                                       lstep.data[0])
                    stats.append(action_accs)
                n_total, _ = tick(n_total, n_correct, action_own,
                                  flat(final_action))
                n_correct += 1 if action_own in [
                    tuple(flat(t)) for t in actions_star
                ] else 0
                prev_action = data.vec_to_ix(final_action)

        if stats:
            arr = np.array(stats)
            correct = len([
                1 for i in list(arr.sum(axis=1)) if i == len(stats[0])
            ]) / len(stats)
            sl.log_acc(list(arr.mean(axis=0)), correct)

        if args.opt_at == 'problem':
            floss = loss / n_total
            floss.backward(retain_graph=args.ret_graph)
            if args.clip:
                torch.nn.utils.clip_grad_norm(DNC.parameters(), args.clip)
            optimizer.step()
            sl.writer.add_scalar('losses.end', floss.data[0], sl.global_step)

        n_success += 1 if n_correct / n_total > args.passing else 0
        cum_total.append(n_total)
        cum_correct.append(n_correct)
        sl.add_scalar('recall.pct_correct', n_correct / n_total,
                      sl.global_step)
        print(
            "trial {}, step {} trial accy: {}/{}, {:0.2f}, running total {}/{}, running avg {:0.4f}, loss {:0.4f}  "
            .format(trial, sl.global_step, n_correct, n_total,
                    n_correct / n_total, n_success, trial,
                    running_avg(cum_correct, cum_total), loss.data[0]))
        end_prob = time.time()
        prob_times.append(end_prob - start_prob)
    print("solved {} out of {} -> {}".format(n_success, args.iters,
                                             n_success / args.iters))
    return DNC, optimizer, lstm_state, running_avg(cum_correct, cum_total)
Example No. 15
    def visitAnd(self, obj):
        return And(*flat(self.visit(form).sub_formulas
                         for form in obj.sub_formulas))
Example No. 16
    def future_policy_value(self,
                            x,
                            a,
                            trans,
                            seq_len,
                            seq_mask,
                            agent,
                            opt,
                            create_summary=False):
        """
        Computes the value of a policy according to the critic when updated using the objective function
        :param x: observations
        :param a: actions
        :param trans: entire tuple of transition (s_t, a_t, r_t, d_t, s_{t+1})
        :param seq_len: Length of trajectories
        :param seq_mask: Binary mask of trajectories
        :param agent: agent to compute value for
        :param opt: optimizer to use for the policy update
        :param create_summary: whether to create summary ops
        :return: tensor of batched future policy value
        """
        with tf.variable_scope('future_policy_value'):
            policy = agent.main.policy
            policy_vars = policy.trainable_variables
            # The replace manager can replace the policy variables with updated variables
            replace_manager = policy.variable_scope.custom_getter

            use_adam = self.dconfig.obj_func_second_order_adam
            step_size = self.dconfig.obj_func_second_order_stepsize
            step_count = self.dconfig.obj_func_second_order_steps + 1
            batch_size = self.dconfig.buffer_sample_size

            # Split tensors according to number of inner gradient descent steps
            x_s = tf.split(x, step_count, axis=0)
            a_s = tf.split(a, step_count, axis=0)
            if seq_len is not None:
                seq_len_s = tf.split(seq_len, step_count, axis=0)
                seq_mask_s = tf.split(seq_mask, step_count, axis=0)
            else:
                seq_len_s = utils.ConstArray()
                seq_mask_s = utils.ConstArray(seq_mask)
            trans_s = list(
                zip(*(tf.split(e, step_count, axis=0) for e in trans)))

            objective_val = None
            policy_grads = None
            opt_args_dict = {}
            current_vars = policy_vars
            var_names = [var.op.name for var in policy_vars]
            for i in range(step_count - 1):
                # Run policy
                policy_result = policy(x_s[i], seq_len=seq_len_s[i])
                # Run objective
                objective_val = self.objective(x_s[i], a_s[i], trans_s[i],
                                               seq_len_s[i], seq_mask_s[i],
                                               agent, policy_result,
                                               create_summary)
                # Compute policy gradients
                policy_grads = tf.gradients(objective_val * seq_mask_s[i],
                                            current_vars)

                if use_adam:

                    def grad_transform(grad, var, var_name):
                        if var_name in opt_args_dict:
                            opt_args = opt_args_dict[var_name]
                        else:
                            opt_args = []
                        new_grad, *opt_args = opt.adapt_gradients(grad,
                                                                  var,
                                                                  *opt_args,
                                                                  lr=step_size)
                        opt_args_dict[var_name] = opt_args
                        return new_grad
                else:

                    def grad_transform(grad, *args):
                        return step_size * grad

                # Use adam or vanilla SGD for inner gradient step
                transformed_grads = [
                    grad_transform(grad, var,
                                   var_name) for grad, var, var_name in zip(
                                       policy_grads, current_vars, var_names)
                ]

                one_step_updated_policy_vars = [
                    var - grad
                    for var, grad in zip(current_vars, transformed_grads)
                ]
                one_step_updated_policy_vars_dict = OrderedDict(
                    zip(var_names, one_step_updated_policy_vars))

                # Update the replace manager to run the policy with the updated variables in the next loop iteration
                replace_manager.replace_dict = one_step_updated_policy_vars_dict
                current_vars = one_step_updated_policy_vars

            # Run policy with final parameters
            future_policy = policy(x, seq_len=seq_len)
            replace_manager.replace_dict = None
            # Estimate the final policy value
            future_policy_value = agent.main.critic(
                x, future_policy.action) * seq_mask

            if create_summary:
                orig_policy = policy(x_s[-1], seq_len=seq_len_s[-1])
                partial_future_policy_value = future_policy_value[-batch_size:]
                tf.summary.histogram('objective_value', objective_val)
                tf.summary.histogram('policy_grads', utils.flat(policy_grads))
                tf.summary.histogram('policy_value', orig_policy.value)
                tf.summary.histogram('future_policy_value',
                                     partial_future_policy_value)
                tf.summary.histogram(
                    'policy_value_gain',
                    partial_future_policy_value - orig_policy.value)

                sample_axis = [
                    0, 1
                ] if self.dconfig.recurrent_time_steps > 1 else 0
                cor = utils.correlation(-orig_policy.value, objective_val,
                                        sample_axis)
                tf.summary.scalar('objective_critic_correlation',
                                  tf.squeeze(cor))

                grad, = tf.gradients(objective_val, policy_result.value)
                if grad is not None:
                    tf.summary.histogram('objective_critic_grads', grad)

        return future_policy_value
Example No. 17
    def __getitem__(self, key):
        if key not in self:
            return self._wrapper([key])
        else:
            return self._wrapper(flat([self[x] for x in self._aliases[key]]))
Example No. 18
    def _setup(self, dconfig, logdir):
        """
        Create tensorflow graph and summary writer
        :param dconfig: configuration to use to build the graph
        :param logdir: log directory to write tensorflow logs to
        """
        env = gym.make(dconfig.env_name)
        obs_dim = env.observation_space.shape[0]
        act_dim = env.action_space.shape[0]

        # Action limit for clamping: critically, assumes all dimensions share the same bound!
        act_limit = env.action_space.high[0]

        agent = Agent(dconfig, env)
        objective = Objective(dconfig)

        # Experience buffer
        replay_buffer = ReplayBuffer(obs_dim=obs_dim,
                                     act_dim=act_dim,
                                     size=dconfig.buffer_size,
                                     discount_factor=dconfig.discount_factor)

        time = dconfig.recurrent_time_steps if dconfig.recurrent_time_steps > 1 else None

        # Create datasets from replay buffer
        replay_buffer_dataset = replay_buffer.create_dataset(
            dconfig.buffer_sample_size, time)
        replay_buffer_dataset_iterator = replay_buffer_dataset.make_initializable_iterator(
        )

        # If we perform multiple gradient steps in the inner loop, provide different data for each step
        large_batch_size = (self.dconfig.obj_func_second_order_steps +
                            1) * dconfig.buffer_sample_size
        large_replay_buffer_dataset = replay_buffer.create_dataset(
            large_batch_size, time)
        large_replay_buffer_dataset_iterator = large_replay_buffer_dataset.make_initializable_iterator(
        )

        handle = tf.placeholder(tf.string, shape=[])
        iterator = tf.data.Iterator.from_string_handle(
            handle, replay_buffer_dataset.output_types,
            replay_buffer_dataset.output_shapes)
        itr_elem = utils.DotDict(iterator.get_next())
        x_ph, a_ph, x2_ph, r_ph, d_ph, lens_ph = itr_elem.obs1, itr_elem.acts, itr_elem.obs2,\
                                                 itr_elem.rews, itr_elem.done, itr_elem.lens

        # Mask for different trajectory lengths
        if lens_ph is not None:
            seq_mask = tf.sequence_mask(lens_ph, time, dtype=tf.float32)
        else:
            seq_mask = tf.ones([], dtype=tf.float32)

        x_ph_behv = placeholder(obs_dim, name='ObsBehavior')
        timestep = tf.placeholder(tf.float32, [], 'timestep')

        if dconfig.policy_is_recurrent:
            state_shape = [2, 1, dconfig.policy_units]
            init_policy_state = tf.placeholder_with_default(
                tf.zeros(state_shape), [2, 1, dconfig.policy_units])
        else:
            init_policy_state = None

        transition = [
            x_ph, a_ph, x2_ph, r_ph[..., tf.newaxis], d_ph[..., tf.newaxis]
        ]

        # Learning rate annealing
        if dconfig.policy_update_start:
            base = dconfig.policy_lr_annealing_base
            lr_progress = (base**tf.minimum(
                1.0, timestep / dconfig.policy_update_start) - 1) / (base - 1)
        else:
            lr_progress = 1

        # Optimizers
        pi_optimizer = utils.TensorAdamOptimizer(
            learning_rate=dconfig.policy_learning_rate * lr_progress)
        q_optimizer = tf.train.AdamOptimizer(
            learning_rate=dconfig.critic_learning_rate)
        obj_optimizer = tf.train.AdamOptimizer(
            learning_rate=dconfig.obj_func_learning_rate)

        # Main outputs from computation graph
        main = agent.main
        policy = main.policy(x_ph, seq_len=lens_ph)
        pi_action = policy.action
        q1_pi = policy.value
        pi_behv = main.policy(x_ph_behv[:, tf.newaxis],
                              initial_state=init_policy_state)
        q1 = main.critic(x_ph, a_ph)
        q2 = main.critic2(x_ph, a_ph)
        obj = objective.objective(x_ph, a_ph, transition, lens_ph, seq_mask,
                                  agent, policy)

        # Target policy network
        pi_action_targ = agent.target.policy(x2_ph, seq_len=lens_ph).action

        # Target Q networks
        # Target policy smoothing, by adding clipped noise to target actions
        epsilon = tf.random_normal(tf.shape(pi_action_targ),
                                   stddev=dconfig.critic_noise)
        epsilon = tf.clip_by_value(epsilon, -dconfig.critic_noise_clip,
                                   dconfig.critic_noise_clip)
        a2 = pi_action_targ + epsilon
        a2 = tf.clip_by_value(a2, -act_limit, act_limit)
        q1_targ = agent.target.critic(x2_ph, a2)
        q2_targ = agent.target.critic2(x2_ph, a2)

        # Bellman backup for Q functions, using Clipped Double-Q targets
        min_q_targ = tf.minimum(q1_targ, q2_targ)
        gamma = dconfig.discount_factor
        backup = tf.stop_gradient(r_ph + gamma * (1 - d_ph) * min_q_targ +
                                  d_ph)

        # Objective function annealing
        if dconfig.obj_func_anneal_steps:
            progress = tf.minimum(1.0,
                                  timestep / dconfig.obj_func_anneal_steps)
            obj = progress * obj - (1 - progress) * q1_pi

        # TD3 losses
        pi_loss = -tf.reduce_mean(q1_pi * seq_mask)
        pi_obj_loss = tf.reduce_mean(obj * seq_mask)
        q1_loss = tf.reduce_mean((q1 - backup)**2 * seq_mask)
        q2_loss = tf.reduce_mean((q2 - backup)**2 * seq_mask)
        q_loss = q1_loss + q2_loss

        main_vars = sorted(get_vars('main', trainable_only=False),
                           key=lambda v: v.name)
        target_vars = sorted(get_vars('target', trainable_only=False),
                             key=lambda v: v.name)

        # Train policy directly using critic
        train_pi_op = self._clipped_minimize(pi_optimizer,
                                             pi_loss,
                                             get_vars('main/policy'),
                                             grad_name='ddpg_policy_grads')
        # Train policy using objective function
        train_pi_obj_op = self._clipped_minimize(
            pi_optimizer,
            pi_obj_loss,
            get_vars('main/policy'),
            grad_name='objective_policy_grads')
        # Train critic
        train_q_op = q_optimizer.minimize(q_loss,
                                          var_list=get_vars('main/critic'))
        tf.summary.histogram('policy_params',
                             utils.flat(get_vars('main/policy')))

        # Objective function loss
        q1_obj = objective.future_policy_value(
            x_ph,
            a_ph,
            transition,
            lens_ph,
            seq_mask,
            agent,
            pi_optimizer,
            create_summary=dconfig.obj_func_enabled)
        obj_loss = -tf.reduce_mean(q1_obj)

        # Objective function optimization using ray (send gradients to ObjectiveServer)
        obj_vars = get_vars('objective')
        store_socket = utils.get_store_socket()

        shapes = [v.shape for v in obj_vars]
        plasma_var_oid = tf.placeholder(shape=[],
                                        dtype=tf.string,
                                        name="plasma_var_oid")
        retrieved_vars = utils.reverse_flat(
            plasma.tf_plasma_op.plasma_to_tensor(
                plasma_var_oid,
                dtype=tf.float32,
                plasma_store_socket_name=store_socket), shapes)
        # Op to read new objective parameters from ray object store
        plasma_read_vars = [
            var.assign(retrieved)
            for var, retrieved in zip(obj_vars, retrieved_vars)
        ]

        grads, vars = zip(*obj_optimizer.compute_gradients(obj_loss, obj_vars))
        grads, _ = tf.clip_by_global_norm(grads,
                                          clip_norm=dconfig.clip_gradient)
        tf.summary.histogram('objective_params', utils.flat(vars))
        tf.summary.histogram('objective_param_grads', utils.flat(grads))
        objective_grads = grads
        # Op to apply the objective gradients with the local optimizer
        train_obj_op = obj_optimizer.apply_gradients(zip(
            objective_grads, vars))

        plasma_grad_oid = tf.placeholder(shape=[],
                                         dtype=tf.string,
                                         name="plasma_grad_oid")
        # Op to send gradients to ObjectiveServer
        plasma_write_grads = plasma.tf_plasma_op.tensor_to_plasma(
            [utils.flat(objective_grads)],
            plasma_grad_oid,
            plasma_store_socket_name=store_socket)

        # Print number of parameters
        print(f'''
        ===================================================================
        Parameters
        Policy {np.sum(np.prod(v.shape) for v in get_vars('main/policy'))}
        Critic {np.sum(np.prod(v.shape) for v in get_vars('main/critic'))}
        Objective {np.sum(np.prod(v.shape) for v in obj_vars)}
        ===================================================================
        ''')

        # Polyak averaging for target variables
        polyak = 1 - dconfig.target_network_update_speed
        target_update = tf.group([
            tf.assign(v_targ, polyak * v_targ + (1 - polyak) * v_main)
            for v_main, v_targ in zip(main_vars, target_vars)
        ])

        # Initializing target networks to match main variables
        target_init = tf.group([
            tf.assign(v_targ, v_main)
            for v_main, v_targ in zip(main_vars, target_vars)
        ])

        # Ops for copying and resetting the policy (currently not used)
        reset_policy = tf.variables_initializer(get_vars('main'))
        copy_policy = tf.group([
            tf.assign(v_targ, v_main)
            for v_main, v_targ in zip(get_vars('main'), get_vars('target'))
        ])

        # Summaries
        tflog_utils.log_scalars(policy_loss=pi_loss, q_loss=q_loss)
        if dconfig.obj_func_enabled:
            tflog_utils.log_scalars(policy_obj_loss=pi_obj_loss,
                                    objective_loss=obj_loss)

        self.restore_savers = self._create_restore_savers(dconfig)
        self.saver = tf.train.Saver(max_to_keep=1000, save_relative_paths=True)
        self.summary = tf.summary.merge_all()
        self.summary_writer = tf.summary.FileWriter(
            f'{logdir}_agent{self.worker_index}')

        config = tf.ConfigProto()
        config.gpu_options.allow_growth = True
        self.sess = tf.Session(config=config)
        self.sess.run(tf.global_variables_initializer())

        init_ops = [target_init]
        self.sess.run(init_ops)

        rb_handle, large_rb_handle = self.sess.run([
            replay_buffer_dataset_iterator.string_handle(),
            large_replay_buffer_dataset_iterator.string_handle()
        ])

        # Return all created tf ops
        return utils.DotDict(locals())
Example No. 19
    def _antialias(cls, columns):
        return set(flat(map(cls._aliases, columns)))