def symbols_to_logits(ids):
    """Return the log of one row of `probabilities`, chosen by the width of `ids`.

    The row index is `tf.shape(ids)[1] - 1`, i.e. the size of the second
    dimension of `ids` minus one. `probabilities` is taken from the enclosing
    scope and is indexed as `[row, :]`.

    Args:
        ids: tensor with at least two dimensions; only its shape is used.

    Returns:
        The float-cast element-wise log of the selected row.
    """
    last_step = tf.shape(ids)[1] - 1
    step_probs = probabilities[last_step, :]
    return tf.to_float(tf.log(step_probs))
def symbols_to_logits(ids, _, states):
    """Return log-probabilities for the current step and bump the step state.

    The row of `probabilities` (taken from the enclosing scope) is chosen as
    `tf.shape(ids)[1] - 1`. `states["state"]` is incremented in place by one
    on every call.

    Args:
        ids: tensor with at least two dimensions; only its shape is used.
        _: ignored positional argument.
        states: mapping holding a "state" entry that supports `+= 1`.

    Returns:
        A `(logits, states)` pair: the float-cast log of the selected row and
        the same `states` mapping that was passed in.
    """
    seq_len = tf.shape(ids)[1]
    step_probs = probabilities[seq_len - 1, :]
    step_log_probs = tf.to_float(tf.log(step_probs))
    states["state"] += 1
    return step_log_probs, states
# Hidden (embedding) layer: x and W1 are defined earlier in the script.
b1 = tf.Variable(tf.random_normal([EMBEDDING_DIM]))  # hidden-layer bias
hidden_representation = tf.add(tf.matmul(x, W1), b1)

# Output layer projecting the embedding back onto the vocabulary.
W2 = tf.Variable(tf.random_normal([EMBEDDING_DIM, vocab_size]))
b2 = tf.Variable(tf.random_normal([vocab_size]))
# Keep the pre-softmax scores so the loss can be computed from logits.
output_logits = tf.add(tf.matmul(hidden_representation, W2), b2)
prediction = tf.nn.softmax(output_logits)

# TRAIN THE MODEL
sess = tf.Session()
init = tf.global_variables_initializer()
sess.run(init)  # variables must be initialised before the first train step

# Loss: mean cross-entropy between y_label and the softmax of the logits.
# Computing it from the logits (instead of tf.log(tf.nn.softmax(...))) avoids
# log(0) = -inf / NaN when a softmax output underflows to zero.
cross_entropy_loss = tf.reduce_mean(
    tf.nn.softmax_cross_entropy_with_logits(labels=y_label,
                                            logits=output_logits))

# Training step: plain gradient descent with a fixed learning rate.
train_step = tf.train.GradientDescentOptimizer(0.01).minimize(
    cross_entropy_loss)
n_iters = 10000

# The same full-batch feed is used for the update and for reporting the loss.
train_feed = {x: x_train, y_label: y_train}
for _ in range(n_iters):
    sess.run(train_step, feed_dict=train_feed)
    # Loss is evaluated after the update, on the training data itself.
    print('loss is : ', sess.run(cross_entropy_loss, feed_dict=train_feed))
# Histogram summaries for the remaining parameters (Pesos = weights).
histogram_bias1 = tf.summary.histogram('Bias1', Bias1)
histogram_pesos2 = tf.summary.histogram('Pesos2', Pesos2)
histogram_bias2 = tf.summary.histogram('Bias2', Bias2)
#scalar_pesos1 = tf.summary.scalar('Pesos1', Pesos1)
#scalar_bias1 = tf.summary.scalar('Bias1', Bias1)
#scalar_pesos2 = tf.summary.scalar('Pesos2', Pesos2)
#scalar_bias2 = tf.summary.scalar('Bias2', Bias2)

# Two sigmoid layers: hidden activation A, then the network output (Salida).
A = tf.sigmoid(tf.matmul(x_, Pesos1) + Bias1)
# Keep the pre-activation of the output layer so the loss can use logits.
salida_logits = tf.matmul(A, Pesos2) + Bias2
Salida = tf.sigmoid(salida_logits)
#histogram_salida = tf.summary.histogram('Salida', Salida)
#scalar_salida = tf.summary.scalar('salida',Salida)

# Cost (Costo): mean binary cross-entropy,
#   -(y_ * log(Salida) + (1 - y_) * log(1 - Salida)).
# It is computed from the logits because the explicit log(sigmoid(...)) form
# produces -inf / NaN as soon as the output saturates at exactly 0 or 1.
#Costo=tf.reduce_mean(abs(y_-Salida))
Costo = tf.reduce_mean(
    tf.nn.sigmoid_cross_entropy_with_logits(labels=y_, logits=salida_logits))
#histogram_costo = tf.summary.histogram('Costo', Costo)

train_step = tf.train.GradientDescentOptimizer(.9).minimize(Costo)

init = tf.global_variables_initializer()
sess = tf.Session()
sess.run(init)
#t_start = time.clock()

# Created once, outside the loop, so the graph is written a single time.
writer = tf.summary.FileWriter('./graphs', sess.graph)
for i in range(1000):
    #writer = tf.summary.FileWriter('./graphs', sess.graph)
    sess.run(train_step, feed_dict={x_: letras_x, y_: letras_y})
def processData(samples, iii, federated, tot_devices, fraction_training,
                neighbors_number, EPOCH_THRESHOLD):
    """Run local SGD plus consensus weight exchange for one device and dump results.

    Loads the dataset named by `args.input_data`, builds a two-layer model
    (conv + max-pool, then a fully connected softmax layer) whose weights are
    fed through placeholders, and for every epoch alternates mini-batch SGD,
    validation and a call to `consensus_p.getFederatedWeight`. Per-epoch
    statistics are written to a .mat file under `results/`.

    NOTE(review): the block nesting below was reconstructed from a
    whitespace-collapsed source; confirm indentation (in particular the
    position of `break` and of the final `savemat`) against the original file.

    Args:
        samples: number of training rows assigned to each device; also used
            to derive the number of training mini-batches.
        iii: index of this device; selects the training partition and is
            stored in the dump under "device".
        federated: forwarded to `CFA_process`.
        tot_devices: forwarded to `CFA_process` and to
            `getMobileNetwork_connectivity`.
        fraction_training: only used as `int(fraction_training / batch_size)`,
            the number of validation mini-batches.
        neighbors_number: forwarded to `CFA_process` and to
            `getMobileNetwork_connectivity`.
        EPOCH_THRESHOLD: not referenced in this function body.

    Returns:
        None. The results are saved with `sio.savemat`.
    """
    # eng = matlab.engine.start_matlab()
    eng = 0  # not used in this function body
    global learning_rate
    learning_rate_local = learning_rate  # not used in this function body
    np.random.seed(1)
    tf.set_random_seed(1)  # common initialization tf 1.13
    # tf.random.set_seed(1)

    # database = sio.loadmat('dati_mimoradar/data_mmwave_900.mat')
    database = sio.loadmat(args.input_data)
    # database = sio.loadmat('dati_mimoradar/data_mmwave_450.mat')

    x_train = database['mmwave_data_train']
    y_train = database['label_train']
    y_train_t = to_categorical(y_train)
    # DATA PREPARATION (NORMALIZATION AND SCALING OF FFT MEASUREMENTS):
    # negative values are clipped to 0, then everything is divided by 1000.
    x_train = (x_train.astype('float32').clip(0)) / 1000
    # DATA PARTITION: rows owned by device `iii`.
    # NOTE(review): the slice end is exclusive, so the `- 1` leaves out row
    # (iii + 1) * samples - 1 of every partition — confirm this is intended.
    x_train2 = x_train[iii * samples:((iii + 1) * samples - 1), :, :]
    y_train2 = y_train_t[iii * samples:((iii + 1) * samples - 1), :]

    x_test = database['mmwave_data_test']
    y_test = database['label_test']
    x_test = (x_test.astype('float32').clip(0)) / 1000
    y_test_t = to_categorical(y_test)

    # Number of validation mini-batches.
    total_batch2 = int(fraction_training / batch_size)

    # tf Graph Input
    x = tf.placeholder(
        tf.float32,
        [None, input_data1, input_data2])  # 512 POINT FFT RANGE MEASUREMENTS
    y = tf.placeholder(tf.float32,
                       [None, classes])  # 0-7 HR distances (safe - unsafe)

    # Externally fed weights for the training graph ...
    W_ext_l1 = tf.placeholder(tf.float32, [filter, filter, 1, number])
    b_ext_l1 = tf.placeholder(tf.float32, [number])
    W_ext_l2 = tf.placeholder(tf.float32, [multip * number, classes])
    b_ext_l2 = tf.placeholder(tf.float32, [classes])

    # ... and a second, separate set for the validation graph.
    W2_ext_l1 = tf.placeholder(tf.float32, [filter, filter, 1, number])
    b2_ext_l1 = tf.placeholder(tf.float32, [number])
    W2_ext_l2 = tf.placeholder(tf.float32, [multip * number, classes])
    b2_ext_l2 = tf.placeholder(tf.float32, [classes])

    # Set model weights. These variables only receive the result of each SGD
    # step (see the assign ops below); the forward pass reads the placeholders.
    W_l1 = tf.Variable(tf.random_normal([filter, filter, 1, number]))
    b_l1 = tf.Variable(tf.random_normal([number]))
    W_l2 = tf.Variable(tf.zeros([multip * number, classes]))
    b_l2 = tf.Variable(tf.zeros([classes]))

    # Construct model Layer #1 CNN 1d, Layer #2 FC
    hidden0 = conv2d_f(x, W_ext_l1, b_ext_l1)
    hidden01 = tf.layers.max_pooling2d(hidden0,
                                       pool_size=stride,
                                       strides=stride,
                                       padding='SAME')
    # print(hidden01) # check hidden01 size
    # hidden01 = tf.nn.max_pool1d(hidden0, ksize=stride, strides=stride, padding='SAME')
    fc01 = tf.reshape(hidden01, [-1, multip * number])
    pred = tf.nn.softmax(tf.matmul(fc01, W_ext_l2) +
                         b_ext_l2)  # example 2 layers

    # Same architecture, driven by the second placeholder set (validation).
    hidden2 = conv2d_f(x, W2_ext_l1, b2_ext_l1)
    hidden02 = tf.layers.max_pooling2d(hidden2,
                                       pool_size=stride,
                                       strides=stride,
                                       padding='SAME')
    fc02 = tf.reshape(hidden02, [-1, multip * number])
    pred2 = tf.nn.softmax(tf.matmul(fc02, W2_ext_l2) +
                          b2_ext_l2)  # example 2 layers

    # Minimize error using cross entropy. Predictions are clipped to
    # [1e-15, 0.99] before the log.
    cost = tf.reduce_mean(-tf.reduce_sum(
        y * tf.log(tf.clip_by_value(pred, 1e-15, 0.99)), reduction_indices=1))
    cost2 = tf.reduce_mean(-tf.reduce_sum(
        y * tf.log(tf.clip_by_value(pred2, 1e-15, 0.99)), reduction_indices=1))

    # Gradients per layer, taken with respect to the fed placeholders.
    grad_W_l1, grad_b_l1, grad_W_l2, grad_b_l2 = tf.gradients(
        xs=[W_ext_l1, b_ext_l1, W_ext_l2, b_ext_l2], ys=cost)

    # One manual gradient-descent step: fed value minus learning_rate * grad,
    # stored into the corresponding variable.
    new_W_l1 = W_l1.assign(W_ext_l1 - learning_rate * grad_W_l1)
    new_b_l1 = b_l1.assign(b_ext_l1 - learning_rate * grad_b_l1)
    new_W_l2 = W_l2.assign(W_ext_l2 - learning_rate * grad_W_l2)
    new_b_l2 = b_l2.assign(b_ext_l2 - learning_rate * grad_b_l2)

    # Initialize the variables (i.e. assign their default value)
    init = tf.global_variables_initializer()

    # Initialize CFA
    consensus_p = CFA_process(federated, tot_devices, iii, neighbors_number,
                              args.graph, compression, args.consensus_mode)
    neighbor_vector = consensus_p.getMobileNetwork_connectivity(
        iii, neighbors_number, tot_devices, args.graph - 1)
    # print(neighbor_vector.size)

    # Start training
    with tf.Session() as sess:
        sess.run(init)
        total_batch = int(samples / batch_size)
        # PRINTS THE TOTAL NUMBER OF MINI BATCHES
        # print(total_batch)

        # Training cycle: per-epoch statistics.
        # NOTE(review): timings and sgd_computational_time start from
        # np.ones and are then accumulated into, so every stored value
        # includes that initial 1 — confirm this offset is intended.
        val_loss = np.zeros(training_epochs)
        param_vector = np.ones(training_epochs)
        timings = np.ones(training_epochs)
        sgd_computational_time = np.ones(training_epochs)
        compression_computational_time = np.ones(training_epochs)

        for epoch in range(training_epochs):
            # changing neighbors on every round if randomized = true
            if randomized:
                neighbor_vector = consensus_p.getMobileNetwork_connectivity(
                    iii, neighbors_number, tot_devices, args.graph - 1)

            # One pass per neighbor plus one final pass
            # (range(neighbor_vector.size + 1)).
            for current_neighbor in range(neighbor_vector.size + 1):
                avg_cost = 0.  # accumulated below but not read afterwards
                avg_cost_test = 0.

                ######## sgd on local data
                start_time = time.time()
                ################
                for i in range(total_batch):
                    # NOTE(review): as with the partition above, the `- 1`
                    # leaves out the last row of every mini-batch.
                    batch_xs = x_train2[i * batch_size:((i + 1) * batch_size -
                                                        1), :, :]
                    batch_ys = y_train2[i * batch_size:((i + 1) * batch_size -
                                                        1), :]
                    if (i == 0) and (epoch == 0):  # initialization
                        # NOTE(review): this condition does not look at
                        # current_neighbor, so during epoch 0 the weights are
                        # re-drawn at the start of every neighbor pass and the
                        # values returned by the consensus step are discarded
                        # — confirm.
                        # W_val_l1 = np.zeros([512, 32])
                        W_val_l1 = np.random.normal(
                            0.0, 1.0, (filter, filter, 1, number))
                        # b_val_l1 = np.zeros([32])
                        b_val_l1 = np.random.normal(0.0, 1.0, number)
                        W_val_l2 = np.zeros([multip * number, classes])
                        b_val_l2 = np.zeros([classes])
                    elif (i > 0):
                        # Continue from the previous mini-batch's result.
                        W_val_l1 = n_W_l1  # modify for minibatch updates
                        b_val_l1 = n_b_l1
                        W_val_l2 = n_W_l2  # modify for minibatch updates
                        b_val_l2 = n_b_l2
                    # When i == 0 and epoch > 0 neither branch runs: W_val_*
                    # keep the values assigned by the consensus step below.

                    # Fit training using batch data. The g_* gradient values
                    # are fetched but not used afterwards.
                    n_W_l1, n_b_l1, n_W_l2, n_b_l2, c, g_W_l1, g_b_l1, g_W_l2, g_b_l2 = sess.run(
                        [
                            new_W_l1, new_b_l1, new_W_l2, new_b_l2, cost,
                            grad_W_l1, grad_b_l1, grad_W_l2, grad_b_l2
                        ],
                        feed_dict={
                            x: batch_xs,
                            y: batch_ys,
                            W_ext_l1: W_val_l1,
                            b_ext_l1: b_val_l1,
                            W_ext_l2: W_val_l2,
                            b_ext_l2: b_val_l2
                        })
                    avg_cost += c / total_batch  # Training loss
                #################
                sgd_computational_time[epoch] = sgd_computational_time[
                    epoch] + time.time() - start_time
                ###################

                # validation
                # NOTE(review): a fresh session is opened and initialised on
                # every pass; cost2 is fed entirely through placeholders here,
                # so the outer `sess` may be sufficient — confirm that
                # conv2d_f creates no variables before changing this.
                with tf.Session() as sess2:
                    sess2.run(init)
                    for i in range(total_batch2):
                        # Construct model
                        batch_xs = x_test[i * batch_size:((i + 1) * batch_size -
                                                          1), :, :]
                        batch_ys = y_test_t[i * batch_size:((i + 1) *
                                                            batch_size - 1), :]
                        c = sess2.run(cost2,
                                      feed_dict={
                                          x: batch_xs,
                                          y: batch_ys,
                                          W2_ext_l1: n_W_l1,
                                          b2_ext_l1: n_b_l1,
                                          W2_ext_l2: n_W_l2,
                                          b2_ext_l2: n_b_l2
                                      })
                        avg_cost_test += c / total_batch2

                val_loss[epoch] = avg_cost_test
                if epoch == 0:
                    param_vector[epoch] = multip * number * classes
                else:
                    # counter_param comes from the previous consensus call.
                    param_vector[epoch] = counter_param
                print(
                    'Test Device: ' + str(iii) + ' Neighbor counter: ' +
                    str(current_neighbor) + " Epoch:", '%04d' % (epoch + 1),
                    "loss=", "{:.9f}".format(avg_cost_test))

                ###########################################################
                # CFA: weights exchange (no gradients)
                # start_time = time.time()
                # NOTE(review): counter_param and compression_time are only
                # bound inside the two branches below; for any other value of
                # args.consensus_mode they stay undefined and the later reads
                # raise NameError.
                if args.consensus_mode == 0:
                    # combine one at a time and run sgd after every combination
                    if current_neighbor < neighbor_vector.size:
                        stop_consensus = False
                        W_val_l1, b_val_l1, W_val_l2, b_val_l2, counter_param, time_info, compression_time = consensus_p.getFederatedWeight(
                            n_W_l1, n_W_l2, n_b_l1, n_b_l2, epoch, val_loss,
                            args.eps, neighbor_vector[current_neighbor],
                            stop_consensus)
                        timings[epoch] = timings[epoch] + time_info
                    else:
                        # transmission of model parameters
                        stop_consensus = True
                        W_val_l1, b_val_l1, W_val_l2, b_val_l2, counter_param, time_info, compression_time = consensus_p.getFederatedWeight(
                            n_W_l1, n_W_l2, n_b_l1, n_b_l2, epoch, val_loss,
                            args.eps, [], stop_consensus)
                elif args.consensus_mode == 1:
                    # sets an alternative implementation, combine all and run one SGD
                    if current_neighbor == 0:
                        stop_consensus = False
                        W_val_l1, b_val_l1, W_val_l2, b_val_l2, counter_param, time_info, compression_time = consensus_p.getFederatedWeight(
                            n_W_l1, n_W_l2, n_b_l1, n_b_l2, epoch, val_loss,
                            args.eps, neighbor_vector, stop_consensus)
                        timings[epoch] = timings[epoch] + time_info
                    else:
                        stop_consensus = True  # enable transmission of model only, use as neighbors an empty
                        W_val_l1, b_val_l1, W_val_l2, b_val_l2, counter_param, time_info, compression_time = consensus_p.getFederatedWeight(
                            n_W_l1, n_W_l2, n_b_l1, n_b_l2, epoch, val_loss,
                            args.eps, [], stop_consensus)
                        # NOTE(review): placement of this break inside the
                        # else branch is reconstructed — confirm.
                        break
                ###############################################################
                compression_computational_time[epoch] = compression_time
            ###########################################################

        print("Optimization Finished!")
        # DUMP RESULTS %Y-%m-%d-%H-%M-%S
        # Note: the "val_acc" entry holds val_loss (the validation loss).
        sio.savemat(
            'results/dump_loss_g{}_n{}_c{}_m{}_con{}_rand{}_{}.mat'.format(
                args.graph, iii, compression, neighbors_number,
                args.consensus_mode, args.rand,
                time.strftime("%Y-%m-%d-%H-%M-%S", time.gmtime())), {
                    "val_acc": val_loss,
                    "device": iii,
                    "T_epochs": training_epochs,
                    "T_set_per_device": training_set_per_device,
                    "samples": samples,
                    "param_vector": param_vector,
                    "compression_method": compression,
                    "execution_time": timings,
                    "compression_computational_time":
                    compression_computational_time,
                    "sgd_computational_time": sgd_computational_time
                })
def mask(config: configure_pretraining.PretrainingConfig,
         inputs: pretrain_data.Inputs,
         mask_prob,
         proposal_distribution=1.0,
         disallow_from_mask=None,
         already_masked=None):
    """Dynamically pick positions to mask and build the masked-LM inputs.

    Up to `config.max_predictions_per_seq` positions are sampled per example
    from the allowed candidates; the original ids at those positions become
    the prediction targets, and a random subset of the positions is
    overwritten with the "[MASK]" vocabulary id.

    The optional arguments are not needed for plain BERT/ELECTRA-style uniform
    masking; they allow non-uniform ("strategic") masking.

    Args:
        config: configure_pretraining.PretrainingConfig; `vocab_file`,
            `do_lower_case` and `max_predictions_per_seq` are read.
        inputs: pretrain_data.Inputs providing `input_ids` and `input_mask`.
        mask_prob: fraction of each example's tokens to select. It is also
            used as `1 - mask_prob` for the chance that a selected position is
            actually replaced by "[MASK]".
        proposal_distribution: scalar or per-position scores that weight the
            sampling of positions.
        disallow_from_mask: forwarded to `_get_candidates_mask`; positions
            that must not be selected.
        already_masked: optional tensor of slots already used in a previous
            round; applied as `1 - already_masked` on the weights.

    Returns:
        The result of `pretrain_data.get_updated_inputs` with the new
        `input_ids`, `masked_lm_positions`, `masked_lm_ids` and
        `masked_lm_weights`.
    """
    max_predictions = config.max_predictions_per_seq
    batch_size, seq_length = modeling.get_shape_list(inputs.input_ids)

    # Candidate positions: wherever masking a token is permitted.
    tokenizer = tokenization.FullTokenizer(
        config.vocab_file, do_lower_case=config.do_lower_case)
    vocab = tokenizer.vocab
    maskable = _get_candidates_mask(inputs, vocab, disallow_from_mask)

    # How many predictions each example gets: round(tokens * mask_prob),
    # bounded to the range [1, max_predictions].
    real_token_count = tf.cast(
        tf.reduce_sum(inputs.input_mask, -1), tf.float32)
    rounded_target = tf.cast(
        tf.round(real_token_count * mask_prob), tf.int32)
    num_to_predict = tf.maximum(
        1, tf.minimum(max_predictions, rounded_target))
    slot_weights = tf.cast(
        tf.sequence_mask(num_to_predict, max_predictions), tf.float32)
    if already_masked is not None:
        slot_weights = slot_weights * (1 - already_masked)

    # Normalised sampling distribution over positions.
    maskable_float = tf.cast(maskable, tf.float32)
    unnormalized = proposal_distribution * maskable_float
    normalizer = tf.reduce_sum(unnormalized, axis=-1, keepdims=True)
    sampling_probs = tf.stop_gradient(unnormalized / normalizer)

    # Draw the positions; slots with zero weight are forced to position 0.
    sampling_logits = tf.log(sampling_probs)
    positions = tf.random.categorical(
        sampling_logits, max_predictions, dtype=tf.int32)
    positions = positions * tf.cast(slot_weights, tf.int32)

    # Look up the original ids at the drawn positions. The ids are flattened,
    # so every row's positions are shifted by that row's offset.
    row_offsets = tf.expand_dims(seq_length * tf.range(batch_size), -1)
    flat_indices = tf.reshape(positions + row_offsets, [-1, 1])
    flat_input_ids = tf.reshape(inputs.input_ids, [-1])
    target_ids = tf.gather_nd(flat_input_ids, flat_indices)
    target_ids = tf.reshape(target_ids, [batch_size, -1])
    target_ids = target_ids * tf.cast(slot_weights, tf.int32)

    # Overwrite a random subset of the drawn positions with the [MASK] id;
    # positions that lose the draw are zeroed before the scatter.
    use_mask_token = tf.less(
        tf.random.uniform([batch_size, max_predictions]), 1 - mask_prob)
    mask_token_positions = positions * tf.cast(use_mask_token, tf.int32)
    mask_token_fill = tf.fill([batch_size, max_predictions], vocab["[MASK]"])
    updated_ids, _ = scatter_update(
        inputs.input_ids, mask_token_fill, mask_token_positions)

    return pretrain_data.get_updated_inputs(
        inputs,
        input_ids=tf.stop_gradient(updated_ids),
        masked_lm_positions=positions,
        masked_lm_ids=target_ids,
        masked_lm_weights=slot_weights)
def _build_net(self):
    """Build the actor's TensorFlow graph: inputs, policy network, losses, ops.

    Creates the input placeholders, a two-layer dense policy network
    (128 tanh units, then `self.n_actions` linear outputs followed by a
    softmax), a policy-gradient loss with an entropy term, a safety
    (constraint) loss, a KL divergence against fed "old" probabilities, a
    clipped PPO loss, and the flat-gradient / parameter-assignment ops.

    Reads `self.suffix`, `self.n_features`, `self.n_actions`, `self.pg_lr`,
    `self.lr`, `self.clip_eps` and the name `desired_kl` from the enclosing
    scope. Returns None; everything is stored as attributes on `self`.

    NOTE(review): the block nesting was reconstructed from a
    whitespace-collapsed source. In particular, which statements sit inside
    the "Actor" variable scope should be confirmed against the original file.
    """
    with tf.variable_scope("Actor" + self.suffix):
        with tf.name_scope('inputs' + self.suffix):
            # Observations, one row of n_features values per sample.
            self.tf_obs = tf.placeholder(tf.float32, [None, self.n_features],
                                         name='observation' + self.suffix)
            # Index of the action taken for each sample.
            self.tf_acts = tf.placeholder(tf.int32, [
                None,
            ],
                                          name='actions_num' + self.suffix)
            # Per-sample weight multiplying the log-prob in the losses.
            self.tf_vt = tf.placeholder(tf.float32, [
                None,
            ],
                                        name='actions_value' + self.suffix)
            # Per-sample weight used in the safety loss.
            self.tf_safe = tf.placeholder(tf.float32, [
                None,
            ],
                                          name='safety_value' + self.suffix)
            # Scalar coefficient of the entropy term.
            self.entropy_weight = tf.placeholder(
                tf.float32,
                shape=(),
                name='entropy_weight_clustering' + self.suffix)
            ##### PPO change #####
            # NOTE(review): this placeholder is overwritten further down by
            # the computed ratio tensor, so the attribute no longer refers to
            # it after this method finishes.
            self.ppo_ratio = tf.placeholder(tf.float32, [
                None,
            ],
                                            name='ppo_ratio' + self.suffix)
            ##### PPO change #####

        # Hidden layer.
        layer = tf.layers.dense(
            inputs=self.tf_obs,
            units=128,
            activation=tf.nn.tanh,
            # kernel_initializer=tf.random_normal_initializer(mean=0, stddev=0.3),
            kernel_initializer=tf.orthogonal_initializer(
                gain=np.sqrt(2.)),  # ppo default initialization
            bias_initializer=tf.constant_initializer(0.1),
            name='fc1' + self.suffix)

        # Output layer: one unnormalised score per action.
        all_act = tf.layers.dense(
            inputs=layer,
            units=self.n_actions,
            activation=None,
            # kernel_initializer=tf.random_normal_initializer(mean=0, stddev=0.3),
            kernel_initializer=tf.orthogonal_initializer(
                gain=np.sqrt(2.)),  # ppo default initialization
            bias_initializer=tf.constant_initializer(0.1),
            name='fc2' + self.suffix)

        # Collected right after the dense layers, before any optimizer is
        # created.
        self.trainable_variables = tf.get_collection(
            tf.GraphKeys.TRAINABLE_VARIABLES, scope='Actor' + self.suffix)
        self.trainable_variables_shapes = [
            var.get_shape().as_list() for var in self.trainable_variables
        ]

        # sampling: action probabilities, clipped away from exactly zero.
        self.all_act_prob = tf.nn.softmax(all_act,
                                          name='act_prob' + self.suffix)
        self.all_act_prob = tf.clip_by_value(self.all_act_prob, 1e-20, 1.0)

        with tf.name_scope('loss' + self.suffix):
            # Negative log-probability of the chosen action (one-hot select).
            neg_log_prob = tf.reduce_sum(
                -tf.log(tf.clip_by_value(self.all_act_prob, 1e-30, 1.0)) *
                tf.one_hot(indices=self.tf_acts, depth=self.n_actions),
                axis=1)
            loss = tf.reduce_mean(neg_log_prob * self.tf_vt)
            # Adds entropy_weight * mean(sum(p * log p)), i.e. the weighted
            # negative entropy of the policy.
            loss += self.entropy_weight * tf.reduce_mean(
                tf.reduce_sum(
                    tf.log(tf.clip_by_value(self.all_act_prob, 1e-30, 1.0)) *
                    self.all_act_prob,
                    axis=1))
            # Same weighted negative-entropy term, exposed on its own.
            self.entro = self.entropy_weight * tf.reduce_mean(
                tf.reduce_sum(
                    tf.log(tf.clip_by_value(self.all_act_prob, 1e-30, 1.0)) *
                    self.all_act_prob,
                    axis=1))
            self.loss = loss

        with tf.name_scope('train' + self.suffix):
            self.train_op = tf.train.AdamOptimizer(
                self.pg_lr).minimize(loss)

        # safety loss
        """ * -1? """
        # Log-probability of the chosen action (positive sign, unlike
        # neg_log_prob above).
        self.chosen_action_log_probs = tf.reduce_sum(
            tf.log(tf.clip_by_value(self.all_act_prob, 1e-30, 1.0)) *
            tf.one_hot(indices=self.tf_acts, depth=self.n_actions),
            axis=1)
        ##### PPO CHANGE #####
        self.ppo_old_chosen_action_log_probs = tf.placeholder(
            tf.float32, [None])
        ##### PPO CHANGE #####
        # The attribute holds the stop_gradient tensor, not the placeholder.
        self.old_chosen_action_log_probs = tf.stop_gradient(
            tf.placeholder(tf.float32, [None]))
        # self.each_safety_loss = tf.exp(self.chosen_action_log_probs - self.old_chosen_action_log_probs) * self.tf_safe
        # Difference of new and old action probabilities, weighted per sample.
        self.each_safety_loss = (
            tf.exp(self.chosen_action_log_probs) -
            tf.exp(self.old_chosen_action_log_probs)) * self.tf_safe
        self.average_safety_loss = tf.reduce_mean(
            self.each_safety_loss)  #/ self.n_episodes tf.reduce_sum
        # self.average_safety_loss +=self.entro

        # KL D
        self.old_all_act_prob = tf.stop_gradient(
            tf.placeholder(tf.float32, [None, self.n_actions]))

        def kl(x, y):
            # KL divergence between two categorical distributions given as
            # probabilities. Entries with magnitude below EPS are replaced by
            # EPS, and EPS is added once more when building the distributions.
            EPS = 1e-10
            x = tf.where(tf.abs(x) < EPS, EPS * tf.ones_like(x), x)
            y = tf.where(tf.abs(y) < EPS, EPS * tf.ones_like(y), y)
            X = tf.distributions.Categorical(probs=x + EPS)
            Y = tf.distributions.Categorical(probs=y + EPS)
            return tf.distributions.kl_divergence(X, Y, allow_nan_stats=False)

        self.each_kl_divergence = kl(
            self.all_act_prob, self.old_all_act_prob
        )  # tf.reduce_sum(kl(self.all_act_prob, self.old_all_act_prob), axis=1)
        self.average_kl_divergence = tf.reduce_mean(
            self.each_kl_divergence)
        # self.kl_gradients = tf.gradients(self.average_kl_divergence, self.trainable_variables)  # useless
        # desired_kl is not a parameter of this method; it is resolved from
        # the enclosing scope.
        self.desired_kl = desired_kl

        # self.metrics = [self.loss, self.average_kl_divergence, self.average_safety_loss, self.entro]  # Luping
        # NOTE(review): self.loss appears twice; the commented-out variant
        # above had the average KL divergence in the second slot.
        self.metrics = [
            self.loss, self.loss, self.average_safety_loss, self.entro
        ]  # Luping

        # Flat
        self.flat_params_op = get_flat_params(self.trainable_variables)
        """not use tensorflow default function, here we calculate the gradient by self: (1) loss: g (2) kl: directional_gradients (math, fisher) (3) safe: b """

        ##### PPO change #####
        #### PPO Suyi's Change ####
        with tf.name_scope('ppoloss' + self.suffix):
            # Probability ratio new/old for the chosen action; this rebinds
            # self.ppo_ratio, replacing the placeholder created above.
            self.ppo_ratio = tf.exp(self.chosen_action_log_probs -
                                    self.ppo_old_chosen_action_log_probs)
            # self.ppo_ratio = tf.Print(self.ppo_ratio, [self.ppo_ratio], "self.ppo_ratio: ")
            surr = self.ppo_ratio * self.tf_vt
            # Clipped surrogate: minimum of the unclipped and the clipped
            # term, averaged and negated.
            self.ppoloss = -tf.reduce_mean(
                tf.minimum(
                    surr,
                    tf.clip_by_value(self.ppo_ratio, 1. - self.clip_eps,
                                     1. + self.clip_eps) * self.tf_vt))
            # Same weighted negative-entropy term as in self.loss.
            self.ppoloss += self.entropy_weight * tf.reduce_mean(
                tf.reduce_sum(
                    tf.log(tf.clip_by_value(self.all_act_prob, 1e-30, 1.0)) *
                    self.all_act_prob,
                    axis=1))
            # self.ppoloss += 0.01 * tf.reduce_mean(tf.reduce_sum(tf.log(tf.clip_by_value(self.all_act_prob, 1e-30, 1.0)) * self.all_act_prob, axis=1))

        with tf.variable_scope('ppotrain'):
            # self.atrain_op = tf.train.AdamOptimizer(self.lr).minimize(self.ppoloss)
            self.atrain_op = tf.train.AdamOptimizer(self.lr).minimize(
                self.ppoloss)
        #### PPO Suyi's Change ####

        self.ppoloss_flat_gradients_op = get_flat_gradients(
            self.ppoloss, self.trainable_variables)
        ##### PPO change #####

        # Flattened gradients of each objective w.r.t. the policy parameters.
        self.loss_flat_gradients_op = get_flat_gradients(
            self.loss, self.trainable_variables)
        self.kl_flat_gradients_op = get_flat_gradients(
            self.average_kl_divergence, self.trainable_variables)
        self.constraint_flat_gradients_op = get_flat_gradients(
            self.average_safety_loss, self.trainable_variables)

        # Flat vector placeholder created before get_fisher_product_op() is
        # called — presumably consumed there; verify.
        self.vec = tf.placeholder(tf.float32, [None])
        self.fisher_product_op = self.get_fisher_product_op()

        # Flat parameter vector to be written back into the network variables.
        self.new_params = tf.placeholder(tf.float32, [None])
        self.params_assign_op = assign_network_params_op(
            self.new_params, self.trainable_variables,
            self.trainable_variables_shapes)