def decoder_train_save_restore_test():
    # BeamSearchDecoder
    vocab_size = 6
    SOS_token = 0
    EOS_token = 5

    # x_data = np.array([[SOS_token, 3, 1, 4, 3, 2],[SOS_token, 3, 4, 2, 3, 1],[SOS_token, 1, 3, 2, 2, 1]], dtype=np.int32)
    # y_data = np.array([[3, 1, 4, 3, 2,EOS_token],[3, 4, 2, 3, 1,EOS_token],[1, 3, 2, 2, 1,EOS_token]],dtype=np.int32)
    # print("data shape: ", x_data.shape)

    index_to_char = {SOS_token: '<S>', 1: 'h', 2: 'e', 3: 'l', 4: 'o', EOS_token: '<E>'}
    x_data = np.array([[SOS_token, 1, 2, 3, 3, 4]], dtype=np.int32)
    y_data = np.array([[1, 2, 3, 3, 4, EOS_token]], dtype=np.int32)

    output_dim = vocab_size
    batch_size = len(x_data)
    hidden_dim = 7
    seq_length = x_data.shape[1]
    embedding_dim = 8

    embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim, trainable=True)
    # embedding.weights, embedding.trainable_variables, embedding.trainable_weights all give the same result.

    target = tf.convert_to_tensor(y_data)

    # Decoder
    method = 1
    if method == 1:
        # single-layer RNN
        decoder_cell = tf.keras.layers.LSTMCell(hidden_dim)
        # decoder init state:
        # init_state = [tf.zeros((batch_size, hidden_dim)), tf.ones((batch_size, hidden_dim))]  # (h, c)
        init_state = decoder_cell.get_initial_state(inputs=None, batch_size=batch_size, dtype=tf.float32)
    else:
        # multi-layer RNN
        decoder_cell = tf.keras.layers.StackedRNNCells(
            [tf.keras.layers.LSTMCell(hidden_dim), tf.keras.layers.LSTMCell(2 * hidden_dim)])
        # only the batch_size of `inputs` is used here
        init_state = decoder_cell.get_initial_state(inputs=tf.zeros_like(x_data, dtype=tf.float32))

    projection_layer = tf.keras.layers.Dense(output_dim)

    # For training, choose either TrainingSampler or ScheduledEmbeddingTrainingSampler.
    sampler = tfa.seq2seq.sampler.TrainingSampler()  # alias ---> sampler = tfa.seq2seq.TrainingSampler()
    # sampler = tfa.seq2seq.sampler.ScheduledEmbeddingTrainingSampler(sampling_probability=0.2)

    decoder = tfa.seq2seq.BasicDecoder(decoder_cell, sampler, output_layer=projection_layer)

    optimizer = tf.keras.optimizers.Adam(lr=0.01)

    inputs = tf.keras.Input(shape=(seq_length))
    embedded = embedding(inputs)
    embedded = tf.reshape(embedded, [batch_size, seq_length, embedding_dim])
    if isinstance(sampler, tfa.seq2seq.sampler.ScheduledEmbeddingTrainingSampler):
        outputs, last_state, last_sequence_lengths = decoder(
            embedded, initial_state=init_state,
            sequence_length=[seq_length] * batch_size,
            training=True, embedding=embedding.weights)
    else:
        outputs, last_state, last_sequence_lengths = decoder(
            embedded, initial_state=init_state,
            sequence_length=[seq_length] * batch_size, training=True)

    # model.layers ---> [InputLayer, Embedding, TensorFlowOpLayer, BasicDecoder]
    model = tf.keras.Model(inputs, [outputs, last_state, last_sequence_lengths])
    print(model.summary())

    train_mode = False
    if train_mode:
        for step in range(500):
            with tf.GradientTape() as tape:
                outputs, last_state, last_sequence_lengths = model(x_data)
                weights = tf.ones(shape=[batch_size, seq_length])
                loss = tfa.seq2seq.sequence_loss(outputs.rnn_output, target, weights)

            # must be refreshed on every step
            trainable_variables = embedding.trainable_variables + decoder.trainable_variables
            grads = tape.gradient(loss, trainable_variables)
            optimizer.apply_gradients(zip(grads, trainable_variables))

            if step % 10 == 0:
                print(step, loss.numpy())
        model.save_weights('./saved_model/model_ckpt')  # tf.saved_model.save does not work here.
    else:
        model.load_weights('./saved_model/model_ckpt')

    sample_batch_size = 5

    decoder_type = 1
    if decoder_type == 1:
        # GreedyEmbeddingSampler or SampleEmbeddingSampler can be chosen.
        sampler = tfa.seq2seq.GreedyEmbeddingSampler()  # alias ---> sampler = tfa.seq2seq.sampler.GreedyEmbeddingSampler
        # An embedding_fn can also be passed in:
        # sampler = tfa.seq2seq.GreedyEmbeddingSampler(embedding_fn=lambda ids: tf.nn.embedding_lookup(embedding.weights, ids))
        # sampler = tfa.seq2seq.SampleEmbeddingSampler()

        decoder = tfa.seq2seq.BasicDecoder(decoder_cell, sampler, output_layer=projection_layer,
                                           maximum_iterations=seq_length)
        if method == 1:  # single layer
            init_state = decoder_cell.get_initial_state(inputs=None, batch_size=sample_batch_size,
                                                        dtype=tf.float32)
        else:  # multi layer
            init_state = decoder_cell.get_initial_state(
                inputs=tf.zeros([sample_batch_size, hidden_dim], dtype=tf.float32))
    else:
        # Beam Search
        beam_width = 2
        decoder = tfa.seq2seq.BeamSearchDecoder(decoder_cell, beam_width,
                                                output_layer=projection_layer,
                                                maximum_iterations=seq_length)
        # The two approaches below give the same result.
        if method == 1:
            # init_state = decoder_cell.get_initial_state(inputs=None, batch_size=sample_batch_size*beam_width, dtype=tf.float32)
            init_state = tfa.seq2seq.tile_batch(
                decoder_cell.get_initial_state(inputs=None, batch_size=sample_batch_size, dtype=tf.float32),
                multiplier=beam_width)
        else:
            # init_state = decoder_cell.get_initial_state(inputs=tf.zeros([sample_batch_size*beam_width,hidden_dim],dtype=tf.float32))
            init_state = tfa.seq2seq.tile_batch(
                decoder_cell.get_initial_state(inputs=tf.zeros([sample_batch_size, hidden_dim], dtype=tf.float32)),
                multiplier=beam_width)

    outputs, last_state, last_sequence_lengths = decoder(
        embedding.weights, initial_state=init_state,
        start_tokens=tf.tile([SOS_token], [sample_batch_size]),
        end_token=EOS_token, training=False)

    if decoder_type == 1:
        result = tf.argmax(outputs.rnn_output, axis=-1).numpy()
        print(result)
        for i in range(sample_batch_size):
            print(''.join(index_to_char[a] for a in result[i] if a != EOS_token))
    else:
        result = outputs.predicted_ids.numpy()
        print(result.shape)
        for i in range(sample_batch_size):
            print(i, )
            for j in range(beam_width):
                print(''.join(index_to_char[a] for a in result[i, :, j] if a != EOS_token))
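# --- Hedged sketch (illustrative, not part of the snippet above): tfa.seq2seq.sequence_loss
# expects logits [batch, time, vocab], integer targets [batch, time], and float weights
# [batch, time]; the weights can zero out padding. The shapes below are arbitrary.
import numpy as np
import tensorflow as tf
import tensorflow_addons as tfa

batch, time, vocab = 2, 4, 6
logits = tf.random.normal([batch, time, vocab])
targets = tf.constant(np.random.randint(0, vocab, size=[batch, time]), dtype=tf.int32)
weights = tf.ones([batch, time])  # set positions after EOS to 0.0 to ignore padding
loss = tfa.seq2seq.sequence_loss(logits, targets, weights)
print(loss.numpy())  # scalar, averaged over time and batch by default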
def train(self, epochs, augmentation=True, plot_progress=False, plot_interval=50,
          save_backups=True, warm_up=False):
    assert self.original != [], 'Training dataset was not loaded, use load_training_dataset() first'
    for e in range(epochs):
        for i in range(self.iters_per_epoch):
            x, y = self.generate_input(augmentation=augmentation)
            x = x * 2 - 1
            y = y * 2 - 1
            if warm_up:
                y = x
            if i % plot_interval == 0 and plot_progress:
                plt.close()
                fig, ax = plt.subplots(1, 3, sharex=True, figsize=(16.5, 16.5))
                if self.mode == 'RGB':
                    ax[0].imshow((x[0] + 1) / 2)
                    ax[0].set_title('Original')
                    ax[1].imshow((self.G(x)[0] + 1) / 2)
                    ax[1].set_title('Starless')
                    ax[2].imshow((y[0] + 1) / 2)
                    ax[2].set_title('Target')
                else:
                    ax[0].imshow((x[0, :, :, 0] + 1) / 2, cmap='gray', vmin=0, vmax=1)
                    ax[0].set_title('Original')
                    ax[1].imshow((self.G(x)[0, :, :, 0] + 1) / 2, cmap='gray', vmin=0, vmax=1)
                    ax[1].set_title('Starless')
                    ax[2].imshow((y[0, :, :, 0] + 1) / 2, cmap='gray', vmin=0, vmax=1)
                    ax[2].set_title('Target')
                display.clear_output(wait=True)
                display.display(plt.gcf())
            if i > 0:
                print("\rEpoch: %d. Iteration %d / %d Loss %f " %
                      (e, i, self.iters_per_epoch, self.history['total'][-1]), end='')
            else:
                print("\rEpoch: %d. Iteration %d / %d " % (e, i, self.iters_per_epoch), end='')
            with tf.GradientTape() as gen_tape, tf.GradientTape() as dis_tape:
                gen_output = self.G(x)
                (p1_real, p2_real, p3_real, p4_real, p5_real, p6_real, p7_real,
                 p8_real, predict_real) = self.D(y)
                (p1_fake, p2_fake, p3_fake, p4_fake, p5_fake, p6_fake, p7_fake,
                 p8_fake, predict_fake) = self.D(gen_output)
                d = {}
                dis_loss = tf.reduce_mean(-(tf.math.log(predict_real + 1E-8) +
                                            tf.math.log(1 - predict_fake + 1E-8)))
                d['dis_loss'] = dis_loss
                gen_loss_GAN = tf.reduce_mean(-tf.math.log(predict_fake + 1E-8))
                d['gen_loss_GAN'] = gen_loss_GAN
                gen_p1 = tf.reduce_mean(tf.abs(p1_fake - p1_real))
                d['gen_p1'] = gen_p1
                gen_p2 = tf.reduce_mean(tf.abs(p2_fake - p2_real))
                d['gen_p2'] = gen_p2
                gen_p3 = tf.reduce_mean(tf.abs(p3_fake - p3_real))
                d['gen_p3'] = gen_p3
                gen_p4 = tf.reduce_mean(tf.abs(p4_fake - p4_real))
                d['gen_p4'] = gen_p4
                gen_p5 = tf.reduce_mean(tf.abs(p5_fake - p5_real))
                d['gen_p5'] = gen_p5
                gen_p6 = tf.reduce_mean(tf.abs(p6_fake - p6_real))
                d['gen_p6'] = gen_p6
                gen_p7 = tf.reduce_mean(tf.abs(p7_fake - p7_real))
                d['gen_p7'] = gen_p7
                gen_p8 = tf.reduce_mean(tf.abs(p8_fake - p8_real))
                d['gen_p8'] = gen_p8
                gen_L1 = tf.reduce_mean(tf.abs(y - gen_output))
                d['gen_L1'] = gen_L1 * 100
                gen_loss = (gen_loss_GAN * 0.1 + gen_p1 * 0.1 + gen_p2 * 10 + gen_p3 * 10 +
                            gen_p4 * 10 + gen_p5 * 10 + gen_p6 * 10 + gen_p7 * 10 +
                            gen_p8 * 10 + gen_L1 * 100)
                d['total'] = gen_loss
                for k in d:
                    if k in self.history.keys():
                        self.history[k].append(d[k] * (1 - self._ema) + self.history[k][-1] * self._ema)
                    else:
                        self.history[k] = [d[k]]
            gen_grads = gen_tape.gradient(gen_loss, self.G.trainable_variables)
            self.gen_optimizer.apply_gradients(zip(gen_grads, self.G.trainable_variables))
            dis_grads = dis_tape.gradient(dis_loss, self.D.trainable_variables)
            self.dis_optimizer.apply_gradients(zip(dis_grads, self.D.trainable_variables))
        if save_backups:
            if e % 2 == 0:
                self.G.save_weights("./starnet_backup_G_even.h5")
                self.D.save_weights("./starnet_backup_D_even.h5")
            else:
                self.G.save_weights("./starnet_backup_G_odd.h5")
                self.D.save_weights("./starnet_backup_D_odd.h5")
    if plot_progress:
        plt.close()
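# --- Hedged sketch (illustrative): the core pattern above is two independent
# GradientTapes recording the same forward pass, so generator and discriminator losses
# can each be differentiated w.r.t. their own variable set. G, D, and the data below
# are toy stand-ins, not the StarNet models.
import tensorflow as tf

G = tf.keras.Sequential([tf.keras.layers.Dense(4)])
D = tf.keras.Sequential([tf.keras.layers.Dense(1, activation='sigmoid')])
g_opt, d_opt = tf.keras.optimizers.Adam(1e-4), tf.keras.optimizers.Adam(1e-4)

x = tf.random.normal([8, 4])   # generator input batch
y = tf.random.normal([8, 4])   # "real" batch

with tf.GradientTape() as g_tape, tf.GradientTape() as d_tape:
    fake = G(x)
    p_real, p_fake = D(y), D(fake)
    d_loss = -tf.reduce_mean(tf.math.log(p_real + 1e-8) + tf.math.log(1 - p_fake + 1e-8))
    g_loss = -tf.reduce_mean(tf.math.log(p_fake + 1e-8))

# each tape only differentiates its own loss w.r.t. its own variables
g_opt.apply_gradients(zip(g_tape.gradient(g_loss, G.trainable_variables), G.trainable_variables))
d_opt.apply_gradients(zip(d_tape.gradient(d_loss, D.trainable_variables), D.trainable_variables))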
def run_example(filter_type, loss, out_dir, batch_size, hetero_q, hetero_r,
                learned_process, image_size, use_gpu, debug):
    """
    Example code to set up and train a differentiable filter for the simulated
    disc tracking task described in the paper "How to Train Your Differentiable
    Filter".

    Parameters
    ----------
    filter_type : str
        Defines which filtering algorithm is used. Can be ekf, ukf, mcukf or pf.
    loss : str
        Which loss to use for training the filter. This can be "nll" for the
        negative log likelihood, "mse" for the mean squared error or "mixed"
        for a combination of both.
    out_dir : str
        Path to the directory where results and data should be written to.
    batch_size : int
        Batch size for training and testing.
    hetero_q : bool
        If true, heteroscedastic process noise is learned, else constant.
    hetero_r : bool
        If true, heteroscedastic observation noise is learned, else constant.
    learned_process : bool
        If true, a neural network is used as process model in the filter,
        else an analytical process model is used.
    image_size : int
        Width and height of the image observations.
    use_gpu : bool
        If true, training and testing run on GPU (if one is available).
    debug : bool
        Turns on additional debug output and prints.

    Returns
    -------
    None.
    """
    if use_gpu:
        # limit tensorflow's gpu memory consumption
        gpus = tf.config.list_physical_devices('GPU')
        if gpus:
            try:
                for gpu in gpus:
                    tf.config.experimental.set_memory_growth(gpu, True)
            except RuntimeError as e:
                # Memory growth must be set before GPUs have been initialized
                print(e)
    else:
        # Hide GPUs from the visible devices to run on cpu
        tf.config.set_visible_devices([], 'GPU')

    # prepare the output directories
    if not os.path.exists(out_dir):
        os.makedirs(out_dir)
    train_dir = os.path.join(out_dir + '/train')
    data_dir = os.path.join(out_dir + '/data')
    if not os.path.exists(train_dir):
        os.makedirs(train_dir)
    if not os.path.exists(data_dir):
        os.makedirs(data_dir)

    # create a small dataset (if it doesn't already exist)
    name = 'example'
    if not os.path.exists(os.path.join(data_dir, 'info_' + name + '.txt')):
        c = DiscTrackingData(name, data_dir, image_size, 1000, 30, 1000,
                             rescale=1, debug=debug)
        c.create_dataset(15, 0, 0, 3.0)
    else:
        print('data already exists')

    # create a tensorflow model that combines a differentiable filter with a
    # problem context
    model = FilterApplication(filter_type, loss, batch_size, hetero_q, hetero_r,
                              learned_process, image_size, debug=debug)

    # Load training and test datasets.
    # We use sequence length 10 for training and validation and sequence
    # length 30 for testing.
    train_files, val_files, test_files = load_data(data_dir, name)
    train_set = tf.data.TFRecordDataset(train_files)
    train_set = model.preprocess(data_dir, name, train_set, 'train', 10)
    train_set = train_set.shuffle(500)
    train_set = train_set.batch(batch_size, drop_remainder=True)

    val_set = tf.data.TFRecordDataset(val_files)
    val_set = model.preprocess(data_dir, name, val_set, 'val', 10)
    val_set = val_set.batch(batch_size, drop_remainder=True)

    test_set = tf.data.TFRecordDataset(test_files)
    test_set = model.preprocess(data_dir, name, test_set, 'test', 30)
    test_set = test_set.batch(batch_size, drop_remainder=True)

    # prepare the training
    optimizer = tf.keras.optimizers.Adam(learning_rate=1e-3)
    epochs = 3
    step = 0

    # prepare a summary writer for logging information that can be viewed
    # with tensorboard
    train_summary_writer = tf.summary.create_file_writer(train_dir + '/' + str(time.time()))
    tf.summary.experimental.set_step(step)

    # Unfortunately, we cannot use keras model.fit here, since keras currently
    # does not support loss functions that receive multiple output tensors
    # (like mean and covariance of the filter's belief for computing the nll
    # loss). We thus write a custom training loop.
    print("\n Start training with sequence length 10")
    for epoch in range(epochs):
        print("\nStart of epoch %d \n" % (epoch))
        print("Validating ...")
        evaluate(model, val_set, batch_size)

        for (x_batch_train, y_batch_train) in train_set:
            start = time.time()
            with tf.GradientTape() as tape:
                # sample a random disturbance of the initial state from the
                # initial covariance
                n_val = np.random.normal(loc=np.zeros((model.dim_x)),
                                         scale=model.initial_covariance,
                                         size=(batch_size, model.dim_x))
                x_batch_train = (*x_batch_train, n_val)

                # Run the forward pass of the model
                with train_summary_writer.as_default():
                    out = model(x_batch_train, training=False)

                # Compute the loss value for this minibatch.
                loss_value, metrics, metric_names = \
                    model.context.get_loss(y_batch_train, out)

                # log summaries of the metrics every 50 steps
                with train_summary_writer.as_default():
                    with tf.summary.record_if(step % 50 == 0):
                        for i, name in enumerate(metric_names):
                            tf.summary.scalar('metrics/' + name,
                                              tf.reduce_mean(metrics[i]))

            # Use the gradient tape to automatically retrieve the gradients of
            # the trainable variables with respect to the loss.
            grads = tape.gradient(loss_value, model.trainable_weights)

            # Run one step of gradient descent by updating the value of the
            # variables to minimize the loss.
            optimizer.apply_gradients(zip(grads, model.trainable_weights))
            end = time.time()

            # Log every 50 batches.
            if step % 50 == 0:
                print("Training loss at step %d: %.4f (took %.3f seconds) " %
                      (step, float(loss_value), float(end - start)))
            step += 1
            tf.summary.experimental.set_step(step)

    # test the trained model on the held out data
    print("\n Testing with sequence length 30")
    evaluate(model, test_set, batch_size)
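# --- Hedged sketch (illustrative): tf.summary.record_if gates whether summaries are
# actually written, as used above to log metrics only every 50 steps. The writer path
# below is an assumption for demonstration.
import tensorflow as tf

writer = tf.summary.create_file_writer('/tmp/tb_example')
for step in range(200):
    tf.summary.experimental.set_step(step)
    with writer.as_default():
        with tf.summary.record_if(step % 50 == 0):
            # only actually written on steps 0, 50, 100, 150
            tf.summary.scalar('demo/loss', 1.0 / (step + 1))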
import tensorflow as tf

# create 4 tensors
a = tf.constant(1.)
b = tf.constant(2.)
c = tf.constant(3.)
w = tf.constant(4.)

with tf.GradientTape() as tape:  # build the gradient-recording environment
    tape.watch([w])  # add w to the list of traced tensors
    # build the computation
    y = a * w**2 + b * w + c

# compute the derivative
[dy_dw] = tape.gradient(y, [w])
print(dy_dw)
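# --- Hedged sketch (illustrative): constants are not watched by default, so without
# tape.watch the gradient comes back as None. Variables, by contrast, are watched
# automatically.
import tensorflow as tf

w = tf.constant(4.)
with tf.GradientTape() as tape:
    y = w * w                    # w was never watched
print(tape.gradient(y, w))       # None

v = tf.Variable(4.)
with tf.GradientTape() as tape:
    y = v * v                    # Variables are watched automatically
print(tape.gradient(y, v))       # tf.Tensor(8.0, ...)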
def train_epoch(self, train, learning_rate, args):
    self.optimizer.learning_rate = learning_rate

    skipped = 0
    num_gradients = 0
    while not train.epoch_finished():
        batch = train.next_batch(args.batch_size)
        sentences = batch["sentences"]
        layers = []
        for sentence in sentences:
            layers.append(get_layers(sentence))
        for accuracy in self.accuracies.values():
            accuracy.reset_states()

        with tf.GradientTape() as tape:
            encoded_tokens = self._encoder(
                token_ids=batch["batch_factors"][Sentence.TOKENS],
                token_charseqs=batch["charseqs"],
                token_charseq_ids=batch["charseq_ids"],
                token_values=batch["batch_factors"][Sentence.TOKEN_VALUES],
                token_additionals=[batch[x] for x in ["bert", "fasttext", "word2vec"] if x in batch],
                training=True)

            states = []
            for i, sentence in enumerate(sentences):
                states.append(self.State(self, sentence.empty_copy(),
                                         encoded_tokens[i][:sentence.n_tokens()], True))

            loss = 0
            for iteration in range(args.decoder_iterations):
                # 1) New nodes
                ori_nodes = [s.n_nodes for s in states]
                sum_nodes = sum(ori_nodes)
                nodes = tf.concat([s.nodes for s in states], axis=0)
                target_ops = np.zeros([sum_nodes], np.int32)
                target_node_values = np.zeros([sum_nodes, len(train.node_properties)], np.int32)
                target_node_values_mask = np.zeros([sum_nodes, 1], np.int32)
                # target_edge_values = np.zeros([sum_nodes, len(train.edge_properties)], np.int32)
                # target_edge_values_mask = np.zeros([sum_nodes, 1], np.int32)
                start = 0
                for i in range(len(sentences)):
                    if iteration < len(layers[i]):
                        state, layer, sentence = states[i], layers[i][iteration], sentences[i]
                        for n, target, edge in layer:
                            n = state.node_mapping[n]
                            target_ops[start + n] = NODE_PARENT if sentence.factors[Sentence.EDGE_PARENTS][edge] == target else NODE_CHILD
                            target_node_values[start + n] = sentence.factors[Sentence.NODE_VALUES][target - sentence.n_tokens()]
                            target_node_values_mask[start + n] = 1
                            # target_edge_values[start + n] = sentence.factors[Sentence.EDGE_VALUES][edge]
                            # target_edge_values_mask[start + n] = 1 if n >= state.sentence.n_tokens() else 0
                            state.add_node(target_node_values[start + n], n, target)
                    start += states[i].n_nodes_cached
                self.State.recompute_nodes(nodes, states, iteration)

                predictions = self.layers.decoder_node_operation[iteration](nodes)
                loss += self.loss_sce(target_ops, predictions)
                self.accuracies["node/ops"](target_ops, predictions)
                for i, prop in enumerate(train.node_properties):
                    predictions = self.layers.decoder_node_values[iteration][i](nodes)
                    loss += self.loss_sce(target_node_values[:, i], predictions, target_node_values_mask)
                    self.accuracies["node/" + prop](target_node_values[:, i], predictions, target_node_values_mask)
                # for i, prop in enumerate(train.edge_properties):
                #     predictions = self.layers.decoder_edge_values[iteration][i](nodes)
                #     loss += self.loss_sce(target_edge_values[:, i], predictions, target_edge_values_mask)
                #     self.accuracies["edge/" + prop](target_edge_values[:, i], predictions, target_edge_values_mask)

                # 2) Edges
                sum_nodes = sum(s.n_nodes for s in states)
                nodes = tf.concat([s.nodes for s in states], axis=0)
                target_indices_a, target_indices_b = [], []
                target_a_parent, target_a_child = [], []
                target_deprel_parents, target_deprel_children = [], []
                target_deprel_values = []
                start = 0
                for i in range(len(sentences)):
                    state, sentence = states[i], sentences[i]
                    offset = len(target_indices_a)
                    for j in range(ori_nodes[i], state.n_nodes):
                        target_indices_a.append(start + np.repeat(np.int32(j), state.n_nodes))
                        target_indices_b.append(start + np.arange(0, state.n_nodes, dtype=np.int32))
                        target_a_parent.append(np.zeros(state.n_nodes, np.float32))
                        target_a_child.append(np.zeros(state.n_nodes, np.float32))
                    for n_ori, n in state.node_mapping.items():
                        if n >= ori_nodes[i]:
                            for e in sentence.parents[n_ori]:
                                if sentence.factors[Sentence.EDGE_VALUES][e][0] == Mapping.ROOT:
                                    continue
                                p = sentence.factors[sentence.EDGE_PARENTS][e]
                                if p in state.node_mapping:
                                    p = state.node_mapping[p]
                                    if not args.no_anchors or not sentence.factors[Sentence.EDGE_VALUES][e][0] == Mapping.ANCHOR:
                                        target_a_child[offset + n - ori_nodes[i]][p] = 1
                                        target_deprel_parents.append(start + p)
                                        target_deprel_children.append(start + n)
                                        target_deprel_values.append(sentence.factors[Sentence.EDGE_VALUES][e])
                                        state.add_edge(p, n, target_deprel_values[-1])
                            for e in sentence.children[n_ori]:
                                if sentence.factors[Sentence.EDGE_VALUES][e][0] == Mapping.ROOT:
                                    continue
                                c = sentence.factors[sentence.EDGE_CHILDREN][e]
                                if c in state.node_mapping:
                                    c = state.node_mapping[c]
                                    if not args.no_anchors or not sentence.factors[Sentence.EDGE_VALUES][e][0] == Mapping.ANCHOR:
                                        target_a_parent[offset + n - ori_nodes[i]][c] = 1
                                        target_deprel_parents.append(start + n)
                                        target_deprel_children.append(start + c)
                                        target_deprel_values.append(sentence.factors[Sentence.EDGE_VALUES][e])
                                        state.add_edge(n, c, target_deprel_values[-1])
                    start += state.n_nodes
                if not target_indices_a or not target_deprel_parents:
                    continue
                target_indices_a = np.concatenate(target_indices_a, axis=0)
                target_indices_b = np.concatenate(target_indices_b, axis=0)
                target_a_parent = np.concatenate(target_a_parent, axis=0)
                target_a_child = np.concatenate(target_a_child, axis=0)
                target_deprel_parents = np.array(target_deprel_parents, np.int32)
                target_deprel_children = np.array(target_deprel_children, np.int32)
                target_deprel_values = np.array(target_deprel_values, np.int32)

                # 2.1) Compute arcs
                edge_parents = self.layers.decoder_edge_parents[iteration](nodes)
                edge_children = self.layers.decoder_edge_children[iteration](nodes)
                a_parent = tf.nn.tanh(self.layers.sum([tf.gather(edge_parents, target_indices_a),
                                                       tf.gather(edge_children, target_indices_b)]))
                if args.highway:
                    a_parent += self.layers.decoder_edge_highway[iteration](a_parent)
                a_parent = self.layers.decoder_edge_arc[iteration](a_parent)
                loss += self.loss_sce(target_a_parent, a_parent)
                self.accuracies["edge/arc"](target_a_parent, a_parent)
                a_child = tf.nn.tanh(self.layers.sum([tf.gather(edge_parents, target_indices_b),
                                                      tf.gather(edge_children, target_indices_a)]))
                if args.highway:
                    a_child += self.layers.decoder_edge_highway[iteration](a_child)
                a_child = self.layers.decoder_edge_arc[iteration](a_child)
                loss += self.loss_sce(target_a_child, a_child)
                self.accuracies["edge/arc"](target_a_child, a_child)

                # 2.2) Compute deprels
                deprel_parents = self.layers.decoder_deprel_parents[iteration](nodes)
                deprel_children = self.layers.decoder_deprel_children[iteration](nodes)
                deprel_weights = tf.nn.tanh(self.layers.sum(
                    [tf.gather(deprel_parents, target_deprel_parents),
                     tf.gather(deprel_children, target_deprel_children)]))
                if args.highway:
                    deprel_weights += self.layers.decoder_deprel_highway[iteration](deprel_weights)
                for i, prop in enumerate(train.edge_properties):
                    predictions = self.layers.decoder_deprel_values[iteration][i](deprel_weights)
                    loss += self.loss_sce(target_deprel_values[:, i], predictions)
                    self.accuracies["edge/" + prop](target_deprel_values[:, i], predictions)

                self.State.recompute_edges(ori_nodes, states, iteration)

            # Tops
            sum_nodes = sum(s.n_nodes for s in states)
            nodes = tf.concat([s.nodes for s in states], axis=0)
            target_tops = np.zeros([sum_nodes], np.int32)
            start = 0
            for i, sentence in enumerate(sentences):
                for e in sentence.children[0]:
                    if sentence.factors[Sentence.EDGE_VALUES][e][0] == Mapping.ROOT:
                        c = sentence.factors[Sentence.EDGE_CHILDREN][e]
                        if c in states[i].node_mapping:
                            target_tops[start + states[i].node_mapping[c]] = 1
                start += states[i].n_nodes
            predictions = self.layers.decoder_tops(nodes)
            loss += self.loss_sce(target_tops, predictions)
            self.accuracies["edge/tops"](target_tops, predictions)

        tg = tape.gradient(loss, self.layers.trainable_variables)
        tg_none = [variable.name for g, variable in zip(tg, self.layers.trainable_variables) if g is None]
        if tg_none:
            print("Skipping a batch with None gradient for variables {}".format(tg_none),
                  file=sys.stderr, flush=True)
            continue
        if num_gradients == 0:
            gradients = [g.numpy() if not isinstance(g, tf.IndexedSlices)
                         else [(g.values.numpy(), g.indices.numpy())] for g in tg]
        else:
            for g, ng in zip(gradients, tg):
                if isinstance(g, list):
                    g.append((ng.values.numpy(), ng.indices.numpy()))
                else:
                    g += ng.numpy()
        num_gradients += 1
        if num_gradients == args.batch_aggregation or len(train._permutation) == 0:
            gradients = [tf.IndexedSlices(*map(np.concatenate, zip(*g))) if isinstance(g, list) else g
                         for g in gradients]
            self.optimizer.apply_gradients(zip(gradients, self.layers.trainable_variables))
            num_gradients = 0
            if int(self.optimizer.iterations) % 100 == 0:
                tf.summary.experimental.set_step(self._summary_step())
                with self.writer.as_default():
                    for name, accuracy in self.accuracies.items():
                        tf.summary.scalar("train/" + name, accuracy.result())

    tf.summary.experimental.set_step(self._summary_step())
    with self.writer.as_default():
        tf.summary.scalar("train/skipped", skipped)
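# --- Hedged sketch (illustrative): the loop above accumulates gradients over several
# batches (summing dense gradients, concatenating IndexedSlices) before a single
# apply_gradients call. A minimal dense-only version of the same idea:
import tensorflow as tf

model = tf.keras.Sequential([tf.keras.layers.Dense(1)])
model.build([None, 3])
opt = tf.keras.optimizers.Adam(1e-3)
accum = [tf.zeros_like(v) for v in model.trainable_variables]
n_accum = 4

for micro_step in range(n_accum):
    x = tf.random.normal([8, 3])
    y = tf.random.normal([8, 1])
    with tf.GradientTape() as tape:
        loss = tf.reduce_mean(tf.square(model(x) - y))
    grads = tape.gradient(loss, model.trainable_variables)
    accum = [a + g for a, g in zip(accum, grads)]

# one optimizer step for n_accum micro-batches (optionally divide by n_accum to average)
opt.apply_gradients(zip(accum, model.trainable_variables))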
def train_one_step(train_batch_i, bvae_model, genmo_optimizer, infnet_optimizer,
                   prior_optimizer, theta_optimizer, encoder_grad_variable,
                   encoder_grad_sq_variable, grad_variable_dict, grad_sq_variable_dict):
    """Train Discrete VAE for 1 step."""
    metrics = {}
    input_batch = process_batch_input(train_batch_i)

    if FLAGS.grad_type == 'relax':
        with tf.GradientTape(persistent=True) as theta_tape:
            (genmo_grads, prior_grads, infnet_grads, genmo_loss) = estimate_gradients(
                input_batch, bvae_model, FLAGS.grad_type)

            # Update generative model
            genmo_vars = bvae_model.decoder_vars
            genmo_optimizer.apply_gradients(list(zip(genmo_grads, genmo_vars)))
            prior_vars = bvae_model.prior_vars
            prior_optimizer.apply_gradients(list(zip(prior_grads, prior_vars)))
            infnet_vars = bvae_model.encoder_vars
            infnet_optimizer.apply_gradients(list(zip(infnet_grads, infnet_vars)))

            infnet_grads_sq = [tf.square(grad_i) for grad_i in infnet_grads]

        theta_vars = []
        if bvae_model.control_nn:
            theta_vars.extend(bvae_model.control_nn.trainable_variables)
        if FLAGS.temperature is None:
            theta_vars.append(bvae_model.log_temperature_variable)
        if FLAGS.scaling_factor is None:
            theta_vars.append(bvae_model.scaling_variable)
        theta_grads = theta_tape.gradient(infnet_grads_sq, theta_vars)
        theta_optimizer.apply_gradients(zip(theta_grads, theta_vars))
        del theta_tape

        metrics['learning_signal'] = bvae_model.mean_learning_signal
    else:
        (genmo_grads, prior_grads, infnet_grads, genmo_loss) = estimate_gradients(
            input_batch, bvae_model, FLAGS.grad_type)

        genmo_vars = bvae_model.decoder_vars
        genmo_optimizer.apply_gradients(list(zip(genmo_grads, genmo_vars)))
        prior_vars = bvae_model.prior_vars
        prior_optimizer.apply_gradients(list(zip(prior_grads, prior_vars)))
        infnet_vars = bvae_model.encoder_vars
        infnet_optimizer.apply_gradients(list(zip(infnet_grads, infnet_vars)))

    batch_size_sq = tf.cast(FLAGS.batch_size * FLAGS.batch_size, tf.float32)
    encoder_grad_var = bvae_model.compute_grad_variance(
        encoder_grad_variable, encoder_grad_sq_variable, infnet_grads) / batch_size_sq

    if grad_variable_dict is not None:
        variance_dict = dict()
        for k in grad_variable_dict.keys():
            encoder_grads = estimate_gradients(input_batch, bvae_model, gradient_type=k)[2]
            variance_dict['var/' + k] = bvae_model.compute_grad_variance(
                grad_variable_dict[k], grad_sq_variable_dict[k], encoder_grads) / batch_size_sq
    else:
        variance_dict = None

    return (encoder_grad_var, variance_dict, genmo_loss, metrics)
def fit_regression(network, hidden_bayes=False, same_noise=False, max_std=0.5,
                   data="ian", save=False):
    # load data
    if data not in ALLOWED_DATA_CONFIGS:
        raise AssertionError(
            f"'data' has to be in {ALLOWED_DATA_CONFIGS} but was set to {data}.")
    elif data == TOY_DATA:
        data = np.load("data/train_data_regression.npz")
        x_train = data["x_train"]
        y_train = data["y_train"]
        x_lim, y_lim = 4.5, 70.0
        reg = 10.0  # regularization parameter lambda
    elif data == IAN_DATA:
        data = np.load("data/train_data_ian_regression.npz", allow_pickle=True)
        x_train = data["x_train"]
        y_train = data["y_train"]
        x_lim, y_lim = 12.0, 8.0
        reg = 30  # regularization parameter lambda
    elif data == SAMPLE_DATA:
        n_samples = 20
        toy_regression = ToyRegressionData()
        x_train, y_train = toy_regression.gen_data(n_samples)
        x_lim, y_lim = 4.5, 70.0
        reg = 10.0  # regularization parameter lambda

    # choose network
    if network not in ALLOWED_NETWORK_CONFIGS:
        raise AssertionError(
            f"'network' has to be in {ALLOWED_NETWORK_CONFIGS} but was set to {network}.")
    elif network == MNF:
        model = BNN_MNF(hidden_bayes=hidden_bayes, max_std=max_std)
        bayes = True
    elif network == BAYES_BY_BACKPROP:
        model = BNN_BBB(hidden_bayes=hidden_bayes, max_std=max_std)
        bayes = True
    elif network == DENSE:
        model = MLP()
        bayes = False

    epochs = 500
    learning_rate_fn = tf.keras.optimizers.schedules.PolynomialDecay(
        1e-2, epochs, 1e-6, power=0.5)
    opt = tf.keras.optimizers.Adam(learning_rate=learning_rate_fn)

    # initialize
    _, _ = loss_fn(y_train, x_train, model, bayes, reg, same_noise)

    train_losses = []
    kl_losses = []
    for i in range(epochs):
        with tf.GradientTape() as tape:
            tape.watch(model.trainable_variables)
            loss, kl_loss = loss_fn(y_train, x_train, model, bayes, reg, same_noise)
        gradients = tape.gradient(loss, model.trainable_variables)
        opt.apply_gradients(zip(gradients, model.trainable_variables))
        if same_noise:
            model.reset_noise()  # sample new epsilons
        train_losses.append(loss)
        kl_losses.append(kl_loss)
        if i % 10 == 0:
            print(f"Epoch: {i}, MSE: {loss}, KL-loss: {kl_loss}")

    plt.plot(range(epochs), train_losses)
    plt.plot(range(epochs), kl_losses)
    plt.legend(["Train loss", "KL loss"])

    n_test = 500
    x_test = np.linspace(-x_lim, x_lim, n_test).reshape(n_test, 1).astype('float32')
    if bayes:
        y_preds = []
        for _ in range(20):
            y_pred = model(x_test)
            y_preds.append(y_pred)
        plt.figure(figsize=(10, 4))
        y_preds = np.array(y_preds).reshape(20, n_test)
        y_preds_mean = np.mean(y_preds, axis=0)
        y_preds_std = np.std(y_preds, axis=0)
        plt.scatter(x_train, y_train, c="orangered")
        color_pred = (0.0, 101.0 / 255.0, 189.0 / 255.0)
        plt.plot(x_test, y_preds_mean, color=color_pred)
        plt.fill_between(x_test.reshape(n_test, ),
                         y_preds_mean - y_preds_std,
                         y_preds_mean + y_preds_std,
                         alpha=0.25, color=color_pred)
        plt.fill_between(x_test.reshape(n_test, ),
                         y_preds_mean - 2.0 * y_preds_std,
                         y_preds_mean + 2.0 * y_preds_std,
                         alpha=0.35, color=color_pred)
        plt.xlim(-x_lim, x_lim)
        plt.ylim(-y_lim, y_lim)
        plt.legend(["Mean function", "Observations"])
    else:
        plt.figure(figsize=(10, 4))
        y_pred = model(x_test)
        plt.scatter(x_train, y_train, c="orangered")
        color_pred = (0.0, 101.0 / 255.0, 189.0 / 255.0)
        plt.plot(x_test, y_pred, color=color_pred)
        plt.xlim(-x_lim, x_lim)
        plt.ylim(-y_lim, y_lim)
        plt.legend(["Mean function", "Observations"])
    plt.tight_layout()
    if save:
        plt.savefig(f"plots/{network}.pdf")
    else:
        plt.show()
def train_step(graph, t_nodes, t_edges):
    with tf.GradientTape() as tape:
        out_gs = model(graph)
        loss = tf.reduce_mean(loss_function(out_gs, t_nodes, t_edges))
    grads = tape.gradient(loss, model.trainable_variables)
    return loss, grads, out_gs
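# --- Hedged sketch (illustrative, self-contained variant of the pattern above):
# returning (loss, grads) from the step function and applying the gradients at the
# call site. The toy model and data are stand-ins, not the graph network from the
# original snippet.
import tensorflow as tf

model = tf.keras.Sequential([tf.keras.layers.Dense(2)])
model.build([None, 4])
optimizer = tf.keras.optimizers.Adam(1e-3)

def step(x, y):
    with tf.GradientTape() as tape:
        out = model(x)
        loss = tf.reduce_mean(tf.square(out - y))
    return loss, tape.gradient(loss, model.trainable_variables), out

for _ in range(3):
    loss, grads, _ = step(tf.random.normal([8, 4]), tf.random.normal([8, 2]))
    optimizer.apply_gradients(zip(grads, model.trainable_variables))
    print(float(loss))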
@tf.function(autograph=False)
def get_hessian1():
    var = var1
    # var = var[indices]
    # var = [v.value() for v in var]
    with tf.GradientTape(persistent=True, watch_accessed_variables=False) as tape:
        tape.watch(var)
        preds = model(var)
        grads = tape.gradient(preds, var)
        grads = grads[indices]
        hessians = tape.jacobian(grads, var, experimental_use_pfor=True)
    hessians = hessians[:, indices]
    return grads, hessians


with tf.GradientTape(watch_accessed_variables=False) as tape:
    y = var1.sparse_read(5) * 5.
grad = tape.gradient(y, var1.sparse_read(5))
print(grad)


class MyVar(tf.Variable):
    pass


vars2 = [MyVar(val, dtype=tf.float64, validate_shape=False)
         for val in np.linspace(0, 10, nparams)]


def assign2(values, variables):
    for i, var in enumerate(variables):
        var.assign(values[i], use_locking=False, read_value=False)
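# --- Hedged sketch (illustrative): the standard nested-tape recipe for a Hessian,
# equivalent in spirit to the persistent-tape version above: record the gradient on an
# inner tape, then take its Jacobian on the outer one.
import tensorflow as tf

x = tf.Variable([1.0, 2.0])
with tf.GradientTape() as outer:
    with tf.GradientTape() as inner:
        y = tf.reduce_sum(x ** 3)
    g = inner.gradient(y, x)        # dy/dx = 3x^2
hessian = outer.jacobian(g, x)      # d2y/dx2 = diag(6x)
print(hessian.numpy())              # [[6. 0.], [0. 12.]]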
def train_step(inputs):
    with tf.GradientTape() as gen_tape, tf.GradientTape() as dis_tape:
        outputs = model(inputs)
        generation_A = outputs[0]
        generation_B = outputs[1]
        cycle_A = outputs[2]
        cycle_B = outputs[3]
        identity_A = outputs[4]
        identity_B = outputs[5]
        discrimination_A_real = outputs[6]
        discrimination_A_fake = outputs[7]
        discrimination_B_real = outputs[8]
        discrimination_B_fake = outputs[9]
        discrimination_A_dot_real = outputs[10]
        discrimination_A_dot_fake = outputs[11]
        discrimination_B_dot_real = outputs[12]
        discrimination_B_dot_fake = outputs[13]

        # Cycle loss.
        cycle_loss = l1_loss(inputs[0], cycle_A) + l1_loss(inputs[1], cycle_B)

        # Identity loss.
        identity_loss = l1_loss(inputs[0], identity_A) + l1_loss(inputs[1], identity_B)

        # Generator loss.
        generator_loss_A2B = l2_loss(tf.ones_like(discrimination_B_fake), discrimination_B_fake)
        generator_loss_B2A = l2_loss(tf.ones_like(discrimination_A_fake), discrimination_A_fake)
        two_step_generator_loss_A = l2_loss(tf.ones_like(discrimination_A_dot_fake),
                                            discrimination_A_dot_fake)
        two_step_generator_loss_B = l2_loss(tf.ones_like(discrimination_B_dot_fake),
                                            discrimination_B_dot_fake)
        generator_loss = (generator_loss_A2B + generator_loss_B2A +
                          two_step_generator_loss_A + two_step_generator_loss_B +
                          hp.lambda_cycle * cycle_loss + hp.lambda_identity * identity_loss)

        # Discriminator loss.
        discriminator_loss_A_real = l2_loss(tf.ones_like(discrimination_A_real), discrimination_A_real)
        discriminator_loss_A_fake = l2_loss(tf.zeros_like(discrimination_A_fake), discrimination_A_fake)
        discriminator_loss_A = (discriminator_loss_A_real + discriminator_loss_A_fake) / 2

        discriminator_loss_B_real = l2_loss(tf.ones_like(discrimination_B_real), discrimination_B_real)
        discriminator_loss_B_fake = l2_loss(tf.zeros_like(discrimination_B_fake), discrimination_B_fake)
        discriminator_loss_B = (discriminator_loss_B_real + discriminator_loss_B_fake) / 2

        discriminator_loss_A_dot_real = l2_loss(tf.ones_like(discrimination_A_dot_real),
                                                discrimination_A_dot_real)
        discriminator_loss_A_dot_fake = l2_loss(tf.zeros_like(discrimination_A_dot_fake),
                                                discrimination_A_dot_fake)
        discriminator_loss_A_dot = (discriminator_loss_A_dot_real + discriminator_loss_A_dot_fake) / 2

        discriminator_loss_B_dot_real = l2_loss(tf.ones_like(discrimination_B_dot_real),
                                                discrimination_B_dot_real)
        discriminator_loss_B_dot_fake = l2_loss(tf.zeros_like(discrimination_B_dot_fake),
                                                discrimination_B_dot_fake)
        discriminator_loss_B_dot = (discriminator_loss_B_dot_real + discriminator_loss_B_dot_fake) / 2

        discriminator_loss = (discriminator_loss_A + discriminator_loss_B +
                              discriminator_loss_A_dot + discriminator_loss_B_dot)

    generator_vars = (model.generatorA2B.trainable_variables +
                      model.generatorB2A.trainable_variables)
    discriminator_vars = (model.discriminator_A.trainable_variables +
                          model.discriminator_B.trainable_variables +
                          model.discriminator_A_dot.trainable_variables +
                          model.discriminator_B_dot.trainable_variables)

    grad_gen = gen_tape.gradient(generator_loss, sources=generator_vars)
    grad_dis = dis_tape.gradient(discriminator_loss, sources=discriminator_vars)

    generator_optimizer.apply_gradients(zip(grad_gen, generator_vars))
    discriminator_optimizer.apply_gradients(zip(grad_dis, discriminator_vars))

    gen_loss(generator_loss)
    disc_loss(discriminator_loss)
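# --- Hedged sketch (assumption): l1_loss and l2_loss are not defined in the snippet
# above; for an LSGAN-style CycleGAN they are typically plain mean absolute / squared
# errors, e.g.:
import tensorflow as tf

def l1_loss(y, y_hat):
    # mean absolute error, used for the cycle and identity terms
    return tf.reduce_mean(tf.abs(y - y_hat))

def l2_loss(y, y_hat):
    # mean squared error, used for the least-squares GAN terms
    return tf.reduce_mean(tf.square(y - y_hat))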
def main():
    # [b, 32, 32, 3] => [b, 1, 1, 512]
    conv_net = Sequential(conv_layers)  # part 1: the convolutional layers

    fc_net = Sequential([
        layers.Dense(256, activation=tf.nn.relu),  # part 2: the fully connected layers
        layers.Dense(128, activation=tf.nn.relu),
        layers.Dense(100, activation=None),
    ])

    conv_net.build(input_shape=[None, 32, 32, 3])
    fc_net.build(input_shape=[None, 512])  # the input of part 2 is the output of part 1
    optimizer = optimizers.Adam(lr=1e-4)

    # [1, 2] + [3, 4] => [1, 2, 3, 4]
    variables = conv_net.trainable_variables + fc_net.trainable_variables  # the parameters to differentiate

    for epoch in range(50):
        for step, (x, y) in enumerate(train_db):
            with tf.GradientTape() as tape:
                # [b, 32, 32, 3] => [b, 1, 1, 512]
                out = conv_net(x)
                # flatten, => [b, 512]
                out = tf.reshape(out, [-1, 512])
                # [b, 512] => [b, 100]
                logits = fc_net(out)
                # [b] => [b, 100]
                y_onehot = tf.one_hot(y, depth=100)
                # compute loss
                loss = tf.losses.categorical_crossentropy(y_onehot, logits, from_logits=True)
                loss = tf.reduce_mean(loss)

            grads = tape.gradient(loss, variables)
            optimizer.apply_gradients(zip(grads, variables))

            if step % 100 == 0:
                print(epoch, step, 'loss:', float(loss))

        total_num = 0
        total_correct = 0
        for x, y in test_db:
            out = conv_net(x)
            out = tf.reshape(out, [-1, 512])
            logits = fc_net(out)
            prob = tf.nn.softmax(logits, axis=1)
            pred = tf.argmax(prob, axis=1)
            pred = tf.cast(pred, dtype=tf.int32)
            correct = tf.cast(tf.equal(pred, y), dtype=tf.int32)
            correct = tf.reduce_sum(correct)
            total_num += x.shape[0]
            total_correct += int(correct)

        acc = total_correct / total_num
        print(epoch, 'acc:', acc)
def main(_argv):
    if FLAGS.tiny:
        model = YoloV3Tiny(FLAGS.size, training=True)
        anchors = yolo_tiny_anchors
        anchor_masks = yolo_tiny_anchor_masks
    else:
        model = YoloV3(FLAGS.size, training=True)
        anchors = yolo_anchors
        anchor_masks = yolo_anchor_masks

    train_dataset = dataset.load_fake_dataset()
    if FLAGS.dataset:
        train_dataset = dataset.load_tfrecord_dataset(FLAGS.dataset, FLAGS.classes)
    train_dataset = train_dataset.shuffle(buffer_size=1024)  # TODO: not 1024
    train_dataset = train_dataset.batch(FLAGS.batch_size)
    train_dataset = train_dataset.map(
        lambda x, y: (dataset.transform_images(x, FLAGS.size),
                      dataset.transform_targets(y, anchors, anchor_masks, 80)))
    train_dataset = train_dataset.prefetch(buffer_size=tf.data.experimental.AUTOTUNE)

    val_dataset = dataset.load_fake_dataset()
    if FLAGS.val_dataset:
        val_dataset = dataset.load_tfrecord_dataset(FLAGS.val_dataset, FLAGS.classes)
    val_dataset = val_dataset.batch(FLAGS.batch_size)
    val_dataset = val_dataset.map(
        lambda x, y: (dataset.transform_images(x, FLAGS.size),
                      dataset.transform_targets(y, anchors, anchor_masks, 80)))

    if FLAGS.transfer != 'none':
        model.load_weights(FLAGS.weights)
        if FLAGS.transfer == 'fine_tune':
            # freeze darknet
            darknet = model.get_layer('yolo_darknet')
            freeze_all(darknet)
        elif FLAGS.transfer == 'frozen':
            # freeze everything
            freeze_all(model)
        else:
            # reset top layers
            if FLAGS.tiny:
                # get initial weights
                init_model = YoloV3Tiny(FLAGS.size, training=True)
            else:
                init_model = YoloV3(FLAGS.size, training=True)

            if FLAGS.transfer == 'darknet':
                for l in model.layers:
                    if l.name != 'yolo_darknet' and l.name.startswith('yolo_'):
                        l.set_weights(init_model.get_layer(l.name).get_weights())
                    else:
                        freeze_all(l)
            elif FLAGS.transfer == 'no_output':
                for l in model.layers:
                    if l.name.startswith('yolo_output'):
                        l.set_weights(init_model.get_layer(l.name).get_weights())
                    else:
                        freeze_all(l)

    optimizer = tf.keras.optimizers.Adam(lr=FLAGS.learning_rate)
    loss = [YoloLoss(anchors[mask]) for mask in anchor_masks]

    if FLAGS.mode == 'eager_tf':
        # Eager mode is great for debugging
        # Non eager graph mode is recommended for real training
        avg_loss = tf.keras.metrics.Mean('loss', dtype=tf.float32)
        avg_val_loss = tf.keras.metrics.Mean('val_loss', dtype=tf.float32)

        for epoch in range(1, FLAGS.epochs + 1):
            for batch, (images, labels) in enumerate(train_dataset):
                with tf.GradientTape() as tape:
                    outputs = model(images, training=True)
                    regularization_loss = tf.reduce_sum(model.losses)
                    pred_loss = []
                    for output, label, loss_fn in zip(outputs, labels, loss):
                        pred_loss.append(loss_fn(label, output))
                    total_loss = tf.reduce_sum(pred_loss) + regularization_loss

                grads = tape.gradient(total_loss, model.trainable_variables)
                optimizer.apply_gradients(zip(grads, model.trainable_variables))

                logging.info("{}_train_{}, {}, {}".format(
                    epoch, batch, total_loss.numpy(),
                    list(map(lambda x: np.sum(x.numpy()), pred_loss))))
                avg_loss.update_state(total_loss)

            for batch, (images, labels) in enumerate(val_dataset):
                outputs = model(images)
                regularization_loss = tf.reduce_sum(model.losses)
                pred_loss = []
                for output, label, loss_fn in zip(outputs, labels, loss):
                    pred_loss.append(loss_fn(label, output))
                total_loss = tf.reduce_sum(pred_loss) + regularization_loss

                logging.info("{}_val_{}, {}, {}".format(
                    epoch, batch, total_loss.numpy(),
                    list(map(lambda x: np.sum(x.numpy()), pred_loss))))
                avg_val_loss.update_state(total_loss)

            logging.info("{}, train: {}, val: {}".format(
                epoch, avg_loss.result().numpy(), avg_val_loss.result().numpy()))

            avg_loss.reset_states()
            avg_val_loss.reset_states()
            model.save_weights('checkpoints/yolov3_train_{}.tf'.format(epoch))
    else:
        model.compile(optimizer=optimizer, loss=loss,
                      run_eagerly=(FLAGS.mode == 'eager_fit'))

        callbacks = [
            ReduceLROnPlateau(verbose=1),
            EarlyStopping(patience=3, verbose=1),
            ModelCheckpoint('checkpoints/yolov3_train_{epoch}.tf',
                            verbose=1, save_weights_only=True),
            TensorBoard(log_dir='logs')
        ]

        history = model.fit(train_dataset,
                            epochs=FLAGS.epochs,
                            callbacks=callbacks,
                            validation_data=val_dataset)
def main(_argv):
    physical_devices = tf.config.experimental.list_physical_devices('GPU')
    for physical_device in physical_devices:
        tf.config.experimental.set_memory_growth(physical_device, True)

    if FLAGS.tiny:
        model = YoloV3Tiny(FLAGS.size, training=True, classes=FLAGS.num_classes)
        anchors = yolo_tiny_anchors
        anchor_masks = yolo_tiny_anchor_masks
    else:
        model = YoloV3(FLAGS.size, training=True, classes=FLAGS.num_classes)
        anchors = yolo_anchors
        anchor_masks = yolo_anchor_masks

    if FLAGS.dataset:
        train_dataset = dataset.load_tfrecord_dataset(FLAGS.dataset, FLAGS.classes, FLAGS.size)
    else:
        train_dataset = dataset.load_fake_dataset()
    train_dataset = train_dataset.shuffle(buffer_size=512)
    train_dataset = train_dataset.batch(FLAGS.batch_size)
    train_dataset = train_dataset.map(lambda x, y: (
        dataset.transform_images(x, FLAGS.size),
        dataset.transform_targets(y, anchors, anchor_masks, FLAGS.size)))
    train_dataset = train_dataset.prefetch(buffer_size=tf.data.experimental.AUTOTUNE)

    if FLAGS.val_dataset:
        val_dataset = dataset.load_tfrecord_dataset(FLAGS.val_dataset, FLAGS.classes, FLAGS.size)
    else:
        val_dataset = dataset.load_fake_dataset()
    val_dataset = val_dataset.batch(FLAGS.batch_size)
    val_dataset = val_dataset.map(lambda x, y: (
        dataset.transform_images(x, FLAGS.size),
        dataset.transform_targets(y, anchors, anchor_masks, FLAGS.size)))

    # Configure the model for transfer learning
    if FLAGS.transfer == 'none':
        pass  # Nothing to do
    elif FLAGS.transfer in ['darknet', 'no_output']:
        # Darknet transfer is a special case that works
        # with incompatible number of classes
        # reset top layers
        if FLAGS.tiny:
            model_pretrained = YoloV3Tiny(FLAGS.size, training=True,
                                          classes=FLAGS.weights_num_classes or FLAGS.num_classes)
        else:
            model_pretrained = YoloV3(FLAGS.size, training=True,
                                      classes=FLAGS.weights_num_classes or FLAGS.num_classes)
        model_pretrained.load_weights(FLAGS.weights)

        if FLAGS.transfer == 'darknet':
            model.get_layer('yolo_darknet').set_weights(
                model_pretrained.get_layer('yolo_darknet').get_weights())
            freeze_all(model.get_layer('yolo_darknet'))
        elif FLAGS.transfer == 'no_output':
            for l in model.layers:
                if not l.name.startswith('yolo_output'):
                    l.set_weights(model_pretrained.get_layer(l.name).get_weights())
                    freeze_all(l)
    else:
        # All other transfer require matching classes
        model.load_weights(FLAGS.weights)
        if FLAGS.transfer == 'fine_tune':
            # freeze darknet and fine tune other layers
            darknet = model.get_layer('yolo_darknet')
            freeze_all(darknet)
        elif FLAGS.transfer == 'frozen':
            # freeze everything
            freeze_all(model)

    optimizer = tf.keras.optimizers.Adam(lr=FLAGS.learning_rate)
    loss = [
        YoloLoss(anchors[mask], classes=FLAGS.num_classes,
                 ignore_thresh=FLAGS.ignore_threshold)
        for mask in anchor_masks
    ]

    if FLAGS.mode == 'eager_tf':
        # Eager mode is great for debugging
        # Non eager graph mode is recommended for real training
        avg_loss = tf.keras.metrics.Mean('loss', dtype=tf.float32)
        avg_val_loss = tf.keras.metrics.Mean('val_loss', dtype=tf.float32)

        for epoch in range(1, FLAGS.epochs + 1):
            for batch, (images, labels) in enumerate(train_dataset):
                with tf.GradientTape() as tape:
                    outputs = model(images, training=True)
                    regularization_loss = tf.reduce_sum(model.losses)
                    pred_loss = []
                    for output, label, loss_fn in zip(outputs, labels, loss):
                        pred_loss.append(loss_fn(label, output))
                    total_loss = tf.reduce_sum(pred_loss) + regularization_loss

                grads = tape.gradient(total_loss, model.trainable_variables)
                optimizer.apply_gradients(zip(grads, model.trainable_variables))

                logging.info("{}_train_{}, {}, {}".format(
                    epoch, batch, total_loss.numpy(),
                    list(map(lambda x: np.sum(x.numpy()), pred_loss))))
                avg_loss.update_state(total_loss)

            for batch, (images, labels) in enumerate(val_dataset):
                outputs = model(images)
                regularization_loss = tf.reduce_sum(model.losses)
                pred_loss = []
                for output, label, loss_fn in zip(outputs, labels, loss):
                    pred_loss.append(loss_fn(label, output))
                total_loss = tf.reduce_sum(pred_loss) + regularization_loss

                logging.info("{}_val_{}, {}, {}".format(
                    epoch, batch, total_loss.numpy(),
                    list(map(lambda x: np.sum(x.numpy()), pred_loss))))
                avg_val_loss.update_state(total_loss)

            logging.info("{}, train: {}, val: {}".format(
                epoch, avg_loss.result().numpy(), avg_val_loss.result().numpy()))

            avg_loss.reset_states()
            avg_val_loss.reset_states()
            model.save_weights('checkpoints/yolov3_train_{}.tf'.format(epoch))
    else:
        model.compile(optimizer=optimizer, loss=loss,
                      run_eagerly=(FLAGS.mode == 'eager_fit'))

        callbacks = [
            ReduceLROnPlateau(verbose=1),
            EarlyStopping(patience=FLAGS.patience, verbose=1),
            ModelCheckpoint(os.path.join(FLAGS.output_path, 'yolov3_train_best.tf'),
                            monitor='val_loss', save_best_only=True,
                            verbose=1, save_weights_only=True),
            TensorBoard(log_dir='logs')
        ]

        history = model.fit(train_dataset,
                            epochs=FLAGS.epochs,
                            callbacks=callbacks,
                            validation_data=val_dataset)
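# --- Hedged sketch (assumption): freeze_all is used but not defined in the two YOLOv3
# training scripts above; a common implementation recursively toggles `trainable` on a
# layer or model, roughly like this:
import tensorflow as tf

def freeze_all(model, frozen=True):
    model.trainable = not frozen
    if isinstance(model, tf.keras.Model):
        for l in model.layers:
            freeze_all(l, frozen)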
def InferenceSampler_test():
    vocab_size = 6
    SOS_token = 0
    EOS_token = 5

    # x_data = np.array([[SOS_token, 3, 1, 4, 3, 2],[SOS_token, 3, 4, 2, 3, 1],[SOS_token, 1, 3, 2, 2, 1]], dtype=np.int32)
    # y_data = np.array([[3, 1, 4, 3, 2,EOS_token],[3, 4, 2, 3, 1,EOS_token],[1, 3, 2, 2, 1,EOS_token]],dtype=np.int32)
    # print("data shape: ", x_data.shape)

    index_to_char = {SOS_token: '<S>', 1: 'h', 2: 'e', 3: 'l', 4: 'o', EOS_token: '<E>'}
    x_data = np.array([[SOS_token, 1, 2, 3, 3, 4]], dtype=np.int32)
    y_data = np.array([[1, 2, 3, 3, 4, EOS_token]], dtype=np.int32)

    output_dim = vocab_size
    batch_size = len(x_data)
    hidden_dim = 7
    seq_length = x_data.shape[1]
    embedding_dim = 8

    init = np.arange(vocab_size * embedding_dim).reshape(vocab_size, -1)
    embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim,
                                          embeddings_initializer=Constant(init),
                                          trainable=True)
    # embedding.weights, embedding.trainable_variables, embedding.trainable_weights all give the same result.

    target = tf.convert_to_tensor(y_data)

    # Decoder
    # single-layer RNN
    decoder_cell = tf.keras.layers.LSTMCell(hidden_dim)

    # decoder init state:
    # init_state = [tf.zeros((batch_size, hidden_dim)), tf.ones((batch_size, hidden_dim))]  # (h, c)
    init_state = decoder_cell.get_initial_state(inputs=None, batch_size=batch_size, dtype=tf.float32)

    projection_layer = tf.keras.layers.Dense(output_dim)

    sampler = tfa.seq2seq.sampler.TrainingSampler()  # alias ---> sampler = tfa.seq2seq.TrainingSampler()
    decoder = tfa.seq2seq.BasicDecoder(decoder_cell, sampler, output_layer=projection_layer)
    optimizer = tf.keras.optimizers.Adam(lr=0.01)

    for step in range(500):
        with tf.GradientTape() as tape:
            inputs = embedding(x_data)
            if isinstance(sampler, tfa.seq2seq.sampler.ScheduledEmbeddingTrainingSampler):
                outputs, last_state, last_sequence_lengths = decoder(
                    inputs, initial_state=init_state,
                    sequence_length=[seq_length] * batch_size,
                    training=True, embedding=embedding.weights)
            else:
                outputs, last_state, last_sequence_lengths = decoder(
                    inputs, initial_state=init_state,
                    sequence_length=[seq_length] * batch_size, training=True)

            logits = outputs.rnn_output
            weights = tf.ones(shape=[batch_size, seq_length])
            loss = tfa.seq2seq.sequence_loss(logits, target, weights)

        # must be refreshed on every step
        trainable_variables = embedding.trainable_variables + decoder.trainable_variables
        grads = tape.gradient(loss, trainable_variables)
        optimizer.apply_gradients(zip(grads, trainable_variables))

        if step % 10 == 0:
            print(step, loss.numpy())

    sample_batch_size = 5

    # Let's try the InferenceSampler.
    # This implements a GreedyEmbeddingSampler.
    sampler = tfa.seq2seq.InferenceSampler(
        sample_fn=lambda outputs: tf.argmax(outputs, axis=-1, output_type=tf.int32),
        sample_shape=[],
        sample_dtype=tf.int32,
        end_fn=lambda sample_ids: tf.equal(sample_ids, EOS_token),
        next_inputs_fn=lambda ids: tf.nn.embedding_lookup(embedding.weights, ids))

    decoder = tfa.seq2seq.BasicDecoder(decoder_cell, sampler, output_layer=projection_layer,
                                       maximum_iterations=seq_length)

    init_state = decoder_cell.get_initial_state(inputs=None, batch_size=sample_batch_size,
                                                dtype=tf.float32)

    # The start inputs must already be embedded.
    start_inputs = tf.nn.embedding_lookup(embedding.weights,
                                          tf.tile([SOS_token], [sample_batch_size]))

    outputs, last_state, last_sequence_lengths = decoder(
        start_inputs, initial_state=init_state, training=False)

    result = tf.argmax(outputs.rnn_output, axis=-1).numpy()
    print(result)
    for i in range(sample_batch_size):
        print(''.join(index_to_char[a] for a in result[i] if a != EOS_token))
from mpl_toolkits.mplot3d import Axes3D

def getZ(x, y):
    return (2 * x**2 + 3 * y + 3) + (y**2 - 9 * x + 6)

xrng = np.arange(-3, 3, .1)
yrng = np.arange(-4, 4, .1)

# find the coordinates of the surface's minimum by gradient descent
x = tf.Variable(tf.random.truncated_normal([1]))
y = tf.Variable(tf.random.truncated_normal([1]))
rate = 0.1
epoches = 1000
for epoch in range(epoches):
    with tf.GradientTape(persistent=True) as tape:
        loss = getZ(x, y)
    grads = tape.gradient(loss, [x, y])
    print('epoch={0},x={1},y={2},loss={3}'.format(epoch, x.numpy(), y.numpy(), loss.numpy()))
    x.assign_sub(rate * grads[0])
    y.assign_sub(rate * grads[1])

# build the mesh grid
X, Y = np.meshgrid(xrng, yrng)
Z = getZ(X, Y)
# print(X.shape, Y.shape, Z.shape)
# print(X, Y, Z)
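# --- Hedged sketch (illustrative): the manual assign_sub updates above can be replaced
# by a Keras optimizer; SGD with the same rate performs the identical update rule.
import tensorflow as tf

def getZ(x, y):
    return (2 * x**2 + 3 * y + 3) + (y**2 - 9 * x + 6)

x = tf.Variable(tf.random.truncated_normal([1]))
y = tf.Variable(tf.random.truncated_normal([1]))
opt = tf.keras.optimizers.SGD(learning_rate=0.1)
for _ in range(1000):
    with tf.GradientTape() as tape:
        loss = getZ(x, y)
    opt.apply_gradients(zip(tape.gradient(loss, [x, y]), [x, y]))
print(x.numpy(), y.numpy())  # should approach the analytic minimum (9/4, -3/2)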
# auto gradient
import tensorflow as tf

# create 4 tensors
a = tf.constant(1.)
b = tf.constant(2.)
c = tf.constant(3.)
w = tf.constant(4.)

with tf.GradientTape() as tape:  # create gradient environment
    tape.watch([w])  # add w to gradient trace list
    # compute process
    y = a * w**2 + b * w + c

# calculate derivative
[dy_dw] = tape.gradient(y, [w])
print(dy_dw)  # print derivative
def train_txt_gen_rnn(train_dat, valid_dat, vocab, embed_dim, units, batch_size,
                      seq_len, learn_rt, n_epochs, early_stop_epochs, save_loc):
    """train rnn to predict next words in the sequence"""
    start_tm = time.time()

    # Model Specification
    rnn = rnn_spec(dict_len=len(vocab), embed_dim=embed_dim, num_units=units)
    optimizer = tf.train.AdamOptimizer(learning_rate=learn_rt)
    rnn.build(tf.TensorShape([batch_size, seq_len]))
    valid_x, valid_y = next(iter(valid_dat))

    # Early Stopping Placeholders
    best_val_loss = 999999
    epoch_ph = []
    epoch_tm_ph = [start_tm]
    trn_loss_ph = []
    val_loss_ph = []
    break_ph = []

    # Iterative Training
    for epoch in range(n_epochs):
        # Train
        for (batch, (inp, target)) in enumerate(train_dat):
            with tf.GradientTape() as tape:
                train_predictions = rnn(inp)
                train_loss = loss_function(target, train_predictions)
            grads = tape.gradient(train_loss, rnn.variables)
            optimizer.apply_gradients(zip(grads, rnn.variables))

        # Validation
        for (batch, (inp, target)) in enumerate(valid_dat):
            with tf.GradientTape() as tape:
                valid_predictions = rnn(valid_x)
                valid_loss = loss_function(valid_y, valid_predictions)

        # Record Epoch Results
        epoch_ph.append(epoch + 1)
        trn_loss_ph.append(train_loss)
        val_loss_ph.append(valid_loss)
        epoch_sec_elapsed = str(int((np.float64(time.time()) - np.float64(epoch_tm_ph[-1]))))
        pr_str1 = str('Ep. {} Loss: Train {:.4f} Val {:.4f}'.format(epoch + 1, train_loss, valid_loss))
        print(pr_str1 + ' ' + epoch_sec_elapsed + ' sec.')
        epoch_tm_ph.append(time.time())

        # Early Stopping
        best_val_loss = min(val_loss_ph)
        if (valid_loss > best_val_loss):
            break_ph.append(1)
        else:
            break_ph = []
            # Model Saving
            checkpoint_prefix = os.path.join(save_loc, "ckpt")
            checkpoint = tf.train.Checkpoint(optimizer=optimizer, model=rnn)
            checkpoint.save(file_prefix=checkpoint_prefix)
        if sum(break_ph) >= early_stop_epochs:
            print("Stopping after " + str(int(epoch + 1)) + " epochs.")
            print("Validation cross entropy hasn't improved in " +
                  str(int(early_stop_epochs)) + " rounds.")
            break

    # Output Training Progress
    output_df = pd.DataFrame({
        'Epoch': epoch_ph,
        'Train Loss': trn_loss_ph,
        'Validation Loss': val_loss_ph
    })
    end_tm = time.time()
    sec_elapsed = (np.float64(end_tm) - np.float64(start_tm))
    print('Execution Time: ' + seconds_to_time(sec_elapsed))
    return output_df
def minimize(self, method='adam', coordinates=None, max_iter=10000, **kwargs):
    """ Minimize the energy. """
    max_iter = tf.constant(max_iter, dtype=tf.int64)

    if coordinates is None:
        coordinates = self.coordinates

    if method == 'adam':
        # put coordinates into a variable
        coordinates = tf.Variable(coordinates)

        # keep a history
        recent_ten = tf.zeros((10, ), dtype=tf.float32)

        # get the Adam optimizer
        optimizer = tf.keras.optimizers.Adam(1000)

        # init
        iter_idx = tf.constant(0, dtype=tf.int64)

        while tf.less(iter_idx, max_iter):
            with tf.GradientTape() as tape:
                energy = self.energy(coordinates)
            print(energy)

            recent_ten = tf.concat([recent_ten[1:], tf.expand_dims(energy, 0)], axis=0)

            grad = tape.gradient(energy, coordinates)
            grad = tf.where(tf.math.is_nan(grad), tf.zeros_like(grad), grad)
            optimizer.apply_gradients(zip([grad], [coordinates]))

            if tf.logical_and(tf.greater(iter_idx, 100),
                              tf.less(tf.math.reduce_std(recent_ten), 1e-3)):
                break

            iter_idx += 1

    gin.i_o.to_sdf.write_sdf(
        [[self.atoms, self.adjacency_map,
          tf.constant(10, dtype=tf.float32) * (coordinates - tf.reduce_mean(coordinates, 0))]],
        'caffeine_out.sdf')
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--job-dir', required=True)
    parser.add_argument('--seed', default=67, type=int)
    args = parser.parse_args()
    print('args:', args)

    # create a job directory if it doesn't already exist
    if not os.path.exists(args.job_dir):
        os.makedirs(args.job_dir)

    # enable eager execution
    tf.enable_eager_execution()

    # set random seeds for consistent execution
    random.seed(args.seed)
    np.random.seed(args.seed)
    tf.set_random_seed(args.seed)

    # define hyperparameters
    params = Params()
    print('params:', params)

    # load MNIST dataset
    ((images_train, labels_train),
     (images_test, labels_test)) = tf.keras.datasets.mnist.load_data()

    # prepare the images by casting and rescaling
    images_train = prep_images(images_train)
    images_test = prep_images(images_test)

    # compute statistics from the training set
    images_loc = images_train.mean()
    images_scale = images_train.std()

    # define datasets for sampling batches
    dataset_train = get_dataset((images_train, labels_train),
                                batch_size=params.batch_size, shuffle=True)
    dataset_test = get_dataset((images_test, labels_test),
                               batch_size=params.batch_size)

    # model / optimization
    global_step = tf.train.get_or_create_global_step()
    optimizer = tf.train.AdamOptimizer(learning_rate=params.learning_rate)
    model = Model(inputs_loc=images_loc, inputs_scale=images_scale,
                  inputs_shape=[28, 28, 1])
    latent_prior = tfp.distributions.MultivariateNormalDiag(
        loc=tf.zeros(shape=[2], dtype=tf.float32),
        scale_identity_multiplier=1.0)

    # checkpoints
    checkpoint = tf.train.Checkpoint(optimizer=optimizer, model=model,
                                     global_step=global_step)
    checkpoint_path = tf.train.latest_checkpoint(args.job_dir)
    if checkpoint_path is not None:
        checkpoint.restore(checkpoint_path).assert_consumed()

    # summaries
    summary_writer = tf.contrib.summary.create_file_writer(args.job_dir,
                                                           max_queue=1,
                                                           flush_millis=1000)
    summary_writer.set_as_default()

    with trange(params.epochs) as pbar:
        for epoch in pbar:
            loss_train = tfe.metrics.Mean(name='loss/train')
            for images, labels in dataset_train:
                with tf.GradientTape() as tape:
                    outputs_dist, z_dist, z = model(images, labels, training=True)
                    loss = losses.variational(outputs_dist, z_dist, images, latent_prior)
                    loss_train(loss)

                grads = tape.gradient(loss, model.trainable_variables)
                grads_and_vars = zip(grads, model.trainable_variables)
                optimizer.apply_gradients(grads_and_vars, global_step=global_step)

            with tf.contrib.summary.always_record_summaries():
                loss_train.result()
                tf.contrib.summary.scalar(name='grad_norm', tensor=tf.global_norm(grads))
                tf.contrib.summary.image(name='image/train', tensor=images,
                                         max_images=1, step=global_step)
                tf.contrib.summary.image(name='outputs/train', tensor=outputs_dist.mean(),
                                         max_images=1, step=global_step)

            loss_test = tfe.metrics.Mean(name='loss/eval')
            for images, labels in dataset_test:
                outputs_dist, z_dist, z = model(images, labels)
                loss = losses.variational(outputs_dist, z_dist, images, latent_prior)
                loss_test(loss)

            with tf.contrib.summary.always_record_summaries():
                loss_test.result()
                tf.contrib.summary.image(name='image/eval', tensor=images,
                                         max_images=1, step=global_step)
                tf.contrib.summary.image(name='outputs/eval', tensor=outputs_dist.mean(),
                                         max_images=1, step=global_step)

            pbar.set_description('loss (train): {}, loss (eval): {}'.format(
                loss_train.result().numpy(), loss_test.result().numpy()))

            checkpoint_prefix = os.path.join(args.job_dir, 'ckpt')
            checkpoint.save(checkpoint_prefix)
import tensorflow as tf

tf.enable_eager_execution()

x = tf.ones((2, 2))
with tf.GradientTape() as t:
    t.watch(x)
    y = tf.reduce_sum(x)
    z = tf.multiply(y, y)

dz_dx = t.gradient(z, x)
for i in [0, 1]:
    for j in [0, 1]:
        assert dz_dx[i][j].numpy() == 8.0

x = tf.ones((2, 2))
with tf.GradientTape() as t:
    t.watch(x)
    y = tf.reduce_sum(x)
    z = tf.multiply(y, y)

dz_dy = t.gradient(z, y)
assert dz_dy.numpy() == 8.0

x = tf.constant(3.0)
with tf.GradientTape(persistent=True) as t:
    t.watch(x)
    y = x * x
    z = y * y

# a persistent tape allows multiple gradient() calls
dz_dx = t.gradient(z, x)  # 108.0 (4 * x^3 at x = 3)
dy_dx = t.gradient(y, x)  # 6.0
del t  # drop the reference to the tape to release its resources
def grad(model, images, labels):
    with tf.GradientTape() as tape:  # record the loss computation on the tape
        loss = loss_fn(model, images, labels)
    return tape.gradient(loss, model.variables)  # replay the tape backwards to compute gradients
def grad(x, y):
    with tf.GradientTape() as t:
        t.watch(x)
        out = f(x, y)
    return t.gradient(out, x)
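# --- Hedged sketch (illustrative): a concrete f and a call to the grad helper above.
# This f is an assumption, chosen so the result is easy to verify by hand.
import tensorflow as tf

def f(x, y):
    return x * x * y  # d/dx = 2xy

x = tf.constant(3.0)
y = tf.constant(2.0)
print(grad(x, y).numpy())  # 2 * 3 * 2 = 12.0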
lr = 0.005
epochs = 500
alpha = 0.03

# build the network parameters
w1 = tf.Variable(tf.random.truncated_normal([2, 8], mean=0, stddev=0.1), dtype=tf.float32)
b1 = tf.Variable(tf.constant(0.01, shape=[8]))
w2 = tf.Variable(tf.random.truncated_normal([8, 1], mean=0, stddev=0.1), dtype=tf.float32)
b2 = tf.Variable(tf.constant([0.01]), dtype=tf.float32)

for epoch in range(0, epochs):
    for idx, (batch_x, batch_y) in enumerate(train_db):
        with tf.GradientTape() as tape:
            hidden = tf.matmul(batch_x, w1) + b1
            hidden = tf.nn.relu(hidden)
            y = tf.matmul(hidden, w2) + b2

            # without regularization:
            # loss_mse = tf.reduce_mean(tf.square(batch_y - y))

            # with l2 regularization
            loss_mse = tf.reduce_mean(tf.square(batch_y - y))
            loss_l2 = tf.nn.l2_loss(w1) + tf.nn.l2_loss(w2)
            # or: loss_l2 = tf.reduce_sum([tf.nn.l2_loss(w1), tf.nn.l2_loss(w2)])
            # with alpha = 1 the plot cannot be drawn correctly
            loss = loss_mse + alpha * loss_l2
        grad = tape.gradient(loss, [w1, b1, w2, b2])

        # gradient descent parameter update
        w1.assign_sub(lr * grad[0])
        b1.assign_sub(lr * grad[1])
        w2.assign_sub(lr * grad[2])
        b2.assign_sub(lr * grad[3])
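# --- Hedged sketch (illustrative): tf.nn.l2_loss(t) computes sum(t ** 2) / 2, so the
# alpha * loss_l2 term above penalizes large weights in w1 and w2.
import tensorflow as tf

t = tf.constant([[1.0, 2.0], [3.0, 4.0]])
print(tf.nn.l2_loss(t).numpy())                    # 15.0
print((tf.reduce_sum(tf.square(t)) / 2).numpy())   # 15.0, the same value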
def compute_apply_gradients(model, x, optimizer):
    with tf.GradientTape() as tape:
        loss = compute_loss(model, x)
    gradients = tape.gradient(loss, model.trainable_variables)
    optimizer.apply_gradients(zip(gradients, model.trainable_variables))
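# In TF 2.x the step above is typically wrapped in tf.function so the loop
# body compiles to a graph; train_dataset, model and compute_loss below are
# stand-ins for the snippet's own (unshown) definitions.
train_step = tf.function(compute_apply_gradients)
optimizer = tf.keras.optimizers.Adam(1e-4)
for epoch in range(10):
    for batch in train_dataset:
        train_step(model, batch, optimizer)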
def grad(model, inputs, targets):
    with tf.GradientTape() as tape:
        loss_value = loss(model, inputs, targets)
    return loss_value, tape.gradient(loss_value, model.trainable_variables)
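# Global-norm gradient clipping slots naturally between the helper above
# and the optimizer; a short sketch assuming model, inputs, targets and
# optimizer exist as in the surrounding snippets.
loss_value, grads = grad(model, inputs, targets)
clipped, global_norm = tf.clip_by_global_norm(grads, clip_norm=1.0)
optimizer.apply_gradients(zip(clipped, model.trainable_variables))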
def _train_step(self, images, kp2d, kp3d, has3d, theta):
    tf.keras.backend.set_learning_phase(1)
    batch_size = images.shape[0]

    with tf.GradientTape() as gen_tape, tf.GradientTape() as disc_tape:
        generator_outputs = self.generator(images, training=True)
        # only use the last computed theta (from the iterative feedback loop)
        _, kp2d_pred, kp3d_pred, pose_pred, shape_pred, _ = \
            generator_outputs[-1]

        vis = tf.expand_dims(kp2d[:, :, 2], -1)
        kp2d_loss = v1_loss.absolute_difference(kp2d[:, :, :2], kp2d_pred,
                                                weights=vis)
        kp2d_loss = kp2d_loss * self.config.GENERATOR_2D_LOSS_WEIGHT

        if self.config.USE_3D:
            has3d = tf.expand_dims(has3d, -1)
            kp3d_real = batch_align_by_pelvis(kp3d)
            kp3d_pred = batch_align_by_pelvis(
                kp3d_pred[:, :self.config.NUM_KP3D, :])
            kp3d_real = tf.reshape(kp3d_real, [batch_size, -1])
            kp3d_pred = tf.reshape(kp3d_pred, [batch_size, -1])

            kp3d_loss = v1_loss.mean_squared_error(kp3d_real, kp3d_pred,
                                                   weights=has3d) * 0.5
            kp3d_loss = kp3d_loss * self.config.GENERATOR_3D_LOSS_WEIGHT

            """Calculating the pose and shape loss basically makes no sense
            due to the missing paired 3D and mosh ground-truth data. The
            original implementation has paired data for the Human 3.6M
            dataset, which was not published due to a licence conflict.
            Nevertheless, paired data can be generated with SMPLify
            (see http://smplify.is.tue.mpg.de/ for more information).
            """
            pose_pred = tf.reshape(pose_pred, [batch_size, -1])
            shape_pred = tf.reshape(shape_pred, [batch_size, -1])
            pose_shape_pred = tf.concat([pose_pred, shape_pred], 1)

            # fake ground truth
            has_smpl = tf.zeros(batch_size, tf.float32)  # do not include loss
            has_smpl = tf.expand_dims(has_smpl, -1)
            pose_shape_real = tf.zeros(pose_shape_pred.shape)

            ps_loss = v1_loss.mean_squared_error(pose_shape_real,
                                                 pose_shape_pred,
                                                 weights=has_smpl) * 0.5
            ps_loss = ps_loss * self.config.GENERATOR_3D_LOSS_WEIGHT

        # use all poses and shapes from the iterative feedback loop
        fake_disc_input = self.accumulate_fake_disc_input(generator_outputs)
        fake_disc_output = self.discriminator(fake_disc_input, training=True)

        real_disc_input = self.accumulate_real_disc_input(theta)
        real_disc_output = self.discriminator(real_disc_input, training=True)

        gen_disc_loss = tf.reduce_mean(
            tf.reduce_sum((fake_disc_output - 1) ** 2, axis=1))
        gen_disc_loss = gen_disc_loss * self.config.DISCRIMINATOR_LOSS_WEIGHT

        generator_loss = tf.reduce_sum([kp2d_loss, gen_disc_loss])
        if self.config.USE_3D:
            generator_loss = tf.reduce_sum(
                [generator_loss, kp3d_loss, ps_loss])

        disc_real_loss = tf.reduce_mean(
            tf.reduce_sum((real_disc_output - 1) ** 2, axis=1))
        disc_fake_loss = tf.reduce_mean(
            tf.reduce_sum(fake_disc_output ** 2, axis=1))

        discriminator_loss = tf.reduce_sum([disc_real_loss, disc_fake_loss])
        discriminator_loss = (discriminator_loss *
                              self.config.DISCRIMINATOR_LOSS_WEIGHT)

    generator_grads = gen_tape.gradient(generator_loss,
                                        self.generator.trainable_variables)
    discriminator_grads = disc_tape.gradient(
        discriminator_loss, self.discriminator.trainable_variables)

    self.generator_opt.apply_gradients(
        zip(generator_grads, self.generator.trainable_variables))
    self.discriminator_opt.apply_gradients(
        zip(discriminator_grads, self.discriminator.trainable_variables))

    self.generator_loss_log.update_state(generator_loss)
    self.kp2d_loss_log.update_state(kp2d_loss)
    self.gen_disc_loss_log.update_state(gen_disc_loss)

    if self.config.USE_3D:
        self.kp3d_loss_log.update_state(kp3d_loss)
        self.pose_shape_loss_log.update_state(ps_loss)

    self.discriminator_loss_log.update_state(discriminator_loss)
    self.disc_real_loss_log.update_state(disc_real_loss)
    self.disc_fake_loss_log.update_state(disc_fake_loss)
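# The core pattern above, boiled down: two independent tapes record the same
# forward pass so the generator and discriminator can each be updated from
# their own loss in a single step. Everything below is a self-contained toy
# using the same least-squares GAN losses as the code above.
import tensorflow as tf

generator = tf.keras.Sequential([tf.keras.layers.Dense(4)])
discriminator = tf.keras.Sequential([tf.keras.layers.Dense(1)])
gen_opt = tf.keras.optimizers.Adam(1e-4)
disc_opt = tf.keras.optimizers.Adam(1e-4)

def gan_step(noise, real):
    with tf.GradientTape() as g_tape, tf.GradientTape() as d_tape:
        fake = generator(noise, training=True)
        fake_out = discriminator(fake, training=True)
        real_out = discriminator(real, training=True)
        g_loss = tf.reduce_mean((fake_out - 1) ** 2)
        d_loss = (tf.reduce_mean((real_out - 1) ** 2) +
                  tf.reduce_mean(fake_out ** 2))
    gen_opt.apply_gradients(zip(
        g_tape.gradient(g_loss, generator.trainable_variables),
        generator.trainable_variables))
    disc_opt.apply_gradients(zip(
        d_tape.gradient(d_loss, discriminator.trainable_variables),
        discriminator.trainable_variables))

gan_step(tf.random.normal([8, 2]), tf.random.normal([8, 4]))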
import tensorflow as tf

w = tf.Variable(tf.constant(5, dtype=tf.float32))
epoch = 40
LR_BASE = 0.2    # initial learning rate
LR_DECAY = 0.99  # learning-rate decay rate
LR_STEP = 1      # update the learning rate once every LR_STEP epochs

# the top-level loop runs `epoch` times over the "dataset", which here is
# just the single parameter w, initialized to 5 above
for epoch in range(epoch):
    lr = LR_BASE * LR_DECAY ** (epoch / LR_STEP)  # exponentially decayed LR
    with tf.GradientTape() as tape:  # the with-block records the loss computation
        loss = tf.square(w + 1)
    grads = tape.gradient(loss, w)  # gradient of loss with respect to w

    w.assign_sub(lr * grads)  # in-place update: w -= lr * grads
    print("After %s epoch, w is %f, loss is %f, lr is %f" %
          (epoch, w.numpy(), loss, lr))
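# The hand-rolled decay above matches Keras' built-in schedule; a sketch.
# Note that ExponentialDecay ticks once per optimizer step rather than per
# epoch, so in a batched loop decay_steps would need to equal the number of
# batches per epoch to reproduce the behaviour above.
schedule = tf.keras.optimizers.schedules.ExponentialDecay(
    initial_learning_rate=0.2,  # LR_BASE
    decay_steps=1,              # LR_STEP
    decay_rate=0.99)            # LR_DECAY
optimizer = tf.keras.optimizers.SGD(learning_rate=schedule)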
def train(model, opt, original, target):
    # compute the loss inside the tape, then query the gradients after the
    # block ends; calling tape.gradient inside the context also works but
    # is less efficient and discouraged
    with tf.GradientTape() as tape:
        loss_value = loss(model, original, target)
    gradients = tape.gradient(loss_value, model.trainable_variables)
    gradient_variables = zip(gradients, model.trainable_variables)
    opt.apply_gradients(gradient_variables)
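# When only part of a model should be trained, the tape can be told not to
# watch every variable it touches; a self-contained sketch fine-tuning a
# single (hypothetical) layer with watch_accessed_variables=False.
layer_to_tune = tf.keras.layers.Dense(2)
x = tf.random.normal([4, 3])
_ = layer_to_tune(x)  # build the layer so its variables exist

with tf.GradientTape(watch_accessed_variables=False) as tape:
    tape.watch(layer_to_tune.trainable_variables)  # track only these
    out = layer_to_tune(x)
    loss_value = tf.reduce_mean(out ** 2)
grads = tape.gradient(loss_value, layer_to_tune.trainable_variables)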
def processData(device_index, start_samples, samples, federated,
                full_data_size, number_of_batches, parameter_server,
                sample_distribution):
    pause(5)  # PS server (if any) starts first
    checkpointpath1 = 'results/model{}.h5'.format(device_index)
    outfile = 'results/dump_train_variables{}.npz'.format(device_index)
    outfile_models = 'results/dump_train_model{}.npy'.format(device_index)
    global_model = 'results/model_global.npy'
    global_epoch = 'results/epoch_global.npy'

    # np.random.seed(1)
    # tf.random.set_seed(1)  # common initialization

    learning_rate = args.mu
    learning_rate_local = learning_rate

    B = np.ones((devices, devices)) - tf.one_hot(np.arange(devices), devices)
    Probabilities = B[device_index, :] / (devices - 1)
    training_signal = False

    # check for backup variables on start
    if os.path.isfile(checkpointpath1):
        train_start = False

        # restore the model from the backup
        model = models.load_model(checkpointpath1)
        data_history = []
        label_history = []

        local_model_parameters = np.load(outfile_models, allow_pickle=True)
        model.set_weights(local_model_parameters.tolist())

        dump_vars = np.load(outfile, allow_pickle=True)
        frame_count = dump_vars['frame_count']
        epoch_loss_history = dump_vars['epoch_loss_history'].tolist()
        running_loss = np.mean(epoch_loss_history[-5:])
        epoch_count = dump_vars['epoch_count']
    else:
        train_start = True
        model = create_q_model()
        data_history = []
        label_history = []
        frame_count = 0
        # experience replay buffers
        epoch_loss_history = []
        epoch_count = 0
        running_loss = math.inf

    if parameter_server:
        epoch_global = 0

    training_end = False
    a = model.get_weights()

    # set an arbitrary optimizer, here Adam is used
    optimizer = keras.optimizers.Adam(learning_rate=args.mu, clipnorm=1.0)

    # create a data object (here radar data)
    if args.noniid_assignment == 1:
        data_handle = RadarData_tasks(filepath, device_index, start_samples,
                                      samples, full_data_size)
    else:
        data_handle = RadarData(filepath, device_index, start_samples,
                                samples, full_data_size,
                                args.random_data_distribution)

    # create a consensus object
    cfa_consensus = CFA_process(devices, device_index, args.N)

    while True:  # run until solved
        # collect 1 batch
        frame_count += 1
        obs, labels = data_handle.getTrainingData(batch_size)
        data_batch = preprocess_observation(obs, batch_size)

        # save data and labels in the current learning session
        data_history.append(data_batch)
        label_history.append(labels)

        if frame_count % number_of_batches == 0:
            if not parameter_server:
                epoch_count += 1

            # check scheduling for federated learning
            if federated:
                if epoch_count == 1 or scheduling_tx[device_index,
                                                     epoch_count] == 1:
                    training_signal = False
                else:
                    # stop all computing, just save the previous model
                    training_signal = True
                    model_weights = np.asarray(model.get_weights())
                    model.save(checkpointpath1, include_optimizer=True,
                               save_format='h5')
                    np.savez(outfile, frame_count=frame_count,
                             epoch_loss_history=epoch_loss_history,
                             training_end=training_end,
                             epoch_count=epoch_count, loss=running_loss)
                    np.save(outfile_models, model_weights)

            # check scheduling for parameter server
            if parameter_server:
                while not os.path.isfile(global_epoch):
                    print("waiting")
                    pause(1)
                try:
                    epoch_global = np.load(global_epoch, allow_pickle=True)
                except:
                    pause(5)
                    print("retrying opening global epoch counter")
                    try:
                        epoch_global = np.load(global_epoch,
                                               allow_pickle=True)
                    except:
                        print("failed reading global epoch")

                if epoch_global == 0:
                    training_signal = False
                elif scheduling_tx[device_index, epoch_global] == 1:
                    if epoch_global > epoch_count:
                        epoch_count = epoch_global
                    training_signal = False
                else:
                    # stop all computing, just save the previous model
                    training_signal = True

                # always refresh the local model using the PS one
                stop_aggregation = False
                while not os.path.isfile(global_model):
                    print("waiting")
                    pause(1)
                try:
                    model_global = np.load(global_model, allow_pickle=True)
                except:
                    pause(5)
                    print("retrying opening global model")
                    try:
                        model_global = np.load(global_model,
                                               allow_pickle=True)
                    except:
                        print("halting aggregation")
                        stop_aggregation = True

                if not stop_aggregation:
                    model.set_weights(model_global.tolist())

                if training_signal:
                    model_weights = np.asarray(model.get_weights())
                    model.save(checkpointpath1, include_optimizer=True,
                               save_format='h5')
                    np.savez(outfile, frame_count=frame_count,
                             epoch_loss_history=epoch_loss_history,
                             training_end=training_end,
                             epoch_count=epoch_count, loss=running_loss)
                    np.save(outfile_models, model_weights)

        # local learning update every `number_of_batches` batches
        time_count = 0
        if frame_count % number_of_batches == 0 and not training_signal:
            # run local batches
            for i in range(number_of_batches):
                start = time.time()
                data_sample = np.array(data_history[i])
                label_sample = np.array(label_history[i])

                # create a one-hot mask to calculate the loss
                masks = tf.one_hot(label_sample, n_outputs)

                with tf.GradientTape() as tape:
                    # run the model on the data samples
                    classes = model(data_sample, training=False)
                    # calculate the loss against the one-hot labels
                    loss = loss_function(masks, classes)

                # backpropagation
                grads = tape.gradient(loss, model.trainable_variables)
                optimizer.apply_gradients(zip(grads,
                                              model.trainable_variables))

                end = time.time()
                time_count = time_count + (end - start) / number_of_batches

            if not parameter_server and not federated:
                print('Average batch training time {:.2f}'.format(time_count))

            del data_history
            del label_history
            data_history = []
            label_history = []

            model_weights = np.asarray(model.get_weights())
            model.save(checkpointpath1, include_optimizer=True,
                       save_format='h5')
            np.savez(outfile, frame_count=frame_count,
                     epoch_loss_history=epoch_loss_history,
                     training_end=training_end, epoch_count=epoch_count,
                     loss=running_loss)
            np.save(outfile_models, model_weights)

            # consensus round: update the local model
            cfa_consensus.update_local_model(model_weights)
            # neighbor = cfa_consensus.get_connectivity(device_index, args.N, devices)  # fixed neighbor
            np.random.seed(1)
            tf.random.set_seed(1)  # common initialization

            if not train_start:
                if federated and not training_signal:
                    eps_c = 1 / (args.N + 1)
                    # apply consensus for the model parameters; choose a
                    # neighbor among the devices scheduled for this epoch
                    # (the `while` check below assumes args.N == 1)
                    # neighbor = np.random.choice(np.arange(devices), args.N, p=Probabilities, replace=False)
                    neighbor = np.random.choice(
                        indexes_tx[:, epoch_count - 1], args.N,
                        replace=False)
                    while neighbor == device_index:
                        neighbor = np.random.choice(
                            indexes_tx[:, epoch_count - 1], args.N,
                            replace=False)
                    print("Consensus from neighbor {} for device {}, "
                          "local loss {:.2f}".format(neighbor, device_index,
                                                     loss.numpy()))
                    model.set_weights(
                        cfa_consensus.federated_weights_computing(
                            neighbor, args.N, epoch_count, eps_c, max_lag))
                    if cfa_consensus.getTrainingStatusFromNeightbor():
                        # a neighbor completed the training with loss below
                        # target, so transfer learning applies: the device
                        # copies and reuses the same model
                        training_signal = True  # stop local learning, just validate
            else:
                print("Consensus warm up")
                train_start = False

            del model_weights

        # validation tool for device `device_index`
        if (epoch_count > validation_start and
                frame_count % number_of_batches == 0):
            avg_cost = 0.
            for i in range(number_of_batches_for_validation):
                obs_valid, labels_valid = data_handle.getTestData(batch_size,
                                                                  i)
                # obs_valid, labels_valid = data_handle.getRandomTestData(batch_size)
                data_valid = preprocess_observation(np.squeeze(obs_valid),
                                                    batch_size)
                data_sample = np.array(data_valid)
                label_sample = np.array(labels_valid)

                # create a one-hot mask to calculate the loss
                masks = tf.one_hot(label_sample, n_outputs)
                classes = model(data_sample, training=False)
                loss = loss_function(masks, classes).numpy()
                avg_cost += loss / number_of_batches_for_validation

            # validation loss for this epoch
            epoch_loss_history.append(avg_cost)
            print("Device {} epoch count {}, validation loss {:.2f}".format(
                device_index, epoch_count, avg_cost))

            # running loss over the most recent validation epoch
            running_loss = np.mean(epoch_loss_history[-1:])

            if running_loss < target_loss:  # task solved
                print("Solved for device {} at epoch {} with average "
                      "loss {:.2f}!".format(device_index, epoch_count,
                                            running_loss))
                training_end = True
                model_weights = np.asarray(model.get_weights())
                model.save(checkpointpath1, include_optimizer=True,
                           save_format='h5')
                np.savez(outfile, frame_count=frame_count,
                         epoch_loss_history=epoch_loss_history,
                         training_end=training_end, epoch_count=epoch_count,
                         loss=running_loss)
                np.save(outfile_models, model_weights)

                if federated:
                    dict_1 = {
                        "epoch_loss_history": epoch_loss_history,
                        "federated": federated,
                        "parameter_server": parameter_server,
                        "devices": devices,
                        "neighbors": args.N,
                        "active_devices": args.Ka_consensus,
                        "batches": number_of_batches,
                        "batch_size": batch_size,
                        "samples": samples,
                        "noniid": args.noniid_assignment,
                        "data_distribution": args.random_data_distribution,
                    }
                elif parameter_server:
                    dict_1 = {
                        "epoch_loss_history": epoch_loss_history,
                        "federated": federated,
                        "parameter_server": parameter_server,
                        "devices": devices,
                        "active_devices": active_devices_per_round,
                        "batches": number_of_batches,
                        "batch_size": batch_size,
                        "samples": samples,
                        "noniid": args.noniid_assignment,
                        "data_distribution": args.random_data_distribution,
                    }
                else:
                    dict_1 = {
                        "epoch_loss_history": epoch_loss_history,
                        "federated": federated,
                        "parameter_server": parameter_server,
                        "devices": devices,
                        "batches": number_of_batches,
                        "batch_size": batch_size,
                        "samples": samples,
                        "noniid": args.noniid_assignment,
                        "data_distribution": args.random_data_distribution,
                    }

                if federated:
                    sio.savemat(
                        "results/matlab/CFA_device_{}_samples_{}_devices_{}_active_{}_neighbors_{}_batches_{}_size{}_noniid{}_run{}_distribution{}.mat"
                        .format(device_index, samples, devices,
                                args.Ka_consensus, args.N, number_of_batches,
                                batch_size, args.noniid_assignment, args.run,
                                args.random_data_distribution), dict_1)
                    sio.savemat(
                        "CFA_device_{}_samples_{}_devices_{}_neighbors_{}_batches_{}_size{}.mat"
                        .format(device_index, samples, devices, args.N,
                                number_of_batches, batch_size), dict_1)
                elif parameter_server:
                    sio.savemat(
                        "results/matlab/FA_device_{}_samples_{}_devices_{}_active_{}_batches_{}_size{}_noniid{}_run{}_distribution{}.mat"
                        .format(device_index, samples, devices,
                                active_devices_per_round, number_of_batches,
                                batch_size, args.noniid_assignment, args.run,
                                args.random_data_distribution), dict_1)
                    sio.savemat(
                        "FA_device_{}_samples_{}_devices_{}_active_{}_batches_{}_size{}.mat"
                        .format(device_index, samples, devices,
                                active_devices_per_round, number_of_batches,
                                batch_size), dict_1)
                else:  # centralized learning (CL)
                    sio.savemat(
                        "results/matlab/CL_samples_{}_devices_{}_batches_{}_size{}_noniid{}_run{}_distribution{}.mat"
                        .format(samples, devices, number_of_batches,
                                batch_size, args.noniid_assignment, args.run,
                                args.random_data_distribution), dict_1)
                break

            if epoch_count > max_epochs:  # stop the simulation
                print("Unsolved for device {} at epoch {}!".format(
                    device_index, epoch_count))
                training_end = True
                model_weights = np.asarray(model.get_weights())
                model.save(checkpointpath1, include_optimizer=True,
                           save_format='h5')
                np.savez(outfile, frame_count=frame_count,
                         epoch_loss_history=epoch_loss_history,
                         training_end=training_end, epoch_count=epoch_count,
                         loss=running_loss)
                np.save(outfile_models, model_weights)

                if federated:
                    dict_1 = {
                        "epoch_loss_history": epoch_loss_history,
                        "federated": federated,
                        "parameter_server": parameter_server,
                        "devices": devices,
                        "neighbors": args.N,
                        "active_devices": args.Ka_consensus,
                        "batches": number_of_batches,
                        "batch_size": batch_size,
                        "samples": samples,
                        "noniid": args.noniid_assignment,
                        "data_distribution": args.random_data_distribution,
                    }
                elif parameter_server:
                    dict_1 = {
                        "epoch_loss_history": epoch_loss_history,
                        "federated": federated,
                        "parameter_server": parameter_server,
                        "devices": devices,
                        "active_devices": active_devices_per_round,
                        "batches": number_of_batches,
                        "batch_size": batch_size,
                        "samples": samples,
                        "noniid": args.noniid_assignment,
                        "data_distribution": args.random_data_distribution,
                    }
                else:
                    dict_1 = {
                        "epoch_loss_history": epoch_loss_history,
                        "federated": federated,
                        "parameter_server": parameter_server,
                        "devices": devices,
                        "batches": number_of_batches,
                        "batch_size": batch_size,
                        "samples": samples,
                        "noniid": args.noniid_assignment,
                        "data_distribution": args.random_data_distribution,
                    }

                if federated:
                    sio.savemat(
                        "results/matlab/CFA_device_{}_samples_{}_devices_{}_active_{}_neighbors_{}_batches_{}_size{}_noniid{}_run{}_distribution{}.mat"
                        .format(device_index, samples, devices,
                                args.Ka_consensus, args.N, number_of_batches,
                                batch_size, args.noniid_assignment, args.run,
                                args.random_data_distribution), dict_1)
                    sio.savemat(
                        "CFA_device_{}_samples_{}_devices_{}_neighbors_{}_batches_{}_size{}.mat"
                        .format(device_index, samples, devices, args.N,
                                number_of_batches, batch_size), dict_1)
                elif parameter_server:
                    sio.savemat(
                        "results/matlab/FA_device_{}_samples_{}_devices_{}_active_{}_batches_{}_size{}_noniid{}_run{}_distribution{}.mat"
                        .format(device_index, samples, devices,
                                active_devices_per_round, number_of_batches,
                                batch_size, args.noniid_assignment, args.run,
                                args.random_data_distribution), dict_1)
                    sio.savemat(
                        "FA_device_{}_samples_{}_devices_{}_active_{}_batches_{}_size{}.mat"
                        .format(device_index, samples, devices,
                                active_devices_per_round, number_of_batches,
                                batch_size), dict_1)
                else:  # centralized learning (CL)
                    sio.savemat(
                        "results/matlab/CL_samples_{}_devices_{}_batches_{}_size{}_noniid{}_run{}_distribution{}.mat"
                        .format(samples, devices, number_of_batches,
                                batch_size, args.noniid_assignment, args.run,
                                args.random_data_distribution), dict_1)
                break
def train_step(self, input_seq, target_seq):
    """Defines a backward pass through the network.

    :param input_seq: batch of input sequences fed to the encoder
    :param target_seq: batch of target sequences, shape [batch, time, ...]
    :return: dict of named losses for logging
    """
    # initialize loss
    loss = 0
    time_steps = target_seq.shape[1]

    # initialize encoder hidden state
    enc_hidden = self.encoder.encoderA.initialize_hidden_state(
        self.encoder.encoderA.batch_size)

    with tf.GradientTape() as tape:
        # pass through encoder
        enc_output, enc_hidden = self.encoder(input_seq, enc_hidden, True)

        # seed the decoder with the encoder's hidden state
        dec_hidden = enc_hidden
        dec_input = tf.zeros(target_seq[:, 0].shape)

        # start teacher forcing the network
        for t in range(time_steps):
            # pass dec_input and the encoder context to the decoder
            prediction, dec_hidden, _ = self.decoder(dec_input, dec_hidden,
                                                     enc_output, True)

            # calculate the loss for every time step
            losses = tf.keras.losses.MSE(target_seq[:, t], prediction)
            loss += tf.reduce_mean(losses)

            # purge the tensors from memory
            del dec_input, prediction

            # set the next target value as input to the decoder
            # (teacher forcing)
            dec_input = target_seq[:, t]

    # calculate average batch loss
    batch_loss = (loss / time_steps)

    # get trainable variables (note: only the encoder's variables are
    # collected here, so the decoder's weights are never updated by this step)
    variables = self.encoder.trainable_variables

    # get the gradients
    gradients = tape.gradient(loss, variables)

    # purge the tape from memory
    del tape

    # apply gradients to variables
    self.optimizer.apply_gradients(zip(gradients, variables))

    loss_dict = {'TEDM': {'Reconstruction Loss': batch_loss}}
    return loss_dict
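# At inference time there is no target to teacher-force, so the decoder
# consumes its own previous prediction. This is a hedged sketch reusing the
# same encoder/decoder call signatures as the step above; the initial
# dec_input shape is an assumption about the model's output dimensions.
def predict(self, input_seq, time_steps):
    enc_hidden = self.encoder.encoderA.initialize_hidden_state(
        self.encoder.encoderA.batch_size)
    enc_output, enc_hidden = self.encoder(input_seq, enc_hidden, False)
    dec_hidden = enc_hidden
    dec_input = tf.zeros_like(enc_output[:, 0])  # assumed start-of-sequence shape
    outputs = []
    for _ in range(time_steps):
        prediction, dec_hidden, _ = self.decoder(dec_input, dec_hidden,
                                                 enc_output, False)
        outputs.append(prediction)
        dec_input = prediction  # feed the prediction back in
    return tf.stack(outputs, axis=1)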