def log_run(split: str, epoch: int, writer: tf.summary.SummaryWriter,
            label_names: Sequence[str], metrics: MutableMapping[str, float],
            heaps: Mapping[str, Mapping[int, List[HeapItem]]],
            cm: np.ndarray) -> None:
    """Write one epoch's metrics, confusion matrix and heap images to TensorBoard.

    Args:
        split: name of the data split; used as a prefix in summary tags.
        epoch: step value attached to every summary written here.
        writer: summary writer; only flushed at the end of this function.
            NOTE(review): no default writer is set here, so the caller
            presumably activates it beforehand -- confirm.
        label_names: class names matching the confusion matrix.
        metrics: scalar metrics, keys already prefixed with ``{split}/``.
            Mutated in place: per-label recall entries are added under
            ``{split}/label_recall/``.
        heaps: mapping from heap type to the per-label heaps handed to
            ``log_images_with_confidence``.
        cm: confusion matrix.
    """
    # Derive per-label recall and merge it into the caller's metrics mapping.
    recall_by_label = recall_from_confusion_matrix(cm, label_names)
    recall_metrics = prefix_all_keys(recall_by_label,
                                     f'{split}/label_recall/')
    metrics.update(recall_metrics)

    # Scalars.
    for scalar_name, scalar_value in metrics.items():
        tf.summary.scalar(scalar_name, scalar_value, epoch)

    # Confusion matrix figure, with a leading axis added before logging.
    figure = plot_utils.plot_confusion_matrix(
        cm, classes=label_names, normalize=True)
    figure_pixels = fig_to_img(figure)
    batched_image = tf.convert_to_tensor(figure_pixels[np.newaxis, ...])
    tf.summary.image(f'confusion_matrix/{split}', batched_image, step=epoch)

    # Images for each heap type.
    for heap_type, heap_dict in heaps.items():
        log_images_with_confidence(
            heap_dict, label_names, epoch=epoch, tag=f'{split}/{heap_type}')

    writer.flush()
def trace_graph(
        writer: tf.summary.SummaryWriter) -> Generator[None, None, None]:
    """Context manager that traces the graph for a model constructed within it.

    Tracing is switched on before the managed body runs. When the body
    finishes normally the trace is exported through ``writer`` under the
    name ``"graph"`` at step 0. When the body raises, the trace is stopped
    and discarded so that tracing is not left enabled, and the exception
    propagates unchanged.

    NOTE(review): this is a bare generator; it is presumably wrapped with
    ``contextlib.contextmanager`` where it is defined -- confirm.

    Args:
        writer: summary writer that receives the exported graph trace.

    Yields:
        None, once tracing has been enabled.
    """
    tf.summary.trace_on(graph=True, profiler=False)
    try:
        yield
    except BaseException:
        # Do not leak an active trace into later code when the body fails.
        tf.summary.trace_off()
        raise
    with writer.as_default():
        tf.summary.trace_export(name="graph", step=0)
def plotProcessLoop(summaryWriter: tf.summary.SummaryWriter,
                    drawHistogramProcess=None, iteration=0):
    """Serve plotting commands read from the module-level ``QueueErlang``.

    Each queue item is a sequence whose first element names the command:

    * ``"histogram"``: ``[_, title, dataDict]``. Terminates any running
      histogram process, starts a new ``drawHistogramHelper`` process and
      sends it the title and data through a fresh queue.
    * ``"stopHistogram"``: terminates the histogram process, if any.
    * ``"plot"``: ``[_, income, expence]``. Writes both values as scalars
      at the current iteration, then advances the iteration counter.
    * ``"terminate"``: terminates the histogram process, if any, and
      returns.

    Any other command is ignored and the loop waits for the next item.

    The loop is iterative. The earlier recursive form added one stack frame
    per message (so a long session ended in ``RecursionError``), and after
    "terminate" every outer frame resumed and blocked on the queue again.

    Args:
        summaryWriter: writer used for the ``"plot"`` scalars.
        drawHistogramProcess: an already running histogram process, if any.
        iteration: step used for the next ``"plot"`` command.
    """
    while True:
        item = QueueErlang.get()
        command = item[0]

        if command == "histogram":
            [_, title, dataDict] = item
            if drawHistogramProcess is not None:
                # It can only be a living process from a previous iteration.
                drawHistogramProcess.terminate()
            QueueData = Queue()
            drawHistogramProcess = Process(target=drawHistogramHelper,
                                           args=(QueueData,))
            drawHistogramProcess.start()
            QueueData.put(title)
            QueueData.put(dataDict)
        elif command == "stopHistogram":
            print("got to terminate draw histogram")
            if drawHistogramProcess is not None:
                drawHistogramProcess.terminate()
            else:
                print("From python: i shouldn't be here.....")
            drawHistogramProcess = None
        elif command == "plot":
            [_, income, expence] = item
            with summaryWriter.as_default():
                tf.summary.scalar('income', income, step=iteration)
                tf.summary.scalar('expence', expence, step=iteration)
            iteration += 1
        elif command == "terminate":
            print("I got to terminate")
            if drawHistogramProcess is not None:
                drawHistogramProcess.terminate()
            return
        # Unknown commands fall through: keep waiting for instructions.
def trace_profile(
        writer: tf.summary.SummaryWriter) -> Generator[None, None, None]:
    """Context manager that profiles a model called within it.

    Profiler tracing is switched on before the managed body runs. When the
    body finishes normally the trace is exported through ``writer`` under
    the name ``"profile"`` at step 0. When the body raises, the trace is
    stopped and discarded so that tracing is not left enabled, and the
    exception propagates unchanged.

    NOTE(review): this is a bare generator; it is presumably wrapped with
    ``contextlib.contextmanager`` where it is defined -- confirm.

    Args:
        writer: summary writer that receives the exported profile trace.

    Yields:
        None, once tracing has been enabled.
    """
    logger.debug('Running profiler...')
    tf.summary.trace_on(graph=False, profiler=True)
    try:
        yield
    except BaseException:
        # Do not leak an active trace into later code when the body fails.
        tf.summary.trace_off()
        raise
    with writer.as_default():
        tf.summary.trace_export(name="profile", step=0)
def create_summary(writer: tf.summary.SummaryWriter, optimizer_name: str,
                   nb_img_utilisees, optimizer_parameters: Dict, loss: str,
                   metriques_utilisees: List[str], but_essai: str,
                   informations_additionnelles: str, id: str,
                   dataset_name: str = "", taille_x_img: int = 1600,
                   taille_y_img: int = 900, taille_x_img_redim: int = 400,
                   taille_y_img_redim: int = 225, batch_size=10,
                   nb_img_tot=173959, nb_epochs=1):
    """Write a Markdown description of a training run as a text summary.

    The text is logged under the tag ``"Resume"`` at step 0 and the writer
    is flushed afterwards.

    The Markdown is assembled line by line so that each heading starts its
    own line and each optimizer parameter is rendered as a separate list
    item; previously the parameters were concatenated with no separator.

    Args:
        writer: summary writer receiving the text summary.
        optimizer_name: name of the optimizer.
        nb_img_utilisees: number of images requested; the reported figure
            is capped at ``nb_img_tot``.
        optimizer_parameters: optimizer settings, listed one per line.
        loss: name of the loss.
        metriques_utilisees: metrics, reported as a comma-separated list.
        but_essai: purpose of the run.
        informations_additionnelles: free-form extra notes.
        id: run identifier shown in the title.
        dataset_name: name of the dataset.
        taille_x_img: original image width in pixels.
        taille_y_img: original image height in pixels.
        taille_x_img_redim: resized image width in pixels.
        taille_y_img_redim: resized image height in pixels.
        batch_size: batch size.
        nb_img_tot: total number of images in the dataset.
        nb_epochs: number of training epochs.
    """
    header_lines = [
        f"# Résumé de l'entrainement du {id}",
        "",
        f"Entrainement sur {dataset_name} ({nb_img_tot} images ; "
        f"{min(nb_img_utilisees, nb_img_tot)} utilisées) avec des images de "
        f"taille {taille_x_img} px par {taille_y_img} px redimensionnées à "
        f"{taille_x_img_redim} px x {taille_y_img_redim} px",
        "",
        f"Batch size de {batch_size}",
        "",
        "## Paramètres d'entrainement",
        "",
        f"Entrainement sur {nb_epochs} epochs",
        "",
        f"Optimisateur {optimizer_name} avec les paramètres :",
        "",
    ]
    # One list item per parameter so consecutive entries no longer run together.
    parameter_lines = [
        f"- {k} : {v}" for k, v in optimizer_parameters.items()
    ]
    footer_lines = [
        "",
        f"Loss : {loss}",
        "",
        "Métriques : " + ", ".join(
            f"{metrique}" for metrique in metriques_utilisees),
        "",
        "## Description de l'essai",
        "",
        f"{but_essai}",
        "",
        f"{informations_additionnelles}",
    ]
    markdown = "\n".join(header_lines + parameter_lines + footer_lines)

    with writer.as_default():
        tf.summary.text("Resume", markdown, step=0)
    writer.flush()
def log_metrics(self, writer: tf.summary.SummaryWriter,
                dataset: tf.data.Dataset):
    """Validate on the first batch of ``dataset`` and log the results.

    Runs ``self.validation`` on one batch, feeds every frame of that batch
    to the ``self.mAP`` accumulator, renders predicted and ground-truth
    boxes, and writes learning rate, loss, mAP and both images at the
    checkpoint step.

    Args:
        writer: summary writer used for all summaries.
        dataset: dataset whose elements provide the keys ``'image'``,
            ``'label'``, ``'bbox'`` and ``'num_of_bbox'``.
    """
    batch = next(iter(dataset))
    images = batch['image']
    loss, bboxes, scores, class_ids, valid_detections = self.validation(
        images, batch['label'])
    gt_boxes = batch["bbox"]
    num_of_gt_boxes = batch["num_of_bbox"]

    # Accumulate mAP statistics frame by frame.
    per_frame = zip(bboxes.numpy(), class_ids.numpy(), scores.numpy(),
                    valid_detections.numpy(), gt_boxes.numpy(),
                    num_of_gt_boxes.numpy())
    for boxes, classes, confidences, n_valid, truth, n_truth in per_frame:
        # Keep only the valid predictions and the real ground-truth rows.
        truth = truth[:n_truth]
        self.mAP.evaluate(boxes[:n_valid], classes[:n_valid],
                          confidences[:n_valid], truth[..., :4],
                          truth[..., 4])
    mean_average_precision = self.mAP.get_mAP()
    self.mAP.reset_accumulators()

    # Render predictions and ground truth on the input images.
    pred_image = self.plot_bounding_box(images, bboxes, scores, class_ids,
                                        valid_detections)
    gt_image = self.plot_bounding_box(images, gt_boxes[..., :4],
                                      tf.ones_like(scores),
                                      gt_boxes[..., 4], num_of_gt_boxes)

    # Write everything at the current checkpoint step.
    step = int(self.ckpt.step)
    with writer.as_default():
        tf.summary.scalar("lr", self.optimizer.lr(step), step=step)
        tf.summary.scalar('loss', loss, step=step)
        tf.summary.scalar('mean loss', loss.numpy() / self.batch_size,
                          step=step)
        # NOTE(review): this tag looks like an e-mail-obfuscation artifact
        # (possibly a mangled metric name) -- confirm the intended text.
        tf.summary.scalar('[email protected]', mean_average_precision, step=step)
        tf.summary.image("Display pred bounding box", pred_image, step=step)
        tf.summary.image("Display gt bounding box", gt_image, step=step)
def write_net_weights(writer: tf.summary.SummaryWriter, namespace: str,
                      net_name: str, val_list: list[array],
                      epoch: int) -> None:
    """TENSORBOARD METHOD: write histograms of a network's weights and biases.

    ``val_list`` is read as alternating entries: even positions are taken as
    weights, odd positions as biases. Layer ``i`` is tagged
    ``'{net_name} L{i}'`` and logged under the name scopes
    ``'{namespace}: Weights'`` and ``'{namespace}: Biases'``.

    Args:
        writer: summary writer receiving the histograms.
        namespace: prefix of the two name scopes.
        net_name: prefix of each layer tag.
        val_list: alternating weight / bias arrays.
        epoch: step attached to every histogram.

    Raises:
        AssertionError: if weights and biases do not pair up (odd length).
    """
    weights = val_list[0::2]
    biases = val_list[1::2]
    layer_tags = [f'{net_name} L{i}' for i in range(len(val_list) // 2)]
    assert len(layer_tags) == len(weights) == len(biases)

    weights_scope = f'{namespace}: Weights'
    biases_scope = f'{namespace}: Biases'
    with writer.as_default():
        for idx, tag in enumerate(layer_tags):
            with tf.name_scope(weights_scope):
                tf.summary.histogram(tag, weights[idx], step=epoch)
            with tf.name_scope(biases_scope):
                tf.summary.histogram(tag, biases[idx], step=epoch)
def write_summary_scalar(
    metric_name: str,
    metric_value: Union[List[float], float],
    step: int,
    summary_writer: tf.summary.SummaryWriter,
):
    """Write a scalar summary statistic to a tensorboard directory.

    Args:
        metric_name: tag of the scalar.
        metric_value: the value to log; when a list is given, only its last
            element is logged.
        step: step attached to the scalar.
        summary_writer: writer used as the default while logging.
    """
    with summary_writer.as_default():
        latest = (metric_value[-1]
                  if isinstance(metric_value, list) else metric_value)
        tf.compat.v2.summary.scalar(name=metric_name, data=latest, step=step)
def write_scalars(writer: tf.summary.SummaryWriter, metrics: dict[str, float],
                  epoch: int) -> None:
    """TENSORBOARD METHOD: write the scalar values of the metrics.

    Known short keys are expanded to display names and grouped under a name
    scope. Unknown keys keep their own name and go to ``'Other Scores'``.
    The display name is also used as the summary description.

    Args:
        writer: summary writer receiving the scalars.
        metrics: mapping from metric key to value.
        epoch: step attached to every scalar.

    Raises:
        TypeError: if ``metrics`` is not a dict.
    """
    if not isinstance(metrics, dict):
        raise TypeError('type of param <metrics> must be dict')

    display_names = {
        'Acc': 'Accuracy',
        'Bacc': 'Balanced Accuracy',
        'Ck': 'Cohen\'s Kappa',
        'Js': 'Jaccard Score',
        'Fs': 'F1-Score',
        'Prec': 'Precision Score',
        'Rec': 'Recall Score',
        'Tpr': 'TPR',
        'Tnr': 'TNR',
        'Fpr': 'FPR',
        'Fnr': 'FNR',
        'Loss': 'Loss',
        'It': 'Iteration @ Convergence',
    }
    scope_members = {
        'Accuracy & Loss': ('Acc', 'Bacc', 'It', 'Loss'),
        'F-Score, Precision and Recall': ('Fs', 'Prec', 'Rec'),
        'Positive and Negative Rates': ('Tpr', 'Tnr', 'Fpr', 'Fnr'),
        'Other Scores': ('Ck', 'Js'),
    }
    # Invert the grouping into a key -> scope lookup.
    scope_of = {
        key: scope for scope, keys in scope_members.items() for key in keys
    }

    with writer.as_default():
        for key, value in metrics.items():
            label = display_names.get(key, key)
            with tf.name_scope(scope_of.get(key, 'Other Scores')):
                tf.summary.scalar(label, value, step=epoch,
                                  description=label)
def tensorboard_scalar(writer: tf.summary.SummaryWriter, name: str,
                       data: float, step: int):
    """Log one scalar through ``writer``.

    Args:
        writer: summary writer made the default while logging.
        name: tag of the scalar.
        data: value to log.
        step: step attached to the scalar.
    """
    default_scope = writer.as_default()
    with default_scope:
        tf.summary.scalar(name, data, step=step)
def run(train_dataset: tf.data.Dataset,
        eval_datasets: Dict[str, tf.data.Dataset],
        steps_per_eval: Dict[str, int], params: utils.ModelParameters,
        model_dir: str, strategy: tf.distribute.Strategy,
        summary_writer: tf.summary.SummaryWriter, loss_type: str,
        graph_augmenter: augmentation_utils.GraphAugment):
    """Trains and evaluates the model.

    Builds an MPNN model, an RMSprop optimizer and the metric objects under
    ``strategy.scope()``. Each epoch then runs ``params.steps_per_epoch``
    training steps, evaluates every dataset in ``eval_datasets``, writes all
    metric results as scalars at step ``epoch + 1``, resets the metrics and
    saves the model to ``model_dir``. The collected metric history is
    written to ``metrics_history.json`` in ``model_dir`` at the end.

    Args:
        train_dataset: training data; elements are ``(features, labels)`` or
            ``(features, labels, sample_weights)``.
        eval_datasets: evaluation datasets keyed by name.
        steps_per_eval: number of evaluation steps per dataset name.
        params: model and training parameters read from attributes such as
            ``num_heads``, ``learning_rate``, ``steps_per_epoch`` and
            ``num_epochs``.
        model_dir: directory for saved models and the metrics history.
        strategy: distribution strategy used for model creation and steps.
        summary_writer: writer receiving the per-epoch scalars.
        loss_type: ``'focal'`` selects sigmoid focal cross-entropy; any other
            value uses the categorical cross-entropy.
        graph_augmenter: applied to the features when
            ``params.augmentations`` is truthy.
    """
    with strategy.scope():
        model = ub.models.mpnn(
            nodes_shape=train_dataset.element_spec[0]['atoms'].shape[1:],
            edges_shape=train_dataset.element_spec[0]['pairs'].shape[1:],
            num_heads=params.num_heads,
            num_layers=params.num_layers,
            message_layer_size=params.message_layer_size,
            readout_layer_size=params.readout_layer_size,
            use_gp_layer=params.use_gp_layer)
        optimizer = tf.keras.optimizers.RMSprop(
            learning_rate=params.learning_rate)
        # Training metrics; evaluation metrics are added per dataset below.
        metrics = {
            'train/negative_log_likelihood': tf.keras.metrics.Mean(),
            'train/accuracy': tf.keras.metrics.CategoricalAccuracy(),
            'train/loss': tf.keras.metrics.Mean(),
            'train/roc_auc': tf.keras.metrics.AUC(),
        }
        for dataset_name in eval_datasets:
            metrics[
                f'{dataset_name}/accuracy'] = tf.keras.metrics.CategoricalAccuracy(
                )
            metrics[f'{dataset_name}/roc_auc'] = tf.keras.metrics.AUC()
            metrics[
                f'{dataset_name}/negative_log_likelihood'] = tf.keras.metrics.Mean(
                )
            # The dataset named 'test2' gets fewer calibration bins.
            if dataset_name == 'test2':
                ece_num_bins = 5
            else:
                ece_num_bins = 10
            metrics[
                f'{dataset_name}/ece'] = rm.metrics.ExpectedCalibrationError(
                    num_bins=ece_num_bins)
            metrics[f'{dataset_name}/brier'] = rm.metrics.Brier()

    @tf.function
    def train_step(iterator):
        """Training StepFn."""

        def step_fn(inputs):
            """Per-Replica StepFn."""
            if len(inputs) == 3:
                features, labels, sample_weights = inputs
            else:
                features, labels = inputs
                # No weights supplied: weight every example equally.
                sample_weights = 1

            if params.augmentations:
                # TODO(jihyeonlee): For now, choose 1 augmentation function from all
                # possible with equal probability. Allow user to specify number of
                # augmentations to apply per graph.
                features = graph_augmenter.augment(features)

            with tf.GradientTape() as tape:
                probs = model(features, training=True)
                negative_log_likelihood = tf.reduce_mean(
                    tf.keras.losses.categorical_crossentropy(labels, probs) *
                    sample_weights)
                l2_loss = sum(model.losses)
                if loss_type == 'focal':
                    focal_loss_fn = tfa_losses.SigmoidFocalCrossEntropy()
                    focal_loss = tf.reduce_mean(
                        focal_loss_fn(labels, probs) * sample_weights)
                    loss = focal_loss + l2_loss
                else:
                    loss = negative_log_likelihood + l2_loss
                # Scale the loss given the tf.distribute.Strategy will reduce sum all
                # gradients. See details in
                # https://www.tensorflow.org/tutorials/distribute/custom_training#define_the_loss_function
                scaled_loss = loss / strategy.num_replicas_in_sync

            grads = tape.gradient(scaled_loss, model.trainable_variables)
            optimizer.apply_gradients(zip(grads, model.trainable_variables))
            # Metrics track the unscaled loss.
            metrics['train/loss'].update_state(loss)
            metrics['train/negative_log_likelihood'].update_state(
                negative_log_likelihood)
            metrics['train/accuracy'].update_state(labels, probs)
            # ROC AUC uses column 1 of labels and probabilities.
            metrics['train/roc_auc'].update_state(labels[:, 1], probs[:, 1])

        for _ in tf.range(tf.cast(params.steps_per_epoch, tf.int32)):
            strategy.run(step_fn, args=(next(iterator), ))

    @tf.function
    def eval_step(iterator, dataset_name, num_steps):
        """Evaluation StepFn."""

        def step_fn(inputs):
            """Per-Replica StepFn."""
            if len(inputs) == 3:
                # Sample weights are ignored during evaluation.
                features, labels, _ = inputs
            else:
                features, labels = inputs

            probs = model(features, training=False)
            negative_log_likelihood = tf.reduce_mean(
                tf.keras.losses.categorical_crossentropy(labels, probs))

            metrics[f'{dataset_name}/negative_log_likelihood'].update_state(
                negative_log_likelihood)
            metrics[f'{dataset_name}/accuracy'].update_state(labels, probs)
            metrics[f'{dataset_name}/roc_auc'].update_state(
                labels[:, 1], probs[:, 1])
            metrics[f'{dataset_name}/ece'].add_batch(probs[:, 1],
                                                     label=labels[:, 1])
            metrics[f'{dataset_name}/brier'].add_batch(probs,
                                                       label=labels[:, 1])

        for _ in tf.range(tf.cast(num_steps, tf.int32)):
            strategy.run(step_fn, args=(next(iterator), ))

    # Makes datasets into distributed version.
    train_dataset = strategy.experimental_distribute_dataset(train_dataset)
    eval_datasets = {
        ds_name: strategy.experimental_distribute_dataset(ds)
        for ds_name, ds in eval_datasets.items()
    }
    logging.info('Number of replicas in sync: %s',
                 strategy.num_replicas_in_sync)

    # One training iterator is shared across all epochs.
    train_iterator = iter(train_dataset)
    start_time = time.time()
    metrics_history = collections.defaultdict(list)
    for epoch in range(params.num_epochs):
        logging.info('Starting to run epoch: %s', epoch)
        train_step(train_iterator)

        # Progress report with a throughput-based ETA.
        current_step = (epoch + 1) * params.steps_per_epoch
        max_steps = params.steps_per_epoch * params.num_epochs
        time_elapsed = time.time() - start_time
        steps_per_sec = float(current_step) / time_elapsed
        eta_seconds = (max_steps - current_step) / steps_per_sec
        message = ('{:.1%} completion: epoch {:d}/{:d}. {:.1f} steps/s. '
                   'ETA: {:.0f} min. Time elapsed: {:.0f} min'.format(
                       current_step / max_steps, epoch + 1, params.num_epochs,
                       steps_per_sec, eta_seconds / 60, time_elapsed / 60))
        logging.info(message)

        # Start evaluation.
        logging.info('Starting to run eval at epoch: %s', epoch)
        for dataset_name, eval_dataset in eval_datasets.items():
            # A fresh iterator per epoch restarts each evaluation dataset.
            eval_iterator = iter(eval_dataset)
            eval_step(eval_iterator, dataset_name,
                      steps_per_eval[dataset_name])

        metrics_history['epoch'].append(epoch + 1)
        with summary_writer.as_default():
            for name, metric in metrics.items():
                result = utils.get_metric_result_value(metric)
                tf.summary.scalar(name, result, step=epoch + 1)
                # Results are stored as strings in the history.
                metrics_history[name].append(str(result))

        for metric in metrics.values():
            metric.reset_states()

        model.save(os.path.join(model_dir, f'model_{epoch + 1}'),
                   overwrite=True)

    utils.write_params(metrics_history,
                       os.path.join(model_dir, 'metrics_history.json'))
def write_scalars(writer: tf.summary.SummaryWriter, scalar_dict, step):
    """Log every entry of ``scalar_dict`` as a scalar, then flush the writer.

    Args:
        writer: summary writer made the default while logging.
        scalar_dict: mapping from summary tag to scalar value.
        step: step attached to every scalar.
    """
    with writer.as_default():
        for tag, value in scalar_dict.items():
            tf.summary.scalar(tag, value, step=step)
    writer.flush()
def train(agent: Agent,
          env: ContinuousSimulation,
          log_every: int,
          test_every: int,
          target_network_update_freq: int,
          max_ts: int,
          writer: tf.summary.SummaryWriter = None,
          env_seeds: Iterator[int] = None,
          test: Callable[[Agent], None] = None,
          **kwargs) -> Agent:
    """
    Train ``agent`` in ``env`` for ``max_ts`` timesteps.

    :param agent: Agent
        agent to be trained; its ``remember``, ``optimize``, ``step`` and
        ``log`` methods are called here
    :param env: ContinuousSimulation
        environment to train in
    :param log_every: int
        summary scalars are written every ``log_every`` timesteps
    :param test_every: int
        every ``test_every`` timesteps ``test`` is called on the agent
    :param target_network_update_freq: int
        only used here to report ``ts // target_network_update_freq`` as the
        'num updates' scalar
    :param max_ts: int
        training stops after ``max_ts`` timesteps
    :param writer: tf.summary.SummaryWriter
        writer to log with; passed through ``optional_writer`` first
    :param env_seeds:
        seeds for the environment, consumed in order; no seeding if not
        provided. Assumed to hold enough seeds for the whole run
    :param test: Callable[[Agent], None]
        test function; no testing is done if not provided
    :param kwargs: forwarded to ``initial_log``
    :return: the Agent, after training
    """
    # NOTE(review): episode_reward is never read in this function.
    episode_reward = 0
    tf.summary.experimental.set_step(0)
    # Called with the raw writer, before optional_writer() wraps it.
    initial_log(agent, env, writer, **kwargs)
    if env_seeds is not None:
        env.seed(next(env_seeds))
    writer = optional_writer(writer)
    state = env.reset()
    context = env.unwrapped.state()
    n_eps_completed: int = 0
    for ts in range(1, max_ts + 1):
        action = agent.act(state, context, mode='train', network='q')
        next_state, reward, done, _ = env.step(int(action))
        next_context = env.unwrapped.state()
        agent.remember(state, context, action, reward, next_state,
                       next_context, done)
        state, context = next_state, next_context
        if done:
            n_eps_completed += 1
            state = env.reset()
            context = env.unwrapped.state()
            # NOTE(review): here the seed is set after reset(), whereas at
            # start-up it is set before reset() -- confirm this is intended.
            if env_seeds is not None:
                env.seed(next(env_seeds))
        # Optimization only starts after the first 100 timesteps.
        if ts > 100:
            agent.optimize()
        agent.step(ts)
        with writer.as_default():
            if ts % log_every == 0:
                # Scalars below carry no explicit step; they use this one.
                tf.summary.experimental.set_step(ts)
                tf.summary.scalar('global timestep', ts)
                tf.summary.scalar('num updates',
                                  ts // target_network_update_freq)
                tf.summary.scalar('num episodes', n_eps_completed)
                agent.log(ts, writer)
            if ts % test_every == 0 and test is not None:
                test(agent)
    return agent
def main(strategy: tf.distribute.MirroredStrategy, global_step: tf.Tensor,
         train_writer: tf.summary.SummaryWriter,
         eval_writer: tf.summary.SummaryWriter, train_batch_size: int,
         eval_batch_size: int, job_dir: str, dataset_dir: str,
         dataset_filename: str, num_epochs: int, summary_steps: int,
         log_steps: int, dataset_spec: DatasetSpec, model: tf.keras.Model,
         loss_fn: tf.keras.losses.Loss,
         optimizer: tf.keras.optimizers.Optimizer):
    """Train ``model`` for ``num_epochs`` epochs, evaluating after each one.

    Restores the latest checkpoint found in ``job_dir``, then alternates a
    training pass over the train split with an evaluation pass over the
    test split. Whenever the evaluation accuracy is at least the best seen
    so far, a checkpoint with the ``-best`` suffix is saved.

    Args:
        strategy: distribution strategy used to distribute the datasets and
            reduce the per-replica losses.
        global_step: step counter; incremented once per training batch,
            stored in the checkpoint and used as the summary step.
        train_writer: default writer during the training pass.
        eval_writer: default writer during the evaluation pass.
        train_batch_size: batch size of the training dataset; also the
            divisor of the summed training loss.
        eval_batch_size: batch size of the test dataset.
        job_dir: checkpoint directory.
        dataset_dir: passed to ``Reader``.
        dataset_filename: passed to ``Reader``.
        num_epochs: number of epochs to run.
        summary_steps: the training loss is summarized when ``global_step``
            is a multiple of this value.
        log_steps: the loss is printed when ``global_step`` is a multiple of
            this value.
        dataset_spec: parses the raw datasets before batching.
        model: model to train.
        loss_fn: called as ``loss_fn(labels, logits)``.
        optimizer: applies the gradients.
    """
    # Define metrics
    eval_metric = tf.keras.metrics.CategoricalAccuracy()
    # Best accuracy so far, initialized from the fresh metric's result.
    best_metric = tf.Variable(eval_metric.result())

    # Define training loop
    # NOTE(review): distributed_run is a project helper; presumably it runs
    # the step on every replica of `strategy` -- confirm.
    @distributed_run(strategy)
    def train_step(inputs):
        with tf.GradientTape() as tape:
            images, labels = inputs
            logits = model(images)
            cross_entropy = loss_fn(labels, logits)
            loss = tf.reduce_sum(cross_entropy) / train_batch_size

        # NOTE(review): gradients are taken w.r.t. model.variables rather
        # than model.trainable_variables -- confirm this is intended.
        gradients = tape.gradient(loss, model.variables)
        optimizer.apply_gradients(zip(gradients, model.variables))

        if global_step % summary_steps == 0:
            tf.summary.scalar('loss', loss, step=global_step)

        return loss

    @distributed_run(strategy)
    def eval_step(inputs, metric):
        images, labels = inputs
        logits = model(images)
        metric.update_state(labels, logits)

    # Build input pipeline
    train_reader = Reader(dataset_dir, dataset_filename, split=Split.Train)
    test_reader = Reader(dataset_dir, dataset_filename, split=Split.Test)
    train_dataset = train_reader.read()
    test_dataset = test_reader.read()

    @unpack_dict
    def map_fn(_id, image, label):
        # Cast the image to float32 and divide by 255; the id is dropped.
        return tf.cast(image, tf.float32) / 255., label

    train_dataset = dataset_spec.parse(train_dataset).batch(
        train_batch_size).map(map_fn)
    test_dataset = dataset_spec.parse(test_dataset).batch(eval_batch_size).map(
        map_fn)

    #################
    # Training loop #
    #################
    # Define checkpoint
    checkpoint = tf.train.Checkpoint(optimizer=optimizer,
                                     model=model,
                                     global_step=global_step,
                                     best_metric=best_metric)
    # Restore the model
    checkpoint_dir = job_dir
    checkpoint_prefix = os.path.join(checkpoint_dir, 'ckpt')
    checkpoint.restore(tf.train.latest_checkpoint(checkpoint_dir))

    # Prepare dataset for distributed run
    train_dataset = strategy.experimental_distribute_dataset(train_dataset)
    test_dataset = strategy.experimental_distribute_dataset(test_dataset)

    with CheckpointHandler(checkpoint, checkpoint_prefix):
        for epoch in range(num_epochs):
            print('---------- Epoch: {} ----------'.format(epoch + 1))

            print('Starting training for epoch: {}'.format(epoch + 1))
            with train_writer.as_default():
                for inputs in tqdm(train_dataset,
                                   initial=global_step.numpy(),
                                   desc='Training',
                                   unit=' steps'):
                    per_replica_losses = train_step(inputs)
                    # Despite the name, this is the SUM over replicas.
                    mean_loss = strategy.reduce(tf.distribute.ReduceOp.SUM,
                                                per_replica_losses, None)
                    if global_step.numpy() % log_steps == 0:
                        print('Loss: {}'.format(mean_loss.numpy()))

                    # Increment global step
                    global_step.assign_add(1)

            print('Starting evaluation for epoch: {}'.format(epoch + 1))
            with eval_writer.as_default():
                for inputs in tqdm(test_dataset, desc='Evaluating'):
                    eval_step(inputs, eval_metric)

                accuracy = eval_metric.result()
                print('Accuracy: {}'.format(accuracy.numpy()))
                tf.summary.scalar('accuracy', accuracy, step=global_step)

                # Ties count as a new best, so equal accuracy is saved too.
                if accuracy >= best_metric:
                    checkpoint.save(file_prefix=checkpoint_prefix + '-best')
                    print('The best model saved: {} is higher than {}'.format(
                        accuracy.numpy(), best_metric.numpy()))
                    best_metric.assign(accuracy)

                # Start the next epoch's evaluation from a clean metric.
                eval_metric.reset_states()
def diagnostic(agent: Agent,
               env: ContinuousSimulation,
               seeds: Iterator[int] = None,
               writer: tf.summary.SummaryWriter = None) -> None:
    """
    Run one episode per seed and log aggregate statistics.

    :param agent: Agent
        agent to evaluate; only ``act`` and, when present,
        ``compute_td_loss`` are called on it
    :param env: ContinuousSimulation
        environment to test in
    :param seeds:
        seeds to reset the environment with, one episode per seed;
        defaults to range(0, 10)
    :param writer: tf.summary.SummaryWriter
        writer to use as context; passed through ``optional_writer`` first
    :return: None; histograms and scalars are written via ``tf.summary``
    """
    if seeds is None:
        seeds = iter(range(0, 10))
    writer = optional_writer(writer)
    seed = next(seeds, None)
    rewards: List[float] = []
    losses: List[float] = []
    # The replay buffer is shared by all episodes and seeded with the first
    # seed only.
    replay_buffer = ReplayBuffer(100000, seed)
    summaries: List[Dict] = []
    while seed is not None:
        env.seed(seed)
        state = env.reset(logging=True)
        context = env.unwrapped.state()
        episode_reward: float = 0
        done = False
        while not done:
            # NOTE(review): the second argument is a literal 0, not
            # `context` -- confirm this matches the agent's act() signature.
            action = agent.act(state, 0)
            next_state, reward, done, _ = env.step(int(action))
            next_context = env.unwrapped.state()
            episode_reward += reward
            replay_buffer.push(state, context, action, reward, next_state,
                               next_context, done)
            state, context = next_state, next_context
        rewards.append(episode_reward)
        # Only agents exposing compute_td_loss contribute a loss value.
        if hasattr(agent, 'compute_td_loss'):
            loss = agent.compute_td_loss(*replay_buffer.sample(50))
            losses.append(loss.numpy())
        seed = next(seeds, None)
        summaries.append(env.summary())

    # Aggregate per-episode summaries: numeric entries are collected into a
    # list, everything else is passed through flatmap().
    agg_summary = {}
    for key in summaries[0]:
        if isinstance(summaries[0][key], Number):  # dtype is a number
            agg_summary[key] = list(map(lambda s: s[key], summaries))
        else:
            agg_summary[key] = flatmap(map(lambda s: s[key], summaries))

    max_ts: int = max(map(lambda s: len(s['actual queries']), summaries))
    # Per-timestep sum, minimum and maximum over all episodes.
    # NOTE(review): every summary is indexed up to max_ts, so episodes are
    # presumably all the same length -- confirm.
    for key in ['original queries', 'actual queries']:
        agg_summary[key] = np.zeros(max_ts)
        agg_summary['min ' + key] = np.full(max_ts, np.inf)
        agg_summary['max ' + key] = np.full(max_ts, -np.inf)
        for s in summaries:
            for t in range(max_ts):
                agg_summary[key][t] += s[key][t]
                agg_summary['max ' + key][t] = max(
                    agg_summary['max ' + key][t], s[key][t])
                agg_summary['min ' + key][t] = min(
                    agg_summary['min ' + key][t], s[key][t])

    # TODO: this assumes that all stations are ordered in the same order, which may not be correct
    max_stations: int = max(map(lambda s: s['n_stations'], summaries))
    agg_summary['recommendation freq'] = np.zeros(max_stations)
    for summ in summaries:
        agg_summary['recommendation freq'][0:summ['n_stations']] += summ[
            'recommendation freq']

    # these values count the number of queries per timestep summed over all episodes
    # NOTE(review): `histograms` and the np.histogram results below are not
    # read again within this function.
    histograms: List[str] = [
        'distances travelled', 'timesteps travelled', 'nearest distances'
    ]
    dist_histogram = np.histogram(agg_summary['distances travelled'], bins=20)
    near_histogram = np.histogram(agg_summary['nearest distances'], bins=20)
    exp_bins: np.ndarray = 2**np.arange(0, 10)
    exp_bins = np.insert(exp_bins, 0, 0,
                         axis=0)  # [0, 1, 2, 4, 8, 16, 32, 64, 128, 256, 512]
    failed_histogram = np.histogram(agg_summary['failed dispatches'],
                                    bins=exp_bins)
    organic_histogram = np.histogram(agg_summary['organic fails'],
                                     bins=exp_bins)
    time_histogram = np.histogram(agg_summary['timesteps travelled'],
                                  bins=np.arange(max_ts + 1))

    # No explicit step is passed to the summaries below.
    with writer.as_default():
        tf.summary.histogram('distances travelled',
                             agg_summary['distances travelled'])
        tf.summary.histogram('nearest distances',
                             agg_summary['nearest distances'])
        tf.summary.histogram('timesteps travelled',
                             agg_summary['timesteps travelled'])
        tf.summary.histogram('failed dispatches',
                             agg_summary['failed dispatches'])
        tf.summary.histogram('organic fails', agg_summary['organic fails'])
        tf.summary.scalar('Reward', np.mean(rewards))
        if hasattr(agent, 'compute_td_loss'):
            tf.summary.scalar('loss', np.mean(losses))
def run(
        train_dataset: tf.data.Dataset,
        eval_datasets: Dict[str, tf.data.Dataset],
        steps_per_eval: Dict[str, int],
        params: utils.ModelParameters,
        model_dir: str,
        gp_layer_kwargs: Dict[str, Any],
        strategy: tf.distribute.Strategy,
        summary_writer: tf.summary.SummaryWriter,
        loss_type: str,
        use_spec_norm: bool,
        spec_norm_multiplier: float,
        use_spec_norm_mp: bool,
        spec_norm_multiplier_mp: float):
    """Trains and evaluates the model.

    Each epoch runs ``params.steps_per_epoch`` training steps, evaluates
    every dataset in ``eval_datasets``, writes all metric results as scalars
    at step ``epoch + 1``, resets the metrics and saves the model. The
    metric history is written to ``metrics_history.json`` at the end.

    Args:
      train_dataset: tf dataset that provides training data.
      eval_datasets: A dictionary of tf datasets that provides data for model
        evaluation.
      steps_per_eval: A dictionary of steps needed for each evaluation dataset.
      params: ModelParameters object containing MPNN model parameters.
      model_dir: Directory for files generated during training and evaluation.
      gp_layer_kwargs: A dictionary of parameters used for GP layer.
      strategy: tf Distributed training strategy object.
      summary_writer: tf summary writer to log training and evaluation metrics.
      loss_type: str, loss type to use during training. 'focal' selects the
        sigmoid focal cross-entropy; any other value uses cross-entropy.
      use_spec_norm: Whether to use Spectral normalization for the dense layer.
      spec_norm_multiplier: Multiplier used to control the magnitude of
        eigenvalue of the dense layer weight matrix.
      use_spec_norm_mp: Whether to use Spectral normalization for the MP layer.
      spec_norm_multiplier_mp: Multiplier used to control the magnitude of
        eigenvalue of the MP layer weight matrix.
    """
    with strategy.scope():
        model = ub.models.mpnn(
            nodes_shape=train_dataset.element_spec[0]['atoms'].shape[1:],
            edges_shape=train_dataset.element_spec[0]['pairs'].shape[1:],
            num_heads=params.num_heads,
            num_layers=params.num_layers,
            message_layer_size=params.message_layer_size,
            readout_layer_size=params.readout_layer_size,
            use_gp_layer=params.use_gp_layer,
            gp_layer_kwargs=gp_layer_kwargs,
            use_spec_norm=use_spec_norm,
            spec_norm_multiplier=spec_norm_multiplier,
            use_spec_norm_mp=use_spec_norm_mp,
            spec_norm_multiplier_mp=spec_norm_multiplier_mp)
        optimizer = tf.keras.optimizers.RMSprop(learning_rate=params.learning_rate)
        # Training metrics; evaluation metrics are added per dataset below.
        metrics = {
            'train/negative_log_likelihood': tf.keras.metrics.Mean(),
            'train/accuracy': tf.keras.metrics.CategoricalAccuracy(),
            'train/loss': tf.keras.metrics.Mean(),
            'train/roc_auc': tf.keras.metrics.AUC(),
        }

        for dataset_name in eval_datasets:
            metrics[
                f'{dataset_name}/accuracy'] = tf.keras.metrics.CategoricalAccuracy()
            metrics[f'{dataset_name}/roc_auc'] = tf.keras.metrics.AUC()
            metrics[
                f'{dataset_name}/negative_log_likelihood'] = tf.keras.metrics.Mean()
            # The dataset named 'test2' gets fewer calibration bins.
            if dataset_name == 'test2':
                ece_num_bins = 5
            else:
                ece_num_bins = 10
            metrics[f'{dataset_name}/ece'] = rm.metrics.ExpectedCalibrationError(
                num_bins=ece_num_bins)
            metrics[f'{dataset_name}/brier'] = rm.metrics.Brier()

    def per_replica_train_step_fn(inputs):
        """Per-Replica StepFn."""
        if len(inputs) == 3:
            features, labels, sample_weights = inputs
        else:
            features, labels = inputs
            # No weights supplied: weight every example equally.
            sample_weights = 1

        with tf.GradientTape() as tape:
            probs = model(features, training=True)
            negative_log_likelihood = tf.reduce_mean(
                tf.keras.losses.categorical_crossentropy(labels, probs) *
                sample_weights)
            l2_loss = sum(model.losses)
            if loss_type == 'focal':
                focal_loss_fn = tfa_losses.SigmoidFocalCrossEntropy()
                focal_loss = tf.reduce_mean(
                    focal_loss_fn(labels, probs) * sample_weights)
                loss = focal_loss + l2_loss
            else:
                loss = negative_log_likelihood + l2_loss
            # Scale the loss given the tf.distribute.Strategy will reduce sum all
            # gradients. See details in
            # https://www.tensorflow.org/tutorials/distribute/custom_training#define_the_loss_function
            scaled_loss = loss / strategy.num_replicas_in_sync

        grads = tape.gradient(scaled_loss, model.trainable_variables)
        optimizer.apply_gradients(zip(grads, model.trainable_variables))
        # Metrics track the unscaled loss.
        metrics['train/loss'].update_state(loss)
        metrics['train/negative_log_likelihood'].update_state(
            negative_log_likelihood)
        metrics['train/accuracy'].update_state(labels, probs)
        # ROC AUC uses column 1 of labels and probabilities.
        metrics['train/roc_auc'].update_state(labels[:, 1], probs[:, 1])

    def per_replica_eval_step_fn(inputs, dataset_name):
        """Per-Replica StepFn."""
        if len(inputs) == 3:
            # Sample weights are ignored during evaluation.
            features, labels, _ = inputs
        else:
            features, labels = inputs

        probs = model(features, training=False)
        negative_log_likelihood = tf.reduce_mean(
            tf.keras.losses.categorical_crossentropy(labels, probs))

        metrics[f'{dataset_name}/negative_log_likelihood'].update_state(
            negative_log_likelihood)
        metrics[f'{dataset_name}/accuracy'].update_state(labels, probs)
        metrics[f'{dataset_name}/roc_auc'].update_state(labels[:, 1], probs[:, 1])
        metrics[f'{dataset_name}/ece'].add_batch(probs[:, 1], label=labels[:, 1])
        metrics[f'{dataset_name}/brier'].add_batch(probs, label=labels[:, 1])

    @tf.function
    def distributed_train_step(iterator):
        """Training StepFn."""
        for _ in tf.range(tf.cast(params.steps_per_epoch, tf.int32)):
            strategy.run(per_replica_train_step_fn, args=(next(iterator),))

    @tf.function
    def distributed_eval_step(iterator, dataset_name, num_steps):
        """Evaluation StepFn."""
        for _ in tf.range(tf.cast(num_steps, tf.int32)):
            strategy.run(
                per_replica_eval_step_fn, args=(next(iterator), dataset_name))

    # Makes datasets into distributed version.
    train_dataset = strategy.experimental_distribute_dataset(train_dataset)
    eval_datasets = {
        ds_name: strategy.experimental_distribute_dataset(ds)
        for ds_name, ds in eval_datasets.items()
    }
    logging.info('Number of replicas in sync: %s', strategy.num_replicas_in_sync)

    # One training iterator is shared across all epochs.
    train_iterator = iter(train_dataset)
    start_time = time.time()
    metrics_history = collections.defaultdict(list)
    for epoch in range(params.num_epochs):
        logging.info('Starting to run epoch: %s', epoch)
        distributed_train_step(train_iterator)

        # Progress report with a throughput-based ETA.
        current_step = (epoch + 1) * params.steps_per_epoch
        max_steps = params.steps_per_epoch * params.num_epochs
        time_elapsed = time.time() - start_time
        steps_per_sec = float(current_step) / time_elapsed
        eta_seconds = (max_steps - current_step) / steps_per_sec
        message = ('{:.1%} completion: epoch {:d}/{:d}. {:.1f} steps/s. '
                   'ETA: {:.0f} min. Time elapsed: {:.0f} min'.format(
                       current_step / max_steps, epoch + 1, params.num_epochs,
                       steps_per_sec, eta_seconds / 60, time_elapsed / 60))
        logging.info(message)

        # Start evaluation.
        logging.info('Starting to run eval at epoch: %s', epoch)
        for dataset_name, eval_dataset in eval_datasets.items():
            # A fresh iterator per epoch restarts each evaluation dataset.
            eval_iterator = iter(eval_dataset)
            distributed_eval_step(eval_iterator, dataset_name,
                                  steps_per_eval[dataset_name])

        metrics_history['epoch'].append(epoch + 1)
        with summary_writer.as_default():
            for name, metric in metrics.items():
                result = utils.get_metric_result_value(metric)
                tf.summary.scalar(name, result, step=epoch + 1)
                # Results are stored as strings in the history.
                metrics_history[name].append(str(result))

        for metric in metrics.values():
            metric.reset_states()

        model.save(os.path.join(model_dir, f'model_{epoch + 1}'), overwrite=True)

    utils.write_params(metrics_history,
                       os.path.join(model_dir, 'metrics_history.json'))
def run_dynamics(
        dynamics: GaugeDynamics,
        flags: dict[str, Any],
        writer: tf.summary.SummaryWriter = None,
        x: tf.Tensor = None,
        beta: float = None,
        save_x: bool = False,
        md_steps: int = 0,
) -> (InferenceResults):
    """Run inference on trained dynamics.

    Only the chief process does any work; every other process returns an
    ``InferenceResults`` filled with ``None``.

    Args:
        dynamics: trained dynamics; its ``test_step`` is run repeatedly.
        flags: run configuration. Keys read here: ``'print_steps'``
            (default 5), ``'beta'`` / ``'beta_final'``, ``'compile'``
            (default True) and ``'run_steps'`` (default 20000).
        writer: if given, set as the default summary writer.
        x: initial state. When omitted, it is drawn uniformly from
            ``[-PI, PI)`` with shape ``dynamics.x_shape``.
        beta: value passed to every step. Falls back to ``flags`` and
            finally to 1.0.
        save_x: when True, the state after every step is kept as a NumPy
            array in the returned ``x_arr``.
        md_steps: number of MD updates (without accept/reject) applied
            before inference starts.

    Returns:
        ``InferenceResults`` with the dynamics, the final state, the saved
        states, the collected run data and the printed metric strings.

    Raises:
        AssertionError: if ``beta`` is not a float, or the initial state is
            not a tensor.
    """
    if not IS_CHIEF:
        return InferenceResults(None, None, None, None, None)

    # -- Setup -----------------------------
    print_steps = flags.get('print_steps', 5)
    if beta is None:
        beta = flags.get('beta', flags.get('beta_final', None))  # type: float

    if beta is None:
        logger.warning('beta unspecified! setting to 1')
        beta = 1.

    assert beta is not None and isinstance(beta, float)

    test_step = dynamics.test_step
    if flags.get('compile', True):
        test_step = tf.function(dynamics.test_step)
        io.log('Compiled `dynamics.test_step` using tf.function!')

    if x is None:
        # The bounds must be passed by keyword: unpacking them positionally
        # next to `shape=` binds the first bound to `shape` as well and
        # raises TypeError.
        x = tf.random.uniform(shape=dynamics.x_shape, minval=-PI, maxval=PI)

    assert tf.is_tensor(x)

    run_steps = flags.get('run_steps', 20000)
    run_data = DataContainer(run_steps)

    template = '\n'.join([f'beta={beta}',
                          f'net_weights={dynamics.net_weights}'])
    logger.info(f'Running inference with {template}')

    # Run `md_steps` MD updates (w/o accept/reject)
    # to ensure chains don't get stuck
    if md_steps > 0:
        for _ in range(md_steps):
            mc_states, _ = dynamics.md_update((x, beta), training=False)
            x = mc_states.out.x

    # First call of the (possibly compiled) step; if it fails, fall back to
    # the uncompiled `dynamics.test_step` for the rest of the run.
    try:
        x, metrics = test_step((x, tf.constant(beta)))
    except Exception as err:  # pylint:disable=broad-except
        logger.warning(err)
        test_step = dynamics.test_step
        x, metrics = test_step((x, tf.constant(beta)))

    x_arr = []

    def timed_step(x: tf.Tensor, beta: tf.Tensor):
        """Run one test step and attach its wall-clock duration as `dt`."""
        start = time.time()
        x, metrics = test_step((x, tf.constant(beta)))
        metrics.dt = time.time() - start
        # Compute the charges here when the step did not report them.
        if 'sin_charges' not in metrics:
            charges = dynamics.lattice.calc_both_charges(x=x)
            metrics['charges'] = charges.intQ
            metrics['sin_charges'] = charges.sinQ
        if save_x:
            x_arr.append(x.numpy())

        return x, metrics

    # Summaries are written every `summary_steps` steps (at least 50 apart).
    summary_steps = max(run_steps // 100, 50)

    if writer is not None:
        writer.set_as_default()

    steps = tf.range(run_steps, dtype=tf.int64)
    keep_ = ['step', 'dt', 'loss', 'accept_prob', 'beta',
             'dq_int', 'dq_sin', 'dQint', 'dQsin', 'plaqs', 'p4x4']

    beta = tf.constant(beta, dtype=TF_FLOAT)  # type: tf.Tensor
    data_strs = []
    for step in steps:
        x, metrics = timed_step(x, beta)
        run_data.update(step, metrics)  # update data after every accept/reject

        if step % summary_steps == 0:
            update_summaries(step, metrics, dynamics)

        if step % print_steps == 0:
            pre = [f'{step}/{steps[-1]}']
            ms = run_data.print_metrics(metrics, pre=pre, keep=keep_)
            data_strs.append(ms)

    return InferenceResults(dynamics=dynamics, x=x, x_arr=x_arr,
                            run_data=run_data, data_strs=data_strs)
def tensorboard_histogram(writer: tf.summary.SummaryWriter, name: str,
                          data: tf.Tensor, step: int):
    """Log one histogram through ``writer``.

    Args:
        writer: summary writer made the default while logging.
        name: tag of the histogram.
        data: values to summarize.
        step: step attached to the histogram.
    """
    default_scope = writer.as_default()
    with default_scope:
        tf.summary.histogram(name, data, step=step)
def save_dict_to_tensorboard(event_writer: tf.summary.SummaryWriter,
                             dict: Dict, step: int):
    """Write every entry of ``dict`` as a scalar summary and flush the writer.

    The writer is made the default once for the whole batch instead of once
    per key, and it is flushed a single time after all scalars are written.

    Args:
        event_writer: summary writer receiving the scalars.
        dict: mapping from summary tag to scalar value. The parameter name
            shadows the builtin; it is kept for keyword-argument callers.
        step: step attached to every scalar.
    """
    with event_writer.as_default():
        for key, val in dict.items():
            tf.summary.scalar(name=key, data=val, step=step)
    event_writer.flush()