def log_run(split: str, epoch: int, writer: tf.summary.SummaryWriter,
            label_names: Sequence[str], metrics: MutableMapping[str, float],
            heaps: Mapping[str,
                           Mapping[int,
                                   List[HeapItem]]], cm: np.ndarray) -> None:
    """Logs the outputs (metrics, confusion matrix, tp/fp/fn images) from a
    single epoch run to Tensorboard.

    Args:
        metrics: dict, keys already prefixed with {split}/
    """
    per_class_recall = recall_from_confusion_matrix(cm, label_names)
    metrics.update(prefix_all_keys(per_class_recall, f'{split}/label_recall/'))

    # log metrics
    for metric, value in metrics.items():
        tf.summary.scalar(metric, value, epoch)

    # log confusion matrix
    cm_fig = plot_utils.plot_confusion_matrix(cm,
                                              classes=label_names,
                                              normalize=True)
    cm_fig_img = tf.convert_to_tensor(fig_to_img(cm_fig)[np.newaxis, ...])
    tf.summary.image(f'confusion_matrix/{split}', cm_fig_img, step=epoch)

    # log tp/fp/fn images
    for heap_type, heap_dict in heaps.items():
        log_images_with_confidence(heap_dict,
                                   label_names,
                                   epoch=epoch,
                                   tag=f'{split}/{heap_type}')
    writer.flush()
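The helpers used above (recall_from_confusion_matrix, prefix_all_keys, fig_to_img, log_images_with_confidence) come from the surrounding project and are not shown. As a rough sketch of one of them, prefix_all_keys presumably just rebuilds the mapping with the given prefix on every key:

from typing import Dict, Mapping

def prefix_all_keys(d: Mapping[str, float], prefix: str) -> Dict[str, float]:
    # Hypothetical sketch; the project's real helper may differ.
    return {f'{prefix}{k}': v for k, v in d.items()}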
Example #2
@contextlib.contextmanager  # required for this generator to act as a context manager
def trace_graph(
        writer: tf.summary.SummaryWriter) -> Generator[None, None, None]:
    """Context manager that traces the graph for a model constructed within it."""
    tf.summary.trace_on(graph=True, profiler=False)
    yield
    with writer.as_default():
        tf.summary.trace_export(name="graph", step=0)
Example #3
def plotProcessLoop(summaryWriter: tf.summary.SummaryWriter, drawHistogramProcess=None, iteration=0):
    item = QueueErlang.get()
    if item[0] == "histogram":
        [_, title, dataDict] = item
        if drawHistogramProcess is not None:
            drawHistogramProcess.terminate()  # it can only be a living process from a previous iteration
        QueueData = Queue()
        drawHistogramProcess = Process(target=drawHistogramHelper, args=(QueueData,))
        drawHistogramProcess.start()
        QueueData.put(title)
        QueueData.put(dataDict)
        plotProcessLoop(summaryWriter, drawHistogramProcess, iteration)  # recursive call for further instructions
    elif item[0] == "stopHistogram":
        if drawHistogramProcess is not None:
            drawHistogramProcess.terminate()
        else:
            print("From python: got stopHistogram but no histogram process is running")
        plotProcessLoop(summaryWriter, None, iteration)
    elif item[0] == "plot":
        [_, income, expence] = item
        with summaryWriter.as_default():
            tf.summary.scalar('income', income, step=iteration)
            tf.summary.scalar('expence', expence, step=iteration)
        plotProcessLoop(summaryWriter, drawHistogramProcess, iteration + 1)
    elif item[0] == "terminate":
        if drawHistogramProcess is not None:
            drawHistogramProcess.terminate()
        return
    else:
        plotProcessLoop(summaryWriter, drawHistogramProcess, iteration)  # ignore unknown messages
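The loop consumes list-shaped messages whose first element names a command; a hypothetical producer (QueueErlang is assumed to be a multiprocessing.Queue shared with the loop process) might feed it like this:

QueueErlang.put(["plot", 1200.0, 850.0])            # log one income/expence pair
QueueErlang.put(["histogram", "ages", {"20s": 3}])  # start a histogram process
QueueErlang.put(["terminate"])                      # shut the loop down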
Example #4
@contextlib.contextmanager  # required for this generator to act as a context manager
def trace_profile(
        writer: tf.summary.SummaryWriter) -> Generator[None, None, None]:
    """Context manager that profiles a model called within it."""
    logger.debug('Running profiler...')
    tf.summary.trace_on(graph=False, profiler=True)
    yield
    with writer.as_default():
        tf.summary.trace_export(name="profile", step=0)
Example #5
def create_summary(writer: tf.summary.SummaryWriter,
                   optimizer_name: str,
                   nb_img_utilisees,
                   optimizer_parameters: Dict,
                   loss: str,
                   metriques_utilisees: List[str],
                   but_essai: str,
                   informations_additionnelles: str,
                   id: str,
                   dataset_name: str = "",
                   taille_x_img: int = 1600,
                   taille_y_img: int = 900,
                   taille_x_img_redim: int = 400,
                   taille_y_img_redim: int = 225,
                   batch_size=10,
                   nb_img_tot=173959,
                   nb_epochs=1):
    markdown = f"""# Résumé de l'entrainement du {id}

Entrainement sur {dataset_name} ({nb_img_tot} images ; {min(nb_img_utilisees,nb_img_tot)} utilisées) avec des images de taille {taille_x_img} px par {taille_y_img} px redimensionnées à {taille_x_img_redim} px x {taille_y_img_redim} px
Batch size de {batch_size}


## Paramètres d'entrainement

Entrainement sur {nb_epochs} epochs

Optimisateur {optimizer_name} avec les paramètres :\n"""
    for k, v in optimizer_parameters.items():
        markdown += f"{k} : {v}"
    markdown += f"""\nLoss : {loss}

Métriques : """
    markdown += ", ".join([f"{metrique}" for metrique in metriques_utilisees])
    markdown += f"""\n## Description de l'essai\n\n{but_essai}\n\n{informations_additionnelles}"""

    with writer.as_default():
        tf.summary.text("Resume", markdown, step=0)
        writer.flush()
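A hypothetical call, with the writer pointed at the run's log directory:

writer = tf.summary.create_file_writer('logs/run_2021_01_01')  # assumed path
create_summary(writer, 'Adam', 10000, {'learning_rate': 1e-3},
               'categorical_crossentropy', ['accuracy'],
               'baseline run', 'no augmentation', '2021_01_01')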
Example #6
    def log_metrics(self, writer: tf.summary.SummaryWriter,
                    dataset: tf.data.Dataset):
        data = next(iter(dataset))
        loss, bboxes, scores, class_ids, valid_detections = self.validation(
            data['image'], data['label'])

        gt_boxes = data["bbox"]
        num_of_gt_boxes = data["num_of_bbox"]

        # calculate mAP
        for frame in zip(bboxes.numpy(), class_ids.numpy(), scores.numpy(),
                         valid_detections.numpy(), gt_boxes.numpy(),
                         num_of_gt_boxes.numpy()):
            pred_bbox, pred_cls, pred_score, valid_detection, gt_box, num_of_gt_box = frame

            # keep only the valid predictions and labels for this frame
            pred_bbox = pred_bbox[:valid_detection]
            pred_cls = pred_cls[:valid_detection]
            pred_score = pred_score[:valid_detection]
            gt_box = gt_box[:num_of_gt_box]
            gt_bbox = gt_box[..., :4]
            gt_class_id = gt_box[..., 4]

            # evaluate this frame's detections against its ground truth
            frame = pred_bbox, pred_cls, pred_score, gt_bbox, gt_class_id
            self.mAP.evaluate(*frame)

        mean_average_precision = self.mAP.get_mAP()
        self.mAP.reset_accumulators()

        # plot image
        pred_image = self.plot_bounding_box(data['image'], bboxes, scores,
                                            class_ids, valid_detections)
        gt_image = self.plot_bounding_box(data['image'], gt_boxes[..., :4],
                                          tf.ones_like(scores),
                                          gt_boxes[..., 4], num_of_gt_boxes)

        # log tensorboard
        step = int(self.ckpt.step)
        with writer.as_default():
            tf.summary.scalar("lr", self.optimizer.lr(step), step=step)
            tf.summary.scalar('loss', loss, step=step)
            tf.summary.scalar('mean loss',
                              loss.numpy() / self.batch_size,
                              step=step)
            tf.summary.scalar('mAP', mean_average_precision, step=step)
            tf.summary.image("Display pred bounding box",
                             pred_image,
                             step=step)
            tf.summary.image("Display gt bounding box", gt_image, step=step)
Example #7
    @staticmethod
    def write_net_weights(writer: tf.summary.SummaryWriter, namespace: str,
                          net_name: str, val_list: list[np.ndarray],
                          epoch: int) -> None:
        """TENSORBOARD METHOD: writes histograms of the net's weights."""
        W, B, names_layers = val_list[0::2], val_list[1::2], [
            f'{net_name} L{i}' for i in range(len(val_list) // 2)
        ]
        assert len(names_layers) == len(W) == len(B)

        with writer.as_default():
            for n, w, b in zip(names_layers, W, B):
                with tf.name_scope(f'{namespace}: Weights'):
                    tf.summary.histogram(n, w, step=epoch)
                with tf.name_scope(f'{namespace}: Biases'):
                    tf.summary.histogram(n, b, step=epoch)
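A hypothetical call: the [0::2]/[1::2] slicing assumes val_list interleaves weight and bias arrays as [W0, b0, W1, b1, ...]. Net stands in for the (omitted) enclosing class:

import numpy as np

writer = tf.summary.create_file_writer('logs/weights')  # assumed log directory
Net.write_net_weights(writer, 'train', 'mlp',
                      [np.ones((4, 8)), np.zeros(8),   # layer 0: W0, b0
                       np.ones((8, 2)), np.zeros(2)],  # layer 1: W1, b1
                      epoch=0)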
Example #8
def write_summary_scalar(
    metric_name: str,
    metric_value: Union[List[float], float],
    step: int,
    summary_writer: tf.summary.SummaryWriter,
):
    """Write a scalar summary statistic to a tensorboard directory."""
    with summary_writer.as_default():
        # For a list of values, log only the most recent one.
        value = metric_value[-1] if isinstance(metric_value, list) else metric_value
        tf.compat.v2.summary.scalar(name=metric_name, data=value, step=step)
Example #9
    @staticmethod
    def write_scalars(writer: tf.summary.SummaryWriter,
                      metrics: dict[str, float], epoch: int) -> None:
        """TENSORBOARD METHOD: writes scalar values of the metrics."""
        if not isinstance(metrics, dict):
            raise TypeError('type of param <metrics> must be dict')
        names = {
            'Acc': 'Accuracy',
            'Bacc': 'Balanced Accuracy',
            'Ck': 'Cohen\'s Kappa',
            'Js': 'Jaccard Score',
            'Fs': 'F1-Score',
            'Prec': 'Precision Score',
            'Rec': 'Recall Score',
            'Tpr': 'TPR',
            'Tnr': 'TNR',
            'Fpr': 'FPR',
            'Fnr': 'FNR',
            'Loss': 'Loss',
            'It': 'Iteration @ Convergence'
        }

        namescopes = {
            **{i: 'Accuracy & Loss'
               for i in ['Acc', 'Bacc', 'It', 'Loss']},
            **{
                i: 'F-Score, Precision and Recall'
                for i in ['Fs', 'Prec', 'Rec']
            },
            **{
                i: 'Positive and Negative Rates'
                for i in ['Tpr', 'Tnr', 'Fpr', 'Fnr']
            },
            **{i: 'Other Scores'
               for i in ['Ck', 'Js']}
        }

        with writer.as_default():
            for i in metrics:
                name = names.get(i, i)
                with tf.name_scope(namescopes.get(i, 'Other Scores')):
                    tf.summary.scalar(name,
                                      metrics[i],
                                      step=epoch,
                                      description=name)
Example #10
def tensorboard_scalar(writer: tf.summary.SummaryWriter, name: str,
                       data: float, step: int):
    with writer.as_default():
        tf.summary.scalar(name, data, step)
Example #11
def run(train_dataset: tf.data.Dataset, eval_datasets: Dict[str,
                                                            tf.data.Dataset],
        steps_per_eval: Dict[str, int], params: utils.ModelParameters,
        model_dir: str, strategy: tf.distribute.Strategy,
        summary_writer: tf.summary.SummaryWriter, loss_type: str,
        graph_augmenter: augmentation_utils.GraphAugment):
    """Trains and evaluates the model."""
    with strategy.scope():
        model = ub.models.mpnn(
            nodes_shape=train_dataset.element_spec[0]['atoms'].shape[1:],
            edges_shape=train_dataset.element_spec[0]['pairs'].shape[1:],
            num_heads=params.num_heads,
            num_layers=params.num_layers,
            message_layer_size=params.message_layer_size,
            readout_layer_size=params.readout_layer_size,
            use_gp_layer=params.use_gp_layer)
        optimizer = tf.keras.optimizers.RMSprop(
            learning_rate=params.learning_rate)
        metrics = {
            'train/negative_log_likelihood': tf.keras.metrics.Mean(),
            'train/accuracy': tf.keras.metrics.CategoricalAccuracy(),
            'train/loss': tf.keras.metrics.Mean(),
            'train/roc_auc': tf.keras.metrics.AUC(),
        }

        for dataset_name in eval_datasets:
            metrics[
                f'{dataset_name}/accuracy'] = tf.keras.metrics.CategoricalAccuracy(
                )
            metrics[f'{dataset_name}/roc_auc'] = tf.keras.metrics.AUC()
            metrics[
                f'{dataset_name}/negative_log_likelihood'] = tf.keras.metrics.Mean(
                )
            if dataset_name == 'test2':
                ece_num_bins = 5
            else:
                ece_num_bins = 10
            metrics[
                f'{dataset_name}/ece'] = rm.metrics.ExpectedCalibrationError(
                    num_bins=ece_num_bins)
            metrics[f'{dataset_name}/brier'] = rm.metrics.Brier()

    @tf.function
    def train_step(iterator):
        """Training StepFn."""
        def step_fn(inputs):
            """Per-Replica StepFn."""
            if len(inputs) == 3:
                features, labels, sample_weights = inputs
            else:
                features, labels = inputs
                sample_weights = 1

            if params.augmentations:
                # TODO(jihyeonlee): For now, choose 1 augmentation function from all
                # possible with equal probability. Allow user to specify number of
                # augmentations to apply per graph.
                features = graph_augmenter.augment(features)

            with tf.GradientTape() as tape:
                probs = model(features, training=True)
                negative_log_likelihood = tf.reduce_mean(
                    tf.keras.losses.categorical_crossentropy(labels, probs) *
                    sample_weights)

                l2_loss = sum(model.losses)
                if loss_type == 'focal':
                    focal_loss_fn = tfa_losses.SigmoidFocalCrossEntropy()
                    focal_loss = tf.reduce_mean(
                        focal_loss_fn(labels, probs) * sample_weights)
                    loss = focal_loss + l2_loss
                else:
                    loss = negative_log_likelihood + l2_loss
                # Scale the loss, since tf.distribute.Strategy will sum the
                # gradients across replicas. See details in
                # https://www.tensorflow.org/tutorials/distribute/custom_training#define_the_loss_function
                scaled_loss = loss / strategy.num_replicas_in_sync

            grads = tape.gradient(scaled_loss, model.trainable_variables)
            optimizer.apply_gradients(zip(grads, model.trainable_variables))

            metrics['train/loss'].update_state(loss)
            metrics['train/negative_log_likelihood'].update_state(
                negative_log_likelihood)
            metrics['train/accuracy'].update_state(labels, probs)
            metrics['train/roc_auc'].update_state(labels[:, 1], probs[:, 1])

        for _ in tf.range(tf.cast(params.steps_per_epoch, tf.int32)):
            strategy.run(step_fn, args=(next(iterator), ))

    @tf.function
    def eval_step(iterator, dataset_name, num_steps):
        """Evaluation StepFn."""
        def step_fn(inputs):
            """Per-Replica StepFn."""
            if len(inputs) == 3:
                features, labels, _ = inputs
            else:
                features, labels = inputs

            probs = model(features, training=False)
            negative_log_likelihood = tf.reduce_mean(
                tf.keras.losses.categorical_crossentropy(labels, probs))

            metrics[f'{dataset_name}/negative_log_likelihood'].update_state(
                negative_log_likelihood)
            metrics[f'{dataset_name}/accuracy'].update_state(labels, probs)
            metrics[f'{dataset_name}/roc_auc'].update_state(
                labels[:, 1], probs[:, 1])
            metrics[f'{dataset_name}/ece'].add_batch(probs[:, 1],
                                                     label=labels[:, 1])
            metrics[f'{dataset_name}/brier'].add_batch(probs,
                                                       label=labels[:, 1])

        for _ in tf.range(tf.cast(num_steps, tf.int32)):
            strategy.run(step_fn, args=(next(iterator), ))

    # Makes datasets into distributed version.
    train_dataset = strategy.experimental_distribute_dataset(train_dataset)
    eval_datasets = {
        ds_name: strategy.experimental_distribute_dataset(ds)
        for ds_name, ds in eval_datasets.items()
    }
    logging.info('Number of replicas in sync: %s',
                 strategy.num_replicas_in_sync)

    train_iterator = iter(train_dataset)
    start_time = time.time()
    metrics_history = collections.defaultdict(list)
    for epoch in range(params.num_epochs):
        logging.info('Starting to run epoch: %s', epoch)
        train_step(train_iterator)

        current_step = (epoch + 1) * params.steps_per_epoch
        max_steps = params.steps_per_epoch * params.num_epochs
        time_elapsed = time.time() - start_time
        steps_per_sec = float(current_step) / time_elapsed
        eta_seconds = (max_steps - current_step) / steps_per_sec
        message = ('{:.1%} completion: epoch {:d}/{:d}. {:.1f} steps/s. '
                   'ETA: {:.0f} min. Time elapsed: {:.0f} min'.format(
                       current_step / max_steps, epoch + 1, params.num_epochs,
                       steps_per_sec, eta_seconds / 60, time_elapsed / 60))
        logging.info(message)

        # Start evaluation.
        logging.info('Starting to run eval at epoch: %s', epoch)
        for dataset_name, eval_dataset in eval_datasets.items():
            eval_iterator = iter(eval_dataset)
            eval_step(eval_iterator, dataset_name,
                      steps_per_eval[dataset_name])

        metrics_history['epoch'].append(epoch + 1)
        with summary_writer.as_default():
            for name, metric in metrics.items():
                result = utils.get_metric_result_value(metric)
                tf.summary.scalar(name, result, step=epoch + 1)
                metrics_history[name].append(str(result))

        for metric in metrics.values():
            metric.reset_states()

        model.save(os.path.join(model_dir, f'model_{epoch + 1}'),
                   overwrite=True)

    utils.write_params(metrics_history,
                       os.path.join(model_dir, 'metrics_history.json'))
Example #12
def write_scalars(writer: tf.summary.SummaryWriter, scalar_dict, step):
    with writer.as_default():
        for (k, v) in scalar_dict.items():
            tf.summary.scalar(k, v, step=step)
    writer.flush()
Example #13
def train(agent: Agent,
          env: ContinuousSimulation,
          log_every: int,
          test_every: int,
          target_network_update_freq: int,
          max_ts: int,
          writer: tf.summary.SummaryWriter = None,
          env_seeds: Iterator[int] = None,
          test: Callable[[Agent], None] = None,
          **kwargs) -> Agent:
    """

    :param agent: Agent
        agent to be trained, modified in place
    :param env: ContinuousSimulation
        environment to train in
    :param log_every: int
        logs to either stdout or wandb every 'log_every' timesteps
    :param test_every:
        every 'test_every' timesteps, runs the test function on agent and env
    :param target_network_update_freq: int
        updates the target_network with this frequency (in timesteps)
    :param writer: tf.SummaryWriter
        writer to write logs with
    :param max_ts: int
        will stop training after max_ts timesteps
    :param env_seeds:
        seeds to seed environment with upon resets, in order
        doesn't set seed if not provided
        assumed that there are enough seeds to run max_ts timesteps
    :param (optional) test: Callable[Agent, None]
        uses this function to test. No testing done if not provided
        should probably be a lambda function of 'test' in this file
    :param kwargs: for compatibility
    :return: the Agent, after training
    """
    episode_reward = 0
    tf.summary.experimental.set_step(0)
    initial_log(agent, env, writer, **kwargs)

    if env_seeds is not None:
        env.seed(next(env_seeds))
    writer = optional_writer(writer)
    state = env.reset()
    context = env.unwrapped.state()

    n_eps_completed: int = 0
    for ts in range(1, max_ts + 1):
        action = agent.act(state, context, mode='train', network='q')

        next_state, reward, done, _ = env.step(int(action))
        next_context = env.unwrapped.state()

        agent.remember(state, context, action, reward, next_state,
                       next_context, done)

        state, context = next_state, next_context

        if done:
            n_eps_completed += 1
            state = env.reset()
            context = env.unwrapped.state()
            if env_seeds is not None:
                env.seed(next(env_seeds))

        if ts > 100:
            agent.optimize()
        agent.step(ts)

        with writer.as_default():
            if ts % log_every == 0:
                tf.summary.experimental.set_step(ts)
                tf.summary.scalar('global timestep', ts)
                tf.summary.scalar('num updates',
                                  ts // target_network_update_freq)
                tf.summary.scalar('num episodes', n_eps_completed)
                agent.log(ts, writer)

        if ts % test_every == 0 and test is not None:
            test(agent)

    return agent
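optional_writer, used here and in the diagnostic example below, is not shown in this snippet; a plausible sketch, assuming its job is to make the writer argument optional, falls back to a no-op writer:

def optional_writer(writer: tf.summary.SummaryWriter = None):
    # Hypothetical helper: degrade to a no-op writer when none is supplied,
    # so `with writer.as_default():` blocks still work.
    return writer if writer is not None else tf.summary.create_noop_writer()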
Example #14
def main(strategy: tf.distribute.MirroredStrategy, global_step: tf.Tensor,
         train_writer: tf.summary.SummaryWriter,
         eval_writer: tf.summary.SummaryWriter, train_batch_size: int,
         eval_batch_size: int, job_dir: str, dataset_dir: str,
         dataset_filename: str, num_epochs: int, summary_steps: int,
         log_steps: int, dataset_spec: DatasetSpec, model: tf.keras.Model,
         loss_fn: tf.keras.losses.Loss,
         optimizer: tf.keras.optimizers.Optimizer):
    # Define metrics
    eval_metric = tf.keras.metrics.CategoricalAccuracy()
    best_metric = tf.Variable(eval_metric.result())

    # Define training loop

    @distributed_run(strategy)
    def train_step(inputs):
        images, labels = inputs

        with tf.GradientTape() as tape:
            logits = model(images)
            cross_entropy = loss_fn(labels, logits)
            loss = tf.reduce_sum(cross_entropy) / train_batch_size

        # Apply gradients outside the tape context so the gradient
        # computation itself is not recorded by the tape.
        gradients = tape.gradient(loss, model.variables)
        optimizer.apply_gradients(zip(gradients, model.variables))

        if global_step % summary_steps == 0:
            tf.summary.scalar('loss', loss, step=global_step)

        return loss

    @distributed_run(strategy)
    def eval_step(inputs, metric):
        images, labels = inputs

        logits = model(images)

        metric.update_state(labels, logits)

    # Build input pipeline
    train_reader = Reader(dataset_dir, dataset_filename, split=Split.Train)
    test_reader = Reader(dataset_dir, dataset_filename, split=Split.Test)
    train_dataset = train_reader.read()
    test_dataset = test_reader.read()

    @unpack_dict
    def map_fn(_id, image, label):
        return tf.cast(image, tf.float32) / 255., label

    train_dataset = dataset_spec.parse(train_dataset).batch(
        train_batch_size).map(map_fn)
    test_dataset = dataset_spec.parse(test_dataset).batch(eval_batch_size).map(
        map_fn)

    #################
    # Training loop #
    #################
    # Define checkpoint
    checkpoint = tf.train.Checkpoint(optimizer=optimizer,
                                     model=model,
                                     global_step=global_step,
                                     best_metric=best_metric)
    # Restore the model
    checkpoint_dir = job_dir
    checkpoint_prefix = os.path.join(checkpoint_dir, 'ckpt')
    checkpoint.restore(tf.train.latest_checkpoint(checkpoint_dir))

    # Prepare dataset for distributed run
    train_dataset = strategy.experimental_distribute_dataset(train_dataset)
    test_dataset = strategy.experimental_distribute_dataset(test_dataset)

    with CheckpointHandler(checkpoint, checkpoint_prefix):
        for epoch in range(num_epochs):
            print('---------- Epoch: {} ----------'.format(epoch + 1))

            print('Starting training for epoch: {}'.format(epoch + 1))
            with train_writer.as_default():
                for inputs in tqdm(train_dataset,
                                   initial=global_step.numpy(),
                                   desc='Training',
                                   unit=' steps'):
                    per_replica_losses = train_step(inputs)
                    mean_loss = strategy.reduce(tf.distribute.ReduceOp.SUM,
                                                per_replica_losses, None)

                    if global_step.numpy() % log_steps == 0:
                        print('Loss: {}'.format(mean_loss.numpy()))

                    # Increment global step
                    global_step.assign_add(1)

            print('Starting evaluation for epoch: {}'.format(epoch + 1))

            with eval_writer.as_default():
                for inputs in tqdm(test_dataset, desc='Evaluating'):
                    eval_step(inputs, eval_metric)

                accuracy = eval_metric.result()
                print('Accuracy: {}'.format(accuracy.numpy()))
                tf.summary.scalar('accuracy', accuracy, step=global_step)

                if accuracy >= best_metric:
                    checkpoint.save(file_prefix=checkpoint_prefix + '-best')
                    print('New best model saved: {} >= previous best {}'.format(
                        accuracy.numpy(), best_metric.numpy()))
                    best_metric.assign(accuracy)

            eval_metric.reset_states()
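distributed_run is a project decorator not shown here; a rough sketch, under the assumption that it wraps a per-replica step function in strategy.run inside a tf.function:

def distributed_run(strategy: tf.distribute.Strategy):
    # Hypothetical decorator: runs the wrapped fn on every replica.
    def decorator(fn):
        @tf.function
        def wrapper(*args):
            return strategy.run(fn, args=args)
        return wrapper
    return decorator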
Example #15
def diagnostic(agent: Agent,
               env: ContinuousSimulation,
               seeds: Iterator[int] = None,
               writer: tf.summary.SummaryWriter = None) -> None:
    """

    :param agent: Agent
        Agent to evaluate (will not be changed)
    :param env: ContinuousSimulation
        environment to test in
    :param seeds:
        seeds to reset environment with, tests will be evaluated on these seeds
        defaults to range(0, 10)
    :param writer: tf.summary.SummaryWriter
        writer to use as context. Writes no-ops if None
    :return: all results logged to either wandb or console
    """
    if seeds is None:
        seeds = iter(range(0, 10))
    writer = optional_writer(writer)
    seed = next(seeds, None)

    rewards: List[float] = []
    losses: List[float] = []
    replay_buffer = ReplayBuffer(100000, seed)

    summaries: List[Dict] = []

    while seed is not None:
        env.seed(seed)
        state = env.reset(logging=True)
        context = env.unwrapped.state()
        episode_reward: float = 0
        done = False

        while not done:
            action = agent.act(state, 0)

            next_state, reward, done, _ = env.step(int(action))
            next_context = env.unwrapped.state()
            episode_reward += reward

            replay_buffer.push(state, context, action, reward, next_state,
                               next_context, done)

            state, context = next_state, next_context

        rewards.append(episode_reward)
        if hasattr(agent, 'compute_td_loss'):
            loss = agent.compute_td_loss(*replay_buffer.sample(50))
            losses.append(loss.numpy())

        seed = next(seeds, None)

        summaries.append(env.summary())

    agg_summary = {}
    for key in summaries[0]:
        if isinstance(summaries[0][key], Number):
            # dtype is a number
            agg_summary[key] = list(map(lambda s: s[key], summaries))
        else:
            agg_summary[key] = flatmap(map(lambda s: s[key], summaries))
    max_ts: int = max(map(lambda s: len(s['actual queries']), summaries))
    for key in ['original queries', 'actual queries']:
        agg_summary[key] = np.zeros(max_ts)
        agg_summary['min ' + key] = np.full(max_ts, np.inf)
        agg_summary['max ' + key] = np.full(max_ts, -np.inf)
        for s in summaries:
            for t in range(max_ts):
                agg_summary[key][t] += s[key][t]
                agg_summary['max ' + key][t] = max(
                    agg_summary['max ' + key][t], s[key][t])
                agg_summary['min ' + key][t] = min(
                    agg_summary['min ' + key][t], s[key][t])

    # TODO: this assumes that all stations are ordered in the same order, which may not be correct
    max_stations: int = max(map(lambda s: s['n_stations'], summaries))
    agg_summary['recommendation freq'] = np.zeros(max_stations)
    for summ in summaries:
        agg_summary['recommendation freq'][0:summ['n_stations']] += summ[
            'recommendation freq']

    # these values count the number of queries per timestep summed over all episodes
    histograms: List[str] = [
        'distances travelled', 'timesteps travelled', 'nearest distances'
    ]
    dist_histogram = np.histogram(agg_summary['distances travelled'], bins=20)
    near_histogram = np.histogram(agg_summary['nearest distances'], bins=20)

    exp_bins: np.ndarray = 2**np.arange(0, 10)
    exp_bins = np.insert(exp_bins, 0, 0, axis=0)
    # [0, 1, 2, 4, 8, 16, 32, 64, 128, 256, 512]
    failed_histogram = np.histogram(agg_summary['failed dispatches'],
                                    bins=exp_bins)
    organic_histogram = np.histogram(agg_summary['organic fails'],
                                     bins=exp_bins)
    time_histogram = np.histogram(agg_summary['timesteps travelled'],
                                  bins=np.arange(max_ts + 1))

    with writer.as_default():
        tf.summary.histogram('distances travelled',
                             agg_summary['distances travelled'])
        tf.summary.histogram('nearest distances',
                             agg_summary['nearest distances'])
        tf.summary.histogram('timesteps travelled',
                             agg_summary['timesteps travelled'])
        tf.summary.histogram('failed dispatches',
                             agg_summary['failed dispatches'])
        tf.summary.histogram('organic fails', agg_summary['organic fails'])
        tf.summary.scalar('Reward', np.mean(rewards))
        if hasattr(agent, 'compute_td_loss'):
            tf.summary.scalar('loss', np.mean(losses))
Example #16
def run(
    train_dataset: tf.data.Dataset,
    eval_datasets: Dict[str, tf.data.Dataset],
    steps_per_eval: Dict[str, int],
    params: utils.ModelParameters,
    model_dir: str,
    gp_layer_kwargs: Dict[str, Any],
    strategy: tf.distribute.Strategy,
    summary_writer: tf.summary.SummaryWriter,
    loss_type: str,
    use_spec_norm: bool,
    spec_norm_multiplier: float,
    use_spec_norm_mp: bool,
    spec_norm_multiplier_mp: float):
  """Trains and evaluates the model.

  Args:
    train_dataset: tf dataset that provides training data.
    eval_datasets: A dictionary of tf datasets that provides data for model
      evaluation.
    steps_per_eval: A dictionary of steps needed for each evaluation dataset.
    params: ModelParameters object containing MPNN model parameters.
    model_dir: Directory for files generated during training and evaluation.
    gp_layer_kwargs: A dictionary of parameters used for GP layer.
    strategy: tf Distributed training strategy object.
    summary_writer: tf summary writer to log training and evaluation metrics.
    loss_type: str, loss type to use during training. Currently only
      supports focal loss and cross-entropy loss.
    use_spec_norm: Whether to use Spectral normalization for the dense layer.
    spec_norm_multiplier: Multiplier used to control the magnitude of
      eigenvalue of the dense layer weight matrix.
    use_spec_norm_mp: Whether to use Spectral normalization for the MP layer.
    spec_norm_multiplier_mp: Multiplier used to control the magnitude of
      eigenvalue of the MP layer weight matrix.

  """
  with strategy.scope():
    model = ub.models.mpnn(
        nodes_shape=train_dataset.element_spec[0]['atoms'].shape[1:],
        edges_shape=train_dataset.element_spec[0]['pairs'].shape[1:],
        num_heads=params.num_heads,
        num_layers=params.num_layers,
        message_layer_size=params.message_layer_size,
        readout_layer_size=params.readout_layer_size,
        use_gp_layer=params.use_gp_layer,
        gp_layer_kwargs=gp_layer_kwargs,
        use_spec_norm=use_spec_norm,
        spec_norm_multiplier=spec_norm_multiplier,
        use_spec_norm_mp=use_spec_norm_mp,
        spec_norm_multiplier_mp=spec_norm_multiplier_mp)
    optimizer = tf.keras.optimizers.RMSprop(learning_rate=params.learning_rate)
    metrics = {
        'train/negative_log_likelihood': tf.keras.metrics.Mean(),
        'train/accuracy': tf.keras.metrics.CategoricalAccuracy(),
        'train/loss': tf.keras.metrics.Mean(),
        'train/roc_auc': tf.keras.metrics.AUC(),
    }

    for dataset_name in eval_datasets:
      metrics[
          f'{dataset_name}/accuracy'] = tf.keras.metrics.CategoricalAccuracy()
      metrics[f'{dataset_name}/roc_auc'] = tf.keras.metrics.AUC()
      metrics[
          f'{dataset_name}/negative_log_likelihood'] = tf.keras.metrics.Mean()
      if dataset_name == 'test2':
        ece_num_bins = 5
      else:
        ece_num_bins = 10
      metrics[f'{dataset_name}/ece'] = rm.metrics.ExpectedCalibrationError(
          num_bins=ece_num_bins)
      metrics[f'{dataset_name}/brier'] = rm.metrics.Brier()

  def per_replica_train_step_fn(inputs):
    """Per-Replica StepFn."""
    if len(inputs) == 3:
      features, labels, sample_weights = inputs
    else:
      features, labels = inputs
      sample_weights = 1

    with tf.GradientTape() as tape:
      probs = model(features, training=True)
      negative_log_likelihood = tf.reduce_mean(
          tf.keras.losses.categorical_crossentropy(labels, probs) *
          sample_weights)

      l2_loss = sum(model.losses)
      if loss_type == 'focal':
        focal_loss_fn = tfa_losses.SigmoidFocalCrossEntropy()
        focal_loss = tf.reduce_mean(
            focal_loss_fn(labels, probs) * sample_weights)
        loss = focal_loss + l2_loss
      else:
        loss = negative_log_likelihood + l2_loss
      # Scale the loss, since tf.distribute.Strategy will sum the
      # gradients across replicas. See details in
      # https://www.tensorflow.org/tutorials/distribute/custom_training#define_the_loss_function
      scaled_loss = loss / strategy.num_replicas_in_sync

    grads = tape.gradient(scaled_loss, model.trainable_variables)
    optimizer.apply_gradients(zip(grads, model.trainable_variables))

    metrics['train/loss'].update_state(loss)
    metrics['train/negative_log_likelihood'].update_state(
        negative_log_likelihood)
    metrics['train/accuracy'].update_state(labels, probs)
    metrics['train/roc_auc'].update_state(labels[:, 1], probs[:, 1])

  def per_replica_eval_step_fn(inputs, dataset_name):
    """Per-Replica StepFn."""
    if len(inputs) == 3:
      features, labels, _ = inputs
    else:
      features, labels = inputs

    probs = model(features, training=False)
    negative_log_likelihood = tf.reduce_mean(
        tf.keras.losses.categorical_crossentropy(labels, probs))

    metrics[f'{dataset_name}/negative_log_likelihood'].update_state(
        negative_log_likelihood)
    metrics[f'{dataset_name}/accuracy'].update_state(labels, probs)
    metrics[f'{dataset_name}/roc_auc'].update_state(labels[:, 1], probs[:, 1])
    metrics[f'{dataset_name}/ece'].add_batch(probs[:, 1], label=labels[:, 1])
    metrics[f'{dataset_name}/brier'].add_batch(probs, label=labels[:, 1])

  @tf.function
  def distributed_train_step(iterator):
    """Training StepFn."""
    for _ in tf.range(tf.cast(params.steps_per_epoch, tf.int32)):
      strategy.run(per_replica_train_step_fn, args=(next(iterator),))

  @tf.function
  def distributed_eval_step(iterator, dataset_name, num_steps):
    """Evaluation StepFn."""
    for _ in tf.range(tf.cast(num_steps, tf.int32)):
      strategy.run(
          per_replica_eval_step_fn, args=(next(iterator), dataset_name))

  # Makes datasets into distributed version.
  train_dataset = strategy.experimental_distribute_dataset(train_dataset)
  eval_datasets = {
      ds_name: strategy.experimental_distribute_dataset(ds)
      for ds_name, ds in eval_datasets.items()
  }
  logging.info('Number of replicas in sync: %s', strategy.num_replicas_in_sync)

  train_iterator = iter(train_dataset)
  start_time = time.time()
  metrics_history = collections.defaultdict(list)
  for epoch in range(params.num_epochs):
    logging.info('Starting to run epoch: %s', epoch)
    distributed_train_step(train_iterator)

    current_step = (epoch + 1) * params.steps_per_epoch
    max_steps = params.steps_per_epoch * params.num_epochs
    time_elapsed = time.time() - start_time
    steps_per_sec = float(current_step) / time_elapsed
    eta_seconds = (max_steps - current_step) / steps_per_sec
    message = ('{:.1%} completion: epoch {:d}/{:d}. {:.1f} steps/s. '
               'ETA: {:.0f} min. Time elapsed: {:.0f} min'.format(
                   current_step / max_steps, epoch + 1, params.num_epochs,
                   steps_per_sec, eta_seconds / 60, time_elapsed / 60))
    logging.info(message)

    # Start evaluation.
    logging.info('Starting to run eval at epoch: %s', epoch)
    for dataset_name, eval_dataset in eval_datasets.items():
      eval_iterator = iter(eval_dataset)
      distributed_eval_step(eval_iterator, dataset_name,
                            steps_per_eval[dataset_name])

    metrics_history['epoch'].append(epoch + 1)
    with summary_writer.as_default():
      for name, metric in metrics.items():
        result = utils.get_metric_result_value(metric)
        tf.summary.scalar(name, result, step=epoch + 1)
        metrics_history[name].append(str(result))

    for metric in metrics.values():
      metric.reset_states()

    model.save(os.path.join(model_dir, f'model_{epoch + 1}'), overwrite=True)

  utils.write_params(metrics_history,
                     os.path.join(model_dir, 'metrics_history.json'))
Example #17
def run_dynamics(
        dynamics: GaugeDynamics,
        flags: dict[str, Any],
        writer: tf.summary.SummaryWriter = None,
        x: tf.Tensor = None,
        beta: float = None,
        save_x: bool = False,
        md_steps: int = 0,
        # window: int = 0,
        # should_track: bool = False,
) -> InferenceResults:
    """Run inference on trained dynamics."""
    if not IS_CHIEF:
        return InferenceResults(None, None, None, None, None)

    # -- Setup -----------------------------
    print_steps = flags.get('print_steps', 5)
    if beta is None:
        beta = flags.get('beta', flags.get('beta_final', None))  # type: float
        if beta is None:
            logger.warning('beta unspecified! setting to 1')
            beta = 1.
        assert beta is not None and isinstance(beta, float)

    test_step = dynamics.test_step
    if flags.get('compile', True):
        test_step = tf.function(dynamics.test_step)
        io.log('Compiled `dynamics.test_step` using tf.function!')

    if x is None:
        x = tf.random.uniform(shape=dynamics.x_shape,
                              minval=-PI, maxval=PI,
                              dtype=TF_FLOAT)
    assert tf.is_tensor(x)

    run_steps = flags.get('run_steps', 20000)
    run_data = DataContainer(run_steps)

    template = '\n'.join([f'beta={beta}',
                          f'net_weights={dynamics.net_weights}'])
    logger.info(f'Running inference with {template}')

    # Run `md_steps` MD updates (without accept/reject)
    # to ensure the chains don't get stuck.
    if md_steps > 0:
        for _ in range(md_steps):
            mc_states, _ = dynamics.md_update((x, beta), training=False)
            x = mc_states.out.x

    try:
        x, metrics = test_step((x, tf.constant(beta)))
    except Exception as err:  # pylint:disable=broad-except
        logger.warning(err)
        # Fall back to the uncompiled step if the compiled one fails.
        test_step = dynamics.test_step
        x, metrics = test_step((x, tf.constant(beta)))

    x_arr = []

    def timed_step(x: tf.Tensor, beta: tf.Tensor):
        start = time.time()
        x, metrics = test_step((x, tf.constant(beta)))
        metrics.dt = time.time() - start
        if 'sin_charges' not in metrics:
            charges = dynamics.lattice.calc_both_charges(x=x)
            metrics['charges'] = charges.intQ
            metrics['sin_charges'] = charges.sinQ
        if save_x:
            x_arr.append(x.numpy())

        return x, metrics

    summary_steps = max(run_steps // 100, 50)

    if writer is not None:
        writer.set_as_default()

    steps = tf.range(run_steps, dtype=tf.int64)
    keep_ = ['step', 'dt', 'loss', 'accept_prob', 'beta',
             'dq_int', 'dq_sin', 'dQint', 'dQsin', 'plaqs', 'p4x4']

    beta = tf.constant(beta, dtype=TF_FLOAT)  # type: tf.Tensor
    data_strs = []
    for idx, step in enumerate(steps):
        x, metrics = timed_step(x, beta)
        run_data.update(step, metrics)  # update data after every accept/reject

        if step % summary_steps == 0:
            update_summaries(step, metrics, dynamics)
            # summarize_dict(metrics, step, prefix='testing')

        if step % print_steps == 0:
            pre = [f'{step}/{steps[-1]}']
            ms = run_data.print_metrics(metrics,
                                        pre=pre, keep=keep_)
            data_strs.append(ms)

    return InferenceResults(dynamics=dynamics, x=x, x_arr=x_arr,
                            run_data=run_data, data_strs=data_strs)
Example #18
def tensorboard_histogram(writer: tf.summary.SummaryWriter, name: str,
                          data: tf.Tensor, step: int):
    with writer.as_default():
        tf.summary.histogram(name, data, step)
Example #19
def save_dict_to_tensorboard(event_writer: tf.summary.SummaryWriter,
                             scalars: Dict, step: int):
    # Enter the writer's context once, rather than once per item.
    with event_writer.as_default():
        for key, val in scalars.items():
            tf.summary.scalar(name=key, data=val, step=step)
    event_writer.flush()
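Example call, assuming a writer for the current run's log directory:

writer = tf.summary.create_file_writer('logs/metrics')  # assumed path
save_dict_to_tensorboard(writer, {'loss': 0.3, 'accuracy': 0.9}, step=10)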