Example #1
                                    )

    valid_triplets_loader = DataLoader(weak_valid_dl_triplet, batch_size=batch_size, shuffle=False,
                                       num_workers=cfg.num_workers,
                                       drop_last=True, collate_fn=collate_fn)

    test_triplets_loader = DataLoader(test_triplets, batch_size=batch_size, shuffle=False,
                                      num_workers=cfg.num_workers,
                                      drop_last=True, collate_fn=collate_fn)

    # #########
    # # Model and optimizer
    # ########
    if resume_training is None:
        model_triplet, state = get_model(state, f_args)
        optimizer, state = get_optimizer(model_triplet, state)

    LOG.info(model_triplet)
    pytorch_total_params = sum(p.numel() for p in model_triplet.parameters() if p.requires_grad)
    LOG.info("number of parameters in the model: {}".format(pytorch_total_params))
    model_triplet.train()
    # scheduler = ReduceLROnPlateau(optimizer, 'min', factor=0.1, patience=5, verbose=True)
    LOG.info(optimizer)
    model_triplet = to_cuda_if_available(model_triplet)

    # ##########
    # # Callbacks
    # ##########
    if cfg.save_best:
        save_best_call = SaveBest(val_comp="sup")
    if cfg.early_stopping is not None:
Example #2
    params_name = {
        "early_stopping": cfg.early_stopping,
        "conv_dropout": cfg.conv_dropout,
        "frames": cfg.frames_in_sec,
    }
    params_name.update(args.__dict__)

    base_model_name = get_model_name(params_name)
    # Model
    state = {
        "scaler": scaler.state_dict(),
        "many_hot_encoder": many_hot_encoder.state_dict(),
        "args": vars(args),
    }
    model, state = get_model(state, args)
    optimizer, state = get_optimizer(model, state)
    model = to_cuda_if_available(model)
    LOG.info(model)

    # ##########
    # # Callbacks
    # ##########
    if cfg.save_best:
        save_best_call = SaveBest(val_comp="sup")
    if cfg.early_stopping is not None:
        early_stopping_call = EarlyStopping(patience=cfg.early_stopping,
                                            val_comp="sup")
    # lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=20, gamma=0.5)

    # x, y = next(iter(train_loader))
    x, y = train_set[0]
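
Both examples above construct `SaveBest` and `EarlyStopping` callbacks but never show their definitions; they are project-specific helpers. The sketch below is only a guess at the behaviour implied by the calls (`val_comp="sup"` presumably meaning a higher validation value is better, `patience` counting non-improving checks) and is not the project's actual implementation.

class SaveBest:
    """Hypothetical stand-in: remember the best validation value seen so far."""
    def __init__(self, val_comp="sup"):
        self.val_comp = val_comp
        self.best_val = None

    def apply(self, value):
        # Return True when `value` improves on the best so far (caller then saves a checkpoint).
        improved = (self.best_val is None or
                    (value > self.best_val if self.val_comp == "sup" else value < self.best_val))
        if improved:
            self.best_val = value
        return improved


class EarlyStopping:
    """Hypothetical stand-in: stop after `patience` checks without improvement."""
    def __init__(self, patience, val_comp="sup"):
        self.patience = patience
        self.val_comp = val_comp
        self.best_val = None
        self.wait = 0

    def apply(self, value):
        # Return True when training should stop.
        improved = (self.best_val is None or
                    (value > self.best_val if self.val_comp == "sup" else value < self.best_val))
        if improved:
            self.best_val = value
            self.wait = 0
        else:
            self.wait += 1
        return self.wait >= self.patience
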
Example #3
    def _init_graph(self):
        '''
        Initialize a TensorFlow graph containing: input data, variables, model, loss, optimizer.
        '''
        self.graph = tf.Graph()
        with self.graph.as_default():  # , tf.device('/cpu:0'):
            # Set graph level random seed
            tf.set_random_seed(self.random_seed)
            np.random.seed(self.random_seed)
            # Input data.
            if self.is_lookup:
                self.train_features = tf.placeholder(
                    tf.int32, shape=[None,
                                     self.num_field])  # None * num_features
            elif self.is_sparse:
                self.train_features = tf.sparse_placeholder(
                    tf.float32,
                    shape=[None, self.num_features])  # None * num_features
            else:
                self.train_features = tf.placeholder(
                    tf.float32,
                    shape=[None, self.num_features])  # None * num_features
            self.train_labels = tf.placeholder(tf.float32,
                                               shape=[None, 1])  # None * 1

            # Variables.
            self.weights = self._initialize_weights()
            self.weights_feature = tf.get_collection(
                tf.GraphKeys.TRAINABLE_VARIABLES, 'feature')

            # Model.

            ###################################################################################
            # bilinear embedding
            self.sample_embedding = []  # the embeddings of each FM
            self.sample_flag = []  # flags indicating whether a specific column exists in each FM
            self.sample_embedding.append(
                tf.nn.embedding_lookup(self.weights['feature_bilinear_0'],
                                       self.train_features))
            # the first FM contains all nonzero columns
            self.sample_flag.append(tf.ones([self.num_field],
                                            dtype=tf.float32))
            self.sample_flag[0] = self.sample_flag[0][tf.newaxis, :,
                                                      tf.newaxis]

            # setting flags and weights for each FM
            for k in range(self.feature_table.shape[1]):
                # the k+1-th FM
                tmp_zero = tf.zeros([1, self.embedding_dim[k + 1]],
                                    dtype=tf.float32)
                cur_weight_table = tf.concat(
                    (tmp_zero, self.weights['feature_bilinear_%d' % (k + 1)]),
                    axis=0)
                cur_ids = tf.nn.embedding_lookup(self.feature_table[:, k],
                                                 self.train_features)
                self.sample_embedding.append(
                    tf.nn.embedding_lookup(cur_weight_table, cur_ids))
                self.sample_flag.append(
                    tf.nn.embedding_lookup(self.feature_flag[:, k],
                                           self.train_features))
                self.sample_flag[-1] = self.sample_flag[-1][:, :, tf.newaxis]

            self.bilinear = []  # the bilinear parts of each FM

            # core of RaFM: multiple embeddings
            base = tf.zeros_like(self.train_labels, dtype=np.float32)
            for k in range(self.feature_table.shape[1]):
                free_part = self.sample_embedding[k] * (
                    self.sample_flag[k] - self.sample_flag[k + 1])
                dependent_part = self.sample_embedding[k] * self.sample_flag[
                    k + 1]
                # Note the stop_gradient here!
                low_output = common.get_bilinear_embedding_from_feature(
                    tf.stop_gradient(free_part) + dependent_part)
                low_output = tf.reduce_sum(low_output, axis=1, keep_dims=True)
                self.bilinear.append(
                    tf.add_n([tf.stop_gradient(base), low_output]))
                low_interaction = common.get_bilinear_embedding_from_feature(
                    dependent_part + free_part)
                low_interaction = tf.reduce_sum(low_interaction,
                                                axis=1,
                                                keep_dims=True)
                correction = common.get_bilinear_embedding_from_feature(
                    dependent_part)
                correction = tf.reduce_sum(correction, axis=1, keep_dims=True)
                base = tf.add_n([base, -correction, low_interaction])

            final_high_interaction = common.get_bilinear_embedding_from_feature(
                self.sample_embedding[-1])
            final_high_interaction = tf.reduce_sum(final_high_interaction,
                                                   axis=1,
                                                   keep_dims=True)
            self.bilinear.append(tf.add_n([base, final_high_interaction]))

            # linear embedding
            self.weights_linear_reshape = self.weights['feature_linear']
            self.linear = common.get_linear_embedding(
                self.train_features, self.weights_linear_reshape,
                self.is_sparse, True)
            self.linear = self.linear[:, tf.newaxis]

            # bias
            self.weights_bias_reshape = self.weights['bias']
            self.bias = tf.ones_like(
                self.train_labels,
                dtype=np.float32) * self.weights_bias_reshape

            # out[k]: \mathcal{B}_{1, k+1} in our paper
            self.out = []
            for k in range(self.feature_table.shape[1] + 1):
                self.out.append(
                    tf.add_n([self.bilinear[k], self.linear, self.bias]))

            # The loss function, which uses different update rules for free variables and dependent variables
            self.loss = 0
            if self.loss_type == 'square_loss':
                # free variables
                self.loss += tf.nn.l2_loss(
                    tf.subtract(self.train_labels, self.out[-1]))
                # loss of dependent variables. We use stop_gradient to mimic the update rule of dependent parts
                for k in range(self.feature_table.shape[1]):
                    self.loss += self.dependent_lr_coef[k] * tf.nn.l2_loss(
                        tf.subtract(self.out[k],
                                    tf.stop_gradient(self.out[k + 1])))
            elif self.loss_type == 'log_loss':
                for k in range(len(self.out)):
                    self.out[k] = tf.sigmoid(self.out[k])
                # free variables
                self.loss += tf.losses.log_loss(self.train_labels,
                                                self.out[-1],
                                                weights=1.0,
                                                epsilon=1e-07,
                                                scope=None)
                # loss of dependent variables. We use stop_gradient to mimic the update rule of dependent parts
                for k in range(self.feature_table.shape[1]):
                    loss = self.dependent_lr_coef[k] * tf.losses.log_loss(
                        tf.stop_gradient(self.out[k + 1]),
                        self.out[k],
                        weights=1.0,
                        epsilon=1e-07,
                        scope=None)
                    self.loss += loss

            self.reg_loss = 0  # L2 regularization of each embedding
            for k in range(self.feature_table.shape[1] + 1):
                if self.lambda_bilinear[k] > 0:
                    self.reg_loss += tf.contrib.layers.l2_regularizer(
                        self.lambda_bilinear[k])(
                            self.weights['feature_bilinear_%d' % k])

            self.loss += self.reg_loss

            self.optimizer = common.get_optimizer(self.optimizer_type,
                                                  self.learning_rate,
                                                  self.loss, None)

            # init
            self.saver = tf.train.Saver()
            init = tf.global_variables_initializer()
            self.sess = tf.Session()
            self.sess.run(init)
            if self.is_continuous == 1:
                self.saver.restore(self.sess, self.save_file + self.suffix)

            # number of params
            total_parameters = 0
            for variable in self.weights.values():
                shape = variable.get_shape()
                variable_parameters = 1
                for dim in shape:
                    variable_parameters *= dim.value
                total_parameters += variable_parameters
            if self.verbose > 0:
                print("#params: %d" % total_parameters)
Example #4
def main():
    usage = "%prog project documents.json"
    parser = OptionParser(usage=usage)
    parser.add_option('-a', dest='alpha', default=0.00001,
                      help='Regularization strength: default=%default')
    parser.add_option('-d', dest='hidden_dim', default=50,
                      help='Hidden node dimension: default=%default')
    parser.add_option('-e', dest='epochs', default=10,
                      help='Number of epochs: default=%default')
    parser.add_option('-i', dest='iter_display', default=5000,
                      help='Number of iterations between output: default=%default')
    parser.add_option('-o', dest='optimization', default='sgd',
                      help='Optimization method [sgd|sgdm|adagrad]: default=%default')
    parser.add_option('-l', dest='learning_rate', default=0.1,
                      help='Initial learning rate: default=%default')
    parser.add_option('--decay', dest='decay', default=1.00,
                      help='Learning rate decay: default=%default')
    parser.add_option('--momentum', dest='momentum', default=0.5,
                      help='Momentum parameter (sgdm only): default=%default')
    parser.add_option('--word2vec_file', dest='word2vec_file', default='',
                      help='Location of word2vec file: default=do not load')
    parser.add_option('--glove_file', dest='glove_file', default='',
                      help='Location of glove file: default=do not load')
    parser.add_option('--save_vectors', action="store_true", dest="save_vectors", default=False,
                      help='Save loaded vectors for faster loading next time: default=%default')
    parser.add_option('-s', dest='seed', default=42,
                      help='Random seed: default=%default')
    parser.add_option('--no_eval', action="store_true", dest="no_eval", default=False,
                      help='Skip the evaluation between epochs: default=%default')
    parser.add_option('--test_fold', dest='test_fold', default=0,
                      help='Test fold: default=%default')
    parser.add_option('--dev_fold', dest='dev_fold', default=0,
                      help='Dev fold: default=%default')
    parser.add_option('--n_labels', dest='n_labels', default=14,
                      help='Number of labels to use (max 15): default=%default')
    parser.add_option('--w_word', dest='w_word', default=1.0,
                      help='Weight on word prediction: default=%default')
    parser.add_option('--w_sentence', dest='w_sentence', default=1.0,
                      help='Weight on sentence prediction: default=%default')
    parser.add_option('--w_article', dest='w_article', default=1.0,
                      help='Weight on article prediction: default=%default')


    (options, args) = parser.parse_args()
    project_name = args[0]
    input_filename = args[1]
    dirs.make_base_dir(project_name)
    sents_dir = dirs.data_raw_sentences_dir

    seed = int(options.seed)
    n_epochs = int(options.epochs)
    alpha = float(options.alpha)
    lr = float(options.learning_rate)
    iter_display = int(options.iter_display)
    opti_method = options.optimization
    lr_decay = float(options.decay)
    momentum = float(options.momentum)
    no_eval = options.no_eval
    word2vec_file = options.word2vec_file
    glove_file = options.glove_file
    save_vectors = options.save_vectors
    test_fold = int(options.test_fold)
    dev_fold = int(options.dev_fold)
    n_labels = int(options.n_labels)
    w_word = float(options.w_word)
    w_sentence = float(options.w_sentence)
    w_article = float(options.w_article)

    if seed > 0:
        np.random.seed(seed)
        random.seed(seed)

    dh = int(options.hidden_dim)
    dx = 300

    np.__config__.show()

    article_sent_words, article_word_labels, vocab, n_labels, n_unique_articles, annotation_counts = load_data(input_filename, n_labels)
    train_keys, dev_keys, test_keys = ds.get_all_splits(test_fold=test_fold, dev_subfold=dev_fold)

    vocab = vocab.keys()
    vocab.sort()
    vocab_size = len(vocab)
    vocab_index = dict(zip(vocab, range(vocab_size)))
    print "Vocab size =", vocab_size

    n_articles = len(article_sent_words)
    keys = article_sent_words.keys()
    keys.sort()
    print keys[:10]
    print "Loaded %d annotations for %d articles using %d labels" % (n_articles, n_unique_articles, n_labels)


    print list(train_keys)[:10]
    train_keys = [k for k in keys if k.split('__')[0] in train_keys]
    dev_keys = [k for k in keys if k.split('__')[0] in dev_keys]
    test_keys = [k for k in keys if k.split('__')[0] in test_keys]

    #dev_indices = np.random.choice(n_articles, n_dev, replace=False).tolist()
    #train_indices = list(set(range(n_articles)) - set(dev_indices))

    #train_keys = [keys[i] for i in train_indices]
    #dev_keys = [keys[i] for i in dev_indices]

    if glove_file != '':
        initial_embeddings = vector_utils.load_glove_vectors(glove_file, vocab, dx)

    elif word2vec_file != '':
        initial_embeddings = vector_utils.load_word2vec_vectors(word2vec_file, vocab, dx)

    else:
        initial_embeddings, vocab, vocab_index = vector_utils.load_from_file(input_filename)
        vocab_size = len(vocab)

    if save_vectors:
        vector_utils.save_vectors(input_filename, initial_embeddings, vocab)

    # index words into vocabulary and make mask and label arrays
    idxs_dict = {}
    mask_dict = {}
    label_dict = {}
    for key, sent_words in article_sent_words.items():
        n_sents = len(sent_words)
        max_len = max([len(s) for s in sent_words])
        word_idxs = np.zeros([max_len, n_sents], dtype=np.int32)
        mask = np.zeros([max_len, n_sents], dtype=np.int32)
        labels = np.zeros([max_len, n_sents, n_labels], dtype=np.int32)
        for s_i, s in enumerate(sent_words):
            n_words = len(s)
            word_idxs[:n_words, s_i] = [vocab_index[w] for w in s]
            mask[:n_words, s_i] = 1
            labels[:n_words, s_i, :] = article_word_labels[key][s_i][:, :]
        idxs_dict[key] = word_idxs
        mask_dict[key] = mask
        label_dict[key] = labels

    article_lengths = [(idxs_dict[k].size, k) for k in train_keys]
    article_lengths.sort()

    # create the LSTM
    theano_seed = np.random.randint(2 ** 30)
    print "Number of distributions =", 2
    print "Building RNN"

    optimizer, opti_params = get_optimizer(opti_method, momentum)
    bilstm = BiLSTM(vocab_size, dh, dx, n_labels, optimizer, opti_params, initial_embeddings=initial_embeddings,
                    alpha=alpha, update=opti_method, seed=theano_seed, momentum=momentum,
                    word_weight=w_word, sent_weight=w_sentence, article_weight=w_article)  # create RNN

    best_dev_f1 = np.zeros(n_labels)
    corr_test_f1 = np.zeros(n_labels)

    print "Training"
    for epoch in range(n_epochs):
        sum_log_loss = 0
        sum_loss = 0
        mistakes = 0
        # sort by keys on the first pass, then shuffle
        if epoch == 0:
            keys = [key for length, key in article_lengths]
        else:
            keys = train_keys
            random.shuffle(keys)
        print "epoch\titems\tloss\tl+reg\terrs"

        # consider each sentence in turn
        for k_i, k in enumerate(keys):
            idxs = idxs_dict[k]
            mask = mask_dict[k]
            word_labels = label_dict[k]

            p_word_labels, p_sent_labels, p_article_labels, log_loss, loss = bilstm.train(idxs, mask, word_labels, lr, 1)
            sum_log_loss += log_loss
            sum_loss += loss

            y_pred_words = np.array(p_word_labels > 0.5, dtype=int)  # (n_words, n_sents, n_labels)
            y_pred_sents = np.array(p_sent_labels > 0.5, dtype=int)
            y_pred_article = np.array(p_article_labels > 0.5, dtype=int)

            sent_labels = np.max(word_labels, axis=0)
            article_labels = np.max(sent_labels, axis=0)
            mistakes += np.sum(np.abs(article_labels - y_pred_article))/float(n_labels)

            to_print = False
            if k_i == 0 and to_print:
                print "\tTraining example:", k
                print article_labels
                print np.array(y_pred_article, dtype=int)
                max_len, n_sents = mask.shape
                for s_i in range(n_sents):
                    if np.max(y_pred_words[:, s_i, :]) == 1:
                        n_words = np.argmin(mask[:, s_i]) - 1
                        sentence = [vocab[c] for c in idxs[:n_words, s_i]]
                        print "Full:", k_i, ' '.join(sentence)
                        for code in range(n_labels):
                            if y_pred_sents[s_i, code] == 1:
                                highlight = [w if word_labels[w_i, s_i, code] else ' ' * len(w) for w_i, w in enumerate(sentence)]
                                print '-------------------------------------'
                                print "True:", k_i, code, ' '.join(highlight)
                                highlight = [w if y_pred_words[w_i, s_i, code] else ' ' * len(w) for w_i, w in enumerate(sentence)]
                                #highlight = [vocab[c][1:2] if (p_y_given_x[c_i, code] > 0.5 or vocab[c][1:2] == '\n') else ' ' for c_i, c in enumerate(idxs)]
                                print '-------------------------------------'
                                print "Pred:", k_i, code, ' '.join(highlight)
                                print ""

            if k_i % iter_display == 0 and k_i > 0:
                d = float(k_i+1)
                print '%d\t%d\t%.4f\t%.4f\t%.4f' % \
                      (epoch, k_i, sum_log_loss/d, sum_loss/d, mistakes/d)

        if not no_eval:
            print "\nDev evaluation"
            valid_z_o_loss, valid_log_loss, valid_f1, valid_per_class_f1 = evaluate(idxs_dict, mask_dict, label_dict, dev_keys, bilstm, vocab, annotation_counts)
            print "\nTest evaluation"
            test_z_o_loss, test_log_loss, test_f1, test_per_class_f1 = evaluate(idxs_dict, mask_dict, label_dict, test_keys, bilstm, vocab, annotation_counts)
            print ('epoch=%d\tdev_log_loss=%.3f\tdev_0/1=%.3f\tdev_f1=%.3f\ttest_log_loss=%.3f\ttest_0/1=%.3f\ttest_f1=%.3f\t') % (epoch, valid_log_loss, valid_z_o_loss, valid_f1, test_log_loss, test_z_o_loss, test_f1)
            for k in range(n_labels):
                if valid_per_class_f1[k] > best_dev_f1[k]:
                    best_dev_f1[k] = valid_per_class_f1[k]
                    corr_test_f1[k] = test_per_class_f1[k]
            print "Best valid f1s:", best_dev_f1
            print "Corr. test f1s:", corr_test_f1

        # decay learning rate
        lr *= lr_decay
Example #5
  def __init__(self, flags_obj, time_callback):
    standard_runnable.StandardRunnableWithWarmup.__init__(
        self,
        flags_obj.use_tf_while_loop,
        flags_obj.use_tf_function)

    self.strategy = tf.distribute.get_strategy()
    self.flags_obj = flags_obj
    self.dtype = flags_core.get_tf_dtype(flags_obj)
    self.time_callback = time_callback

    # Input pipeline related
    batch_size = flags_obj.batch_size
    if batch_size % self.strategy.num_replicas_in_sync != 0:
      raise ValueError(
          'Batch size must be divisible by the number of replicas: {}'.format(
              self.strategy.num_replicas_in_sync))

    steps_per_epoch, train_epochs = common.get_num_train_iterations(flags_obj)
    if train_epochs > 1:
      train_epochs = flags_obj.train_epochs

    # Auto rebatching is not supported by the
    # `experimental_distribute_datasets_from_function()` API, which is
    # required when cloning the dataset to multiple workers in eager mode,
    # so we use the per-replica batch size here.
    self.batch_size = int(batch_size / self.strategy.num_replicas_in_sync)

    self.synthetic_input_fn = common.get_synth_input_fn(
        height=imagenet_preprocessing.DEFAULT_IMAGE_SIZE,
        width=imagenet_preprocessing.DEFAULT_IMAGE_SIZE,
        num_channels=imagenet_preprocessing.NUM_CHANNELS,
        num_classes=self.flags_obj.num_classes,
        dtype=self.dtype,
        drop_remainder=True)

    if self.flags_obj.use_synthetic_data:
      self.input_fn = self.synthetic_input_fn
    else:
      self.input_fn = imagenet_preprocessing.input_fn

    resnet_model.change_keras_layer(flags_obj.use_tf_keras_layers)
    self.model = resnet_model.resnet50(
        num_classes=self.flags_obj.num_classes,
        batch_size=flags_obj.batch_size,
        use_l2_regularizer=not flags_obj.single_l2_loss_op)

    self.use_lars_optimizer = False
    self.num_accumulation_steps = self.flags_obj.num_accumulation_steps
    if self.flags_obj.optimizer == 'LARS':
      self.use_lars_optimizer = True
    self.optimizer, _ = common.get_optimizer(
        flags_obj=flags_obj,
        steps_per_epoch=steps_per_epoch,
        train_steps=steps_per_epoch * train_epochs)
    # Make sure iterations variable is created inside scope.
    self.global_step = self.optimizer.iterations

    if self.dtype == tf.float16:
      loss_scale = flags_core.get_loss_scale(flags_obj, default_for_fp16=128)
      self.optimizer = (
          tf.keras.mixed_precision.experimental.LossScaleOptimizer(
              self.optimizer, loss_scale))
    elif flags_obj.fp16_implementation == 'graph_rewrite':
      # `dtype` is still float32 in this case. We built the graph in float32
      # and let the graph rewrite change parts of it to float16.
      if not flags_obj.use_tf_function:
        raise ValueError('--fp16_implementation=graph_rewrite requires '
                         '--use_tf_function to be true')
      loss_scale = flags_core.get_loss_scale(flags_obj, default_for_fp16=128)
      self.optimizer = (
          tf.train.experimental.enable_mixed_precision_graph_rewrite(
              self.optimizer, loss_scale))

    self.one_hot = False
    self.label_smoothing = flags_obj.label_smoothing
    if self.label_smoothing and self.label_smoothing > 0:
      self.one_hot = True

    if flags_obj.report_accuracy_metrics:
      self.train_loss = tf.keras.metrics.Mean('train_loss', dtype=tf.float32)
      if self.one_hot:
        self.train_accuracy = tf.keras.metrics.CategoricalAccuracy(
            'train_accuracy', dtype=tf.float32)
      else:
        self.train_accuracy = tf.keras.metrics.SparseCategoricalAccuracy(
            'train_accuracy', dtype=tf.float32)
      self.test_loss = tf.keras.metrics.Mean('test_loss', dtype=tf.float32)
    else:
      self.train_loss = None
      self.train_accuracy = None
      self.test_loss = None

    if self.one_hot:
      self.test_accuracy = tf.keras.metrics.CategoricalAccuracy(
          'test_accuracy', dtype=tf.float32)
    else:
      self.test_accuracy = tf.keras.metrics.SparseCategoricalAccuracy(
          'test_accuracy', dtype=tf.float32)
    # self.test_corrects = tf.keras.metrics.Sum(
    #     'test_corrects', dtype=tf.float32)
    self.num_eval_steps = common.get_num_eval_steps(flags_obj)

    self.checkpoint = tf.train.Checkpoint(
        model=self.model, optimizer=self.optimizer)

    # Handling epochs.
    self.epoch_steps = steps_per_epoch
    self.epoch_helper = utils.EpochHelper(steps_per_epoch, self.global_step)

    self.steps_per_loop = flags_obj.steps_per_loop
    profile_steps = flags_obj.profile_steps
    if profile_steps:
      profile_steps = [int(i) for i in profile_steps.split(',')]
      self.trace_start_step = profile_steps[0] if profile_steps[0] >= 0 else None
      self.trace_end_step = profile_steps[1]
    else:
      self.trace_start_step = None
      self.trace_end_step = None

    self.epochs_between_evals = flags_obj.epochs_between_evals
    self.training_vars = self.model.trainable_variables
    self.accum_grads = []
    self.accum_grads_dtype = tf.float32

    if self.num_accumulation_steps > 1:
      for var in self.training_vars:
        self.accum_grads.append(self.optimizer.add_weight(
            name=var.name + '_accum',
            shape=var.shape,
            dtype=self.accum_grads_dtype,
            initializer='zeros',
            trainable=False,
            synchronization=tf.VariableSynchronization.ON_READ,
            aggregation=tf.VariableAggregation.SUM))
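
The constructor above only allocates the gradient-accumulation slots (`accum_grads`); the step that fills and applies them is not shown in this snippet. Below is a toy, single-replica sketch of the usual accumulate-then-apply pattern; the model, data, and loss are invented for illustration and this is not the snippet's actual training step.

import tensorflow as tf  # assumes TF 2.x

num_accumulation_steps = 4
model = tf.keras.Sequential([tf.keras.layers.Dense(1)])
model.build(input_shape=(None, 8))
optimizer = tf.keras.optimizers.SGD(0.1)
loss_fn = tf.keras.losses.MeanSquaredError()

training_vars = model.trainable_variables
accum_grads = [tf.Variable(tf.zeros_like(v), trainable=False) for v in training_vars]

features = tf.random.normal([32, 8])
targets = tf.random.normal([32, 1])
dataset = tf.data.Dataset.from_tensor_slices((features, targets)).batch(8)

for step, (x, y) in enumerate(dataset):
    with tf.GradientTape() as tape:
        loss = loss_fn(y, model(x, training=True))
    grads = tape.gradient(loss, training_vars)
    # Accumulate micro-batch gradients into the slots.
    for accum, grad in zip(accum_grads, grads):
        accum.assign_add(grad)
    if (step + 1) % num_accumulation_steps == 0:
        # Apply the summed gradients once, then reset the accumulators.
        optimizer.apply_gradients(zip([a.read_value() for a in accum_grads], training_vars))
        for accum in accum_grads:
            accum.assign(tf.zeros_like(accum))
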
def run(flags_obj):
  """Run ResNet ImageNet training and eval loop using custom training loops.

  Args:
    flags_obj: An object containing parsed flag values.

  Raises:
    ValueError: If fp16 is passed as it is not currently supported.

  Returns:
    Dictionary of training and eval stats.
  """
  print('@@@@enable_eager = {}'.format(flags_obj.enable_eager))
  keras_utils.set_session_config(
      enable_eager=flags_obj.enable_eager,
      enable_xla=flags_obj.enable_xla)

  dtype = flags_core.get_tf_dtype(flags_obj)
  if dtype == tf.float16:
    policy = tf.compat.v2.keras.mixed_precision.experimental.Policy(
        'mixed_float16')
    tf.compat.v2.keras.mixed_precision.experimental.set_policy(policy)
  elif dtype == tf.bfloat16:
    policy = tf.compat.v2.keras.mixed_precision.experimental.Policy(
        'mixed_bfloat16')
    tf.compat.v2.keras.mixed_precision.experimental.set_policy(policy)

  # This only affects GPU.
  common.set_cudnn_batchnorm_mode()

  # TODO(anj-s): Set data_format without using Keras.
  data_format = flags_obj.data_format
  if data_format is None:
    data_format = ('channels_first'
                   if tf.test.is_built_with_cuda() else 'channels_last')
  tf.keras.backend.set_image_data_format(data_format)

  strategy = distribution_utils.get_distribution_strategy(
      distribution_strategy=flags_obj.distribution_strategy,
      num_gpus=flags_obj.num_gpus,
      num_workers=distribution_utils.configure_cluster(),
      all_reduce_alg=flags_obj.all_reduce_alg,
      num_packs=flags_obj.num_packs,
      tpu_address=flags_obj.tpu)

  train_ds, test_ds = get_input_dataset(flags_obj, strategy)
  per_epoch_steps, train_epochs, eval_steps = get_num_train_iterations(
      flags_obj)
  steps_per_loop = min(flags_obj.steps_per_loop, per_epoch_steps)
  logging.info("Training %d epochs, each epoch has %d steps, "
               "total steps: %d; Eval %d steps",
               train_epochs, per_epoch_steps, train_epochs * per_epoch_steps,
               eval_steps)

  time_callback = keras_utils.TimeHistory(flags_obj.batch_size,
                                          flags_obj.log_steps)

  with distribution_utils.get_strategy_scope(strategy):
    resnet_model.change_keras_layer(flags_obj.use_tf_keras_layers)
    use_l2_regularizer = not flags_obj.single_l2_loss_op

    if flags_obj.use_resnet_d:
      resnetd = network_tweaks.ResnetD(image_data_format=tf.keras.backend.image_data_format(),
                                       use_l2_regularizer=use_l2_regularizer)
    else:
      resnetd = None

    model = resnet_model.resnet50(
        num_classes=imagenet_preprocessing.NUM_CLASSES,
        batch_size=flags_obj.batch_size,
        zero_gamma=flags_obj.zero_gamma,
        last_pool_channel_type=flags_obj.last_pool_channel_type,
        use_l2_regularizer=use_l2_regularizer,
        resnetd=resnetd)

    if flags_obj.learning_rate_decay_type == 'piecewise':
        lr_schedule = common.PiecewiseConstantDecayWithWarmup(
            batch_size=flags_obj.batch_size,
            epoch_size=imagenet_preprocessing.NUM_IMAGES['train'],
            warmup_epochs=common.LR_SCHEDULE[0][1],
            boundaries=list(p[1] for p in common.LR_SCHEDULE[1:]),
            multipliers=list(p[0] for p in common.LR_SCHEDULE),
            compute_lr_on_cpu=True)
    elif flags_obj.learning_rate_decay_type == 'cosine':
        lr_schedule = common.CosineDecayWithWarmup(
            base_lr=flags_obj.base_learning_rate,
            batch_size=flags_obj.batch_size,
            epoch_size=imagenet_preprocessing.NUM_IMAGES['train'],
            warmup_epochs=common.LR_SCHEDULE[0][1],
            train_epochs=flags_obj.train_epochs,
            compute_lr_on_cpu=True)
    else:
        raise NotImplementedError


    optimizer = common.get_optimizer(lr_schedule)

    if dtype == tf.float16:
      loss_scale = flags_core.get_loss_scale(flags_obj, default_for_fp16=128)
      optimizer = tf.keras.mixed_precision.experimental.LossScaleOptimizer(
          optimizer, loss_scale)
    elif flags_obj.fp16_implementation == 'graph_rewrite':
      # `dtype` is still float32 in this case. We built the graph in float32 and
      # let the graph rewrite change parts of it to float16.
      if not flags_obj.use_tf_function:
        raise ValueError('--fp16_implementation=graph_rewrite requires '
                         '--use_tf_function to be true')
      loss_scale = flags_core.get_loss_scale(flags_obj, default_for_fp16=128)
      optimizer = tf.train.experimental.enable_mixed_precision_graph_rewrite(
          optimizer, loss_scale)

    current_step = 0
    checkpoint = tf.train.Checkpoint(model=model, optimizer=optimizer)
    latest_checkpoint = tf.train.latest_checkpoint(flags_obj.model_dir)
    if latest_checkpoint:
      checkpoint.restore(latest_checkpoint)
      logging.info("Load checkpoint %s", latest_checkpoint)
      current_step = optimizer.iterations.numpy()

    train_loss = tf.keras.metrics.Mean('train_loss', dtype=tf.float32)
    test_loss = tf.keras.metrics.Mean('test_loss', dtype=tf.float32)

    categorical_cross_entopy_and_acc = losses.CategoricalCrossEntropyAndAcc(
                                          batch_size=flags_obj.batch_size,
                                          num_classes=imagenet_preprocessing.NUM_CLASSES,
                                          label_smoothing=flags_obj.label_smoothing)
    trainable_variables = model.trainable_variables

    def step_fn(inputs):
      """Per-Replica StepFn."""
      images, labels = inputs
      with tf.GradientTape() as tape:
        logits = model(images, training=True)
        loss = categorical_cross_entopy_and_acc.loss_and_update_acc(labels, logits, training=True)
        #loss = tf.reduce_sum(prediction_loss) * (1.0/ flags_obj.batch_size)
        num_replicas = tf.distribute.get_strategy().num_replicas_in_sync

        if flags_obj.single_l2_loss_op:
          l2_loss = resnet_model.L2_WEIGHT_DECAY * 2 * tf.add_n([
              tf.nn.l2_loss(v)
              for v in trainable_variables
              if 'bn' not in v.name
          ])

          loss += (l2_loss / num_replicas)
        else:
          loss += (tf.reduce_sum(model.losses) / num_replicas)

        # Scale the loss
        if flags_obj.dtype == "fp16":
          loss = optimizer.get_scaled_loss(loss)

      grads = tape.gradient(loss, trainable_variables)

      # Unscale the grads
      if flags_obj.dtype == "fp16":
        grads = optimizer.get_unscaled_gradients(grads)

      optimizer.apply_gradients(zip(grads, trainable_variables))
      train_loss.update_state(loss)

    @tf.function
    def train_steps(iterator, steps):
      """Performs distributed training steps in a loop."""
      for _ in tf.range(steps):
        strategy.experimental_run_v2(step_fn, args=(next(iterator),))

    def train_single_step(iterator):
      if strategy:
        strategy.experimental_run_v2(step_fn, args=(next(iterator),))
      else:
        return step_fn(next(iterator))

    def test_step(iterator):
      """Evaluation StepFn."""
      def step_fn(inputs):
        images, labels = inputs
        logits = model(images, training=False)
        loss = categorical_cross_entopy_and_acc.loss_and_update_acc(labels, logits, training=False)
        #loss = tf.reduce_sum(loss) * (1.0/ flags_obj.batch_size)
        test_loss.update_state(loss)

      if strategy:
        strategy.experimental_run_v2(step_fn, args=(next(iterator),))
      else:
        step_fn(next(iterator))

    if flags_obj.use_tf_function:
      train_single_step = tf.function(train_single_step)
      test_step = tf.function(test_step)

    if flags_obj.enable_tensorboard:
      summary_writer = tf.summary.create_file_writer(flags_obj.model_dir)
    else:
      summary_writer = None

    train_iter = iter(train_ds)
    time_callback.on_train_begin()
    for epoch in range(current_step // per_epoch_steps, train_epochs):
      train_loss.reset_states()
      categorical_cross_entopy_and_acc.training_accuracy.reset_states()

      steps_in_current_epoch = 0
      while steps_in_current_epoch < per_epoch_steps:
        time_callback.on_batch_begin(
            steps_in_current_epoch+epoch*per_epoch_steps)
        steps = _steps_to_run(steps_in_current_epoch, per_epoch_steps,
                              steps_per_loop)
        if steps == 1:
          train_single_step(train_iter)
        else:
          # Converts steps to a Tensor to avoid tf.function retracing.
          train_steps(train_iter, tf.convert_to_tensor(steps, dtype=tf.int32))
        time_callback.on_batch_end(steps_in_current_epoch+epoch*per_epoch_steps)
        steps_in_current_epoch += steps

      #temp_loss = array_ops.identity(categorical_cross_entopy_and_acc.training_loss).numpy()
      #temp_loss = categorical_cross_entopy_and_acc.training_loss.numpy()
      logging.info('Training loss: %s, accuracy: %s, cross_entropy: %s at epoch %d',
                   train_loss.result().numpy(),
                   categorical_cross_entopy_and_acc.training_accuracy.result().numpy(),
                   0.,
                   epoch + 1)

      if (not flags_obj.skip_eval and
          (epoch + 1) % flags_obj.epochs_between_evals == 0):
        test_loss.reset_states()
        categorical_cross_entopy_and_acc.test_accuracy.reset_states()

        test_iter = iter(test_ds)
        for _ in range(eval_steps):
          test_step(test_iter)

        logging.info('Test loss: %s, accuracy: %s%% at epoch: %d',
                     test_loss.result().numpy(),
                     categorical_cross_entopy_and_acc.test_accuracy.result().numpy(),
                     epoch + 1)

      if flags_obj.enable_checkpoint_and_export:
        checkpoint_name = checkpoint.save(
            os.path.join(flags_obj.model_dir,
                         'model.ckpt-{}'.format(epoch + 1)))
        logging.info('Saved checkpoint to %s', checkpoint_name)

      if summary_writer:
        current_steps = steps_in_current_epoch + (epoch * per_epoch_steps)
        with summary_writer.as_default():
          #tf.summary.scalar('train_cross_entropy', categorical_cross_entopy_and_acc.training_loss.numpy(), current_steps)
          tf.summary.scalar('train_loss', train_loss.result(), current_steps)
          tf.summary.scalar('train_accuracy', categorical_cross_entopy_and_acc.training_accuracy.result(),
                            current_steps)
          lr_for_monitor = lr_schedule(current_steps)
          if callable(lr_for_monitor):
            lr_for_monitor = lr_for_monitor()
          tf.summary.scalar('learning_rate', lr_for_monitor, current_steps)
          tf.summary.scalar('eval_loss', test_loss.result(), current_steps)
          tf.summary.scalar(
              'eval_accuracy', categorical_cross_entopy_and_acc.test_accuracy.result(), current_steps)

    time_callback.on_train_end()
    if summary_writer:
      summary_writer.close()

    eval_result = None
    train_result = None
    if not flags_obj.skip_eval:
      eval_result = [test_loss.result().numpy(),
                     categorical_cross_entopy_and_acc.test_accuracy.result().numpy()]
      train_result = [train_loss.result().numpy(),
                      categorical_cross_entopy_and_acc.training_accuracy.result().numpy()]

    stats = build_stats(train_result, eval_result, time_callback)
    return stats
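
`_steps_to_run` is called in the inner training loop but not defined in this snippet. A sketch consistent with how it is used (run at most `steps_per_loop` steps without overshooting the epoch boundary) follows; this is a guess, not necessarily the project's helper:

def _steps_to_run(steps_in_current_epoch, steps_per_epoch, steps_per_loop):
  """Hypothetical helper: number of steps the next inner loop should run."""
  if steps_per_loop <= 0:
    raise ValueError('steps_per_loop must be a positive integer.')
  if steps_per_loop == 1:
    return steps_per_loop
  # Never run past the end of the current epoch.
  return min(steps_per_loop, steps_per_epoch - steps_in_current_epoch)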