示例#1
0
def train(model, input_dims, output_dims, seq_length, size, num_gpus, dataset,
          experiment_name, load_model, num_vids, n_epochs, split,
          base_data_path, f_name, learning_rate_init, wd, save_freq,
          clip_length, video_offset, clip_offset, num_clips, clip_stride,
          batch_size, loss_type, metrics_dir, loaded_checkpoint, verbose,
          opt_choice, gpu_list, grad_clip_value, preproc_method, random_init,
          shuffle_seed, preproc_debugging, reverse):
    """
    Training function used to train or fine-tune a chosen model.

    Builds one model replica ("tower") per GPU, averages and clips the tower
    gradients, and runs the resulting train op until `n_epochs * num_vids`
    distinct video names have been observed. The learning rate is divided by
    10 whenever the tracked mean loss changes by less than 0.001 between two
    consecutive checks.

    Note: `save_bool` is NOT a parameter of this function; it is looked up in
    the enclosing (module) scope and gates logging, checkpointing and saving
    of the tracked parameters.

    Args:
        :model:              tf-activity-recognition framework model object
        :input_dims:         Number of frames used in input
        :output_dims:        Integer number of classes in current dataset
        :seq_length:         Length of output sequence expected from LSTM
        :size:               List detailing height and width of frame
        :num_gpus:           Number of gpus to use when training
        :dataset:            Name of dataset being processed
        :experiment_name:    Name of current experiment
        :load_model:         Boolean variable indicating whether to load from a checkpoint or not
        :num_vids:           Number of videos to be used for training
        :n_epochs:           Total number of epochs to train
        :split:              Split of dataset being used
        :base_data_path:     Full path to root directory containing datasets
        :f_name:             Specific video directory within a chosen split of a dataset
        :learning_rate_init: Initializer for learning rate (overridden by the checkpoint value when load_model is set)
        :wd:                 Weight decay
        :save_freq:          Frequency, in epochs, with which to save
        :clip_length:        Length of clips to cut video into, -1 indicates using the entire video as one clip
        :video_offset:       String indicating where to begin selecting video clips (provided clipOffset is None)
        :clip_offset:        "none" or "random" indicating where to begin selecting video clips
        :num_clips:          Number of clips to break video into
        :clip_stride:        Number of frames that overlap between clips, 0 indicates no overlap and negative values indicate a gap of frames between clips
        :batch_size:         Number of clips to load into the model each step.
        :loss_type:          String declaring loss type associated with a chosen model
        :metrics_dir:        Name of subdirectory within the experiment to store metrics. Unique directory names allow for parallel testing
        :loaded_checkpoint:  Specify the exact checkpoint of saved model to be loaded for further training/testing
        :verbose:            Boolean to indicate if all print statements should be processed or not
        :opt_choice:         String indicating optimizer selected ('gd', 'adam', anything else selects momentum)
        :gpu_list:           List of GPU IDs to be used (empty list means GPUs 0..num_gpus-1)
        :grad_clip_value:    Float value at which to clip normalized gradients
        :preproc_method:     The preprocessing method to use, default, cvr, rr, sr, or any other custom preprocessing
        :random_init:        Randomly initialize model weights, not loading from any files (default False)
        :shuffle_seed:       Seed forwarded to load_dataset (presumably controls input shuffling - confirm against load_dataset)
        :preproc_debugging:  Boolean indicating whether to load videos and clips in a queue or to load them directly for debugging (Default 0)
        :reverse:            Boolean indicating whether reverse videos and classify them as a new action class.

    Returns:
        Does not return anything
    """

    with tf.name_scope("my_scope") as scope:

        # Initializers for checkpoint and global step variable
        ckpt = None
        gs_init = 0

        ################################### Checkpoint loading block #######################################################

        # Load pre-trained/saved model to continue training (or fine-tune)
        if load_model:
            try:
                ckpt, gs_init, learning_rate_init = load_checkpoint(
                    model.name, dataset, experiment_name, loaded_checkpoint,
                    preproc_method)

            except Exception:
                # Only genuine loading errors end the run here; KeyboardInterrupt
                # and SystemExit are no longer swallowed by a bare except.
                if verbose:
                    print("Failed loading checkpoint requested. Please check.")
                exit()

            else:
                if verbose:
                    print('A better checkpoint is found. The global_step value is: ' + str(gs_init))

            # END TRY
        else:
            ckpt = model.load_default_weights()

        # END IF

        ######################################################################################################################

        # Initialize model variables
        global_step = tf.Variable(gs_init, name='global_step', trainable=False)
        number_of_videos = tf.Variable(num_vids,
                                       name='number_of_videos',
                                       trainable=False)
        number_of_epochs = tf.Variable(n_epochs,
                                       name='number_of_epochs',
                                       trainable=False)
        video_step = tf.Variable(1.0, name='video_step', trainable=False)
        istraining = True
        reuse_variables = None

        # TF session setup
        config = tf.ConfigProto(allow_soft_placement=True)
        sess = tf.Session(config=config)

        # Initialize the variables created so far; video_step must be
        # initialized before the assign_add below can run.
        init = tf.global_variables_initializer()
        sess.run(init)

        tower_losses = []
        tower_grads = []
        tower_slogits = []

        data_path = os.path.join(base_data_path, 'tfrecords_' + dataset,
                                 'Split' + str(split), f_name)

        # Setup tensors for models
        # input_data_tensor - [batchSize, inputDims, height, width, channels]
        input_data_tensor, labels_tensor, names_tensor = load_dataset(
            model,
            num_gpus,
            batch_size,
            output_dims,
            input_dims,
            seq_length,
            size,
            data_path,
            dataset,
            istraining,
            clip_length,
            video_offset,
            clip_offset,
            num_clips,
            clip_stride,
            video_step,
            preproc_debugging,
            shuffle_seed,
            verbose,
            reverse=reverse)

        ############### TO DO: FIX THIS ASAP ########################
        if (batch_size == 1) and (num_clips == 1):
            sess.run(tf.assign_add(video_step, -2))

        else:
            sess.run(tf.assign_add(video_step, -1))

        # END IF
        ############################################################

        learning_rate = tf.Variable(learning_rate_init,
                                    name='learning_rate',
                                    trainable=False)

        # Op that divides the learning-rate VARIABLE by 10 in place. The
        # optimizer reads this variable, so the decay has to be an assignment:
        # `learning_rate /= 10` would only rebind the Python name to a new
        # tensor and leave the optimizer's learning rate untouched.
        decay_learning_rate = tf.assign(learning_rate, learning_rate / 10)

        # Define optimizer ('gd', 'adam'; any other choice falls back to momentum)
        if opt_choice == 'gd':
            optimizer = lambda lr: tf.train.GradientDescentOptimizer(lr)

        elif opt_choice == 'adam':
            optimizer = lambda lr: tf.train.AdamOptimizer(lr)

        else:
            optimizer = lambda lr: tf.train.MomentumOptimizer(learning_rate=lr,
                                                              momentum=0.9)

        # END IF
        """ Multi-GPU setup: 1) Associate gpu device to specific model replica
                             2) Setup tower name scope for variables
        """

        ################# GPU list check block ####################

        assert ((len(gpu_list) == num_gpus) or (len(gpu_list) == 0))

        if len(gpu_list) == 0:
            gpu_list = [str(x) for x in range(num_gpus)]

        # END IF

        ###########################################################

        ################################################## Setup TF graph block ######################################################
        for gpu_idx in range(num_gpus):
            gpu_id = gpu_list[gpu_idx]

            # Slice of the global batch handled by this tower
            batch_start = gpu_idx * batch_size
            batch_end = batch_start + batch_size

            with tf.device('/gpu:' + str(gpu_id)):
                with tf.name_scope('%s_%d' % ('tower', int(gpu_id))) as scope:
                    with tf.variable_scope(tf.get_variable_scope(),
                                           reuse=reuse_variables):
                        returned_layers = model.inference(
                            input_data_tensor[batch_start:batch_end, :, :, :, :],
                            istraining,
                            input_dims,
                            output_dims,
                            seq_length,
                            scope,
                            return_layer=['logits'],
                            weight_decay=wd)

                        logits = tf.cast(returned_layers[0], tf.float32)

                        # Calculating Softmax for probability outcomes : Can be modified, make function internal to model
                        slogits = tf.nn.softmax(logits)

                    # END WITH

                    # Every tower after the first shares the first tower's variables
                    reuse_variables = True
                    """ Within GPU mini-batch: 1) Calculate loss,
                                               2) Initialize optimizer with required learning rate and
                                               3) Compute gradients
                                               4) Aggregate losses, gradients and logits
                    """

                    total_loss = model.loss(
                        logits, labels_tensor[batch_start:batch_end, :],
                        loss_type)
                    opt = optimizer(learning_rate)
                    gradients = opt.compute_gradients(
                        total_loss, vars_.trainable_variables())

                    tower_losses.append(total_loss)
                    tower_grads.append(gradients)
                    tower_slogits.append(slogits)

                # END WITH

            # END WITH

        # END FOR
        """  After: 1) Computing gradients and losses need to be stored and averaged
                    2) Clip gradients by norm to required value
                    3) Apply mean gradient updates
        """

        gradients = _average_gradients(tower_grads)
        gradients, variables = zip(*gradients)
        clipped_gradients, _ = clip_ops.clip_by_global_norm(
            gradients, grad_clip_value)
        gradients = list(zip(clipped_gradients, variables))
        train_op = opt.apply_gradients(gradients,
                                       global_step=global_step,
                                       name="train")

        ############################################################################################################################################

        if save_bool:
            ######################### Logger Setup block ######################################

            # Logging setup initialization (Naming format: Date, month, hour, minute, second)
            log_name = (
                "exp_train_%s_%s_%s" %
                (time.strftime("%d_%m_%H_%M_%S"), dataset, experiment_name))

            # Create the results tree one level at a time (make_dir is called
            # per level, exactly as before, in case it does not create parents)
            results_path = 'results'
            make_dir(results_path)
            for sub_dir in (model.name, dataset, preproc_method,
                            experiment_name, 'checkpoints'):
                results_path = os.path.join(results_path, sub_dir)
                make_dir(results_path)

            # END FOR

            curr_logger = Logger(
                os.path.join('logs', model.name, dataset, preproc_method,
                             metrics_dir, log_name))

            ####################################################################################

        # END IF

        init = tf.global_variables_initializer()
        coord = tf.train.Coordinator()
        threads = queue_runner_impl.start_queue_runners(sess=sess, coord=coord)

        # Variables get randomly initialized into tf graph
        sess.run(init)

        # Check that weights were loaded or random initializations are requested
        if (ckpt is None) or random_init:
            print("Caution: Model weights are not being loaded, using random initialization.")

        else:
            # Model variables initialized from previous saved models
            initialize_from_dict(sess, ckpt, model.name)

        # END IF

        del ckpt

        # Initialize tracking variables
        previous_vid_name = ""
        videos_loaded = 0
        tot_count = 0
        epoch_count = 0
        tot_train_time = 0.0
        last_loss = None

        total_params = []
        losses_tracker = []

        # Timing test setup
        time_init = time.time()

        batch_count = 0
        epoch_acc = 0
        l_r = learning_rate_init

        # Defined up front so the final checkpoint save cannot hit an unbound
        # name when the training loop body never executes.
        gs = gs_init

        ########################################## Training loop block ################################################################

        # Loop epoch number of time over the training set
        while videos_loaded < n_epochs * num_vids:
            # Variable to update during epoch intervals
            epoch_end = (epoch_count + 1) * num_vids
            if epoch_end <= videos_loaded < epoch_end + num_gpus * batch_size:
                batch_count = 0
                epoch_acc = 0

                if epoch_count % save_freq == 0 and tot_count > 0:
                    if save_bool:
                        if verbose:
                            print("Saving...")

                        save_checkpoint(sess, model.name, dataset,
                                        experiment_name, preproc_method, l_r,
                                        global_step.eval(session=sess))

                # END IF

                epoch_count += 1

            # END IF

            time_pre_train = time.time()

            ######################################### Running TF training session block ##################################
            _, loss_train, predictions, gs, labels, vid_names, l_r, track_vars = sess.run(
                [
                    train_op, tower_losses, tower_slogits, global_step,
                    labels_tensor, names_tensor, learning_rate,
                    model.get_track_variables()
                ])

            ################################################################################################################

            if verbose:
                print(vid_names)

            # Count a video each time the clip's video name changes
            for name in vid_names:
                if name != previous_vid_name:
                    videos_loaded += 1
                    previous_vid_name = name
                tot_count += 1

            ######## Adaptive Learning Rate Control Block ############################

            losses_tracker.append(np.mean(loss_train))

            # NOTE(review): the mean always divides by 10 and the window is only
            # reset when it holds exactly 10 entries; kept as in the original.
            if videos_loaded % 10 == 0 and videos_loaded > 0:
                mean_recent_loss = sum(losses_tracker) / 10

                if (last_loss is not None
                        and abs(last_loss - mean_recent_loss) < 0.001):
                    # Loss has plateaued: update the variable the optimizer reads
                    sess.run(decay_learning_rate)

                # END IF

                last_loss = mean_recent_loss

                if len(losses_tracker) == 10:
                    losses_tracker = []

                # END IF

            # END IF

            ###########################################################################

            # Transpose the extracted layers such that the mean is taken across the gpus and over any matrix with more than 1 dimension
            tracked_names = list(track_vars.keys())
            params_array = []
            for key in tracked_names:
                curr_params = np.array(track_vars[key])
                n_dims = len(curr_params.shape)
                if n_dims > 1:
                    # Rotate axes so the original first axis becomes the last,
                    # then average over every axis except the new first one
                    indices = np.arange(n_dims) + 1
                    indices[-1] = 0
                    curr_params = curr_params.transpose(indices)
                    params_array.append(
                        np.mean(curr_params, axis=tuple(range(1, n_dims))))

                else:
                    params_array.append([np.mean(curr_params)])

                # END IF

            # END FOR

            #################### Training accuracy computation block ###############

            # Compute training epoch accuracy
            for gpu_pred_idx, gpu_predictions in enumerate(predictions):
                for batch_idx in range(gpu_predictions.shape[0]):
                    pred = np.mean(gpu_predictions[batch_idx], 0).argmax()

                    if pred == labels[gpu_pred_idx * batch_size +
                                      batch_idx][0]:
                        epoch_acc += 1

                    # END IF

                    batch_count += 1

                # END FOR

            # END FOR

            ###################### Add variables to be tracked to logger #############

            time_post_train = time.time()
            tot_train_time += time_post_train - time_pre_train

            if verbose:
                print('%s %s' % ('train_time: ', time_post_train - time_pre_train))
                print('%s %s %s' % ('step, loss: ', gs, loss_train))
                print('%s %s' % ('labels: ', labels))

            # END IF

            if save_bool:
                curr_logger.add_scalar_value('train/train_time',
                                             time_post_train - time_pre_train,
                                             step=gs)
                curr_logger.add_scalar_value('train/loss',
                                             float(np.mean(loss_train)),
                                             step=gs)
                curr_logger.add_scalar_value('train/epoch_acc',
                                             epoch_acc / float(batch_count),
                                             step=gs)

                for tracked_name, layer_params in zip(tracked_names,
                                                      params_array):
                    for p, value in enumerate(layer_params):
                        curr_logger.add_scalar_value(
                            'tracked_training_variables/' +
                            str(tracked_name + '_' + str(p)),
                            float(value),
                            step=gs)

                    # END FOR

                # END FOR

                total_params.append(params_array)

                curr_logger.add_scalar_value(
                    'tracked_training_variables/learning_rate',
                    float(l_r),
                    step=gs)

            # END IF

        # END WHILE

        #########################################################################################################################################################

        if save_bool:
            if verbose:
                print("Saving...")

            # END IF

            save_checkpoint(sess, model.name, dataset, experiment_name,
                            preproc_method, l_r, gs)

        # END IF

        # Always stop the input queue threads, not only when saving
        coord.request_stop()
        coord.join(threads)

        if verbose:
            print('%s %s' % ("Tot train time: ", tot_train_time))
            print('%s %s' % ("Tot time:       ", time.time() - time_init))

        # END IF

        # Save tracked parameterization variables as a numpy file, appending to
        # any values already stored for this experiment
        if save_bool and len(total_params) != 0:
            total_params = np.array(total_params).flatten()

            metrics_path = os.path.join('results', model.name, dataset,
                                        preproc_method, experiment_name,
                                        metrics_dir)
            make_dir(metrics_path)

            params_file = os.path.join(metrics_path,
                                       'train_params_' + dataset + '.npy')

            if os.path.isfile(params_file):
                loaded_params = np.load(params_file)
                total_params = np.concatenate([loaded_params, total_params])

            # END IF

            np.save(params_file, total_params)

        # END IF

    # END WITH