Example #1
def load_data(input_path, max_files, comm_rank=-1):
    load_data_timer_logger = logger(comm_rank, "Load Data", -1, True)
    load_data_timer_logger.start_timer()

    #look for data files (each HDF5 file holds both data and labels)
    files = sorted([x for x in os.listdir(input_path) if x.startswith("data")])

    #load only the first max_files files
    files = files[:max_files]

    #convert to numpy
    files = np.asarray(files)

    #permute the data (fixed seed keeps the split reproducible)
    np.random.seed(12345)
    shuffle_indices = np.random.permutation(len(files))
    np.save("./shuffle_indices.npy", shuffle_indices)
    files = files[shuffle_indices]

    #Create train/validation/test split
    size = len(files)
    trn_data = files[:int(0.8 * size)]
    tst_data = files[int(0.8 * size):int(0.9 * size)]
    val_data = files[int(0.9 * size):]

    load_data_timer_logger.end_timer()

    return trn_data, val_data, tst_data
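A minimal usage sketch for the function above; the directory path and file count are hypothetical:

# Hypothetical usage: split the first 100 "data*" files 80/10/10.
trn_data, val_data, tst_data = load_data("/path/to/hdf5_dir", max_files=100,
                                         comm_rank=0)
print("train/val/test:", len(trn_data), len(val_data), len(tst_data))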
Example #2
def create_dataset(h5ir,
                   datafilelist,
                   batchsize,
                   num_epochs,
                   comm_size,
                   comm_rank,
                   dtype,
                   shuffle=False):
    create_dataset_timer_logger = logger(comm_rank, "Create Dataset", -1, True)
    create_dataset_timer_logger.start_timer()

    if comm_size > 1:
        # use an equal number of files per shard, leaving out any leftovers
        per_shard = len(datafilelist) // comm_size
        sublist = datafilelist[0:per_shard * comm_size]
        dataset = tf.data.Dataset.from_tensor_slices(sublist)
        dataset = dataset.shard(comm_size, comm_rank)
    else:
        dataset = tf.data.Dataset.from_tensor_slices(datafilelist)
    if shuffle:
        dataset = dataset.shuffle(buffer_size=100)
    dataset = dataset.map(map_func=lambda dataname: tuple(
        tf.py_func(h5ir.read, [dataname], [dtype, tf.int32, dtype])),
                          num_parallel_calls=4)
    dataset = dataset.prefetch(16)
    # make sure all batches are equal in size
    dataset = dataset.apply(
        tf.contrib.data.batch_and_drop_remainder(batchsize))
    dataset = dataset.repeat(num_epochs)

    create_dataset_timer_logger.end_timer()

    return dataset
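A hedged sketch of consuming the returned dataset with the TF1 initializable-iterator pattern used later in main(); `reader` is assumed to be an h5_input_reader instance and `files` a list of HDF5 file names:

dataset = create_dataset(reader, files, batchsize=2, num_epochs=1,
                         comm_size=1, comm_rank=0, dtype=tf.float32)
iterator = dataset.make_initializable_iterator()
data, label, weights = iterator.get_next()
with tf.Session() as sess:
    sess.run(iterator.initializer)
    d, l, w = sess.run([data, label, weights])  # one batch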
Example #3
    def global_shuffle(self):
        global_shuffle_time_logger = logger(self._taskid, "Global Shuffle",
                                            self._epochs_completed)
        global_shuffle_time_logger.start_timer()

        self._num_files = len(self._full_filelist)
        shuffled_file_id_list = self._shuffle_rng.permutation(self._num_files)
        start = 0
        end = self._num_files

        shuffled_filelist = []

        for file_id in shuffled_file_id_list:
            shuffled_filelist.append(self._full_filelist[file_id])

        self._full_filelist = shuffled_filelist

        if self._split_filelist:
            self._num_files = int(
                np.floor(len(self._full_filelist) / float(self._num_tasks)))
            start = self._taskid * self._num_files
            end = start + self._num_files

        assert self._num_files > 0, ('filelist is empty')

        self._filelist = self._full_filelist[start:end]

        global_shuffle_time_logger.end_timer()
Example #4
def _h5_input_subprocess_reader(path,
                                channels,
                                weights,
                                minvals,
                                maxvals,
                                update_on_read,
                                dtype,
                                comm_rank=-1):
    #begin_time = time.time()

    # comm_rank must be passed in explicitly since this runs in a worker subprocess
    image_reader_timer_logger = logger(comm_rank, "Time to Read Single Image",
                                       -1, True)
    image_reader_timer_logger.start_timer()

    with h5.File(path,
                 "r",
                 driver="core",
                 backing_store=False,
                 libver="latest") as f:
        #get min and max values and update stored values
        if update_on_read:
            minvals = np.minimum(minvals, f['climate']['stats'][channels, 0])
            maxvals = np.maximum(maxvals, f['climate']['stats'][channels, 1])

        #get data
        if 'channels' in f['climate']:
            # some channels have been dropped from the file, so map to the
            #  actual locations in the file array
            channel_list = list(f['climate']['channels'])
            channels = [channel_list.index(c) for c in channels]
        data = f['climate']['data'][channels, :, :]

        # cast data if needed
        if data.dtype != dtype:
            #data = f['climate']['data'][channels,:,:].astype(dtype)
            data = data.astype(dtype)

        #do min/max normalization
        for c in range(len(channels)):
            data[c, :, :] = (data[c, :, :] - minvals[c]) / (maxvals[c] -
                                                            minvals[c])

        #get label
        label = f['climate']['labels'][...]
        if label.dtype != np.int32:
            label = label.astype(np.int32)

    #get weights - choose per-channel based on the labels
    weights = weights[label]

    #time
    #end_time = time.time()
    #print "Time to read image %.3f s" % (end_time-begin_time)

    image_reader_timer_logger.end_timer()

    return data, label, weights, minvals, maxvals
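A standalone numpy illustration of the per-channel min/max scaling performed above, which maps each channel into [0, 1]:

import numpy as np

data = np.array([[0., 5., 10.]])                   # one channel, three pixels
minvals, maxvals = np.array([0.]), np.array([10.])
data[0, :] = (data[0, :] - minvals[0]) / (maxvals[0] - minvals[0])
print(data)  # [[0.  0.5 1. ]]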
Example #5
    def next_batch(self, batch_size):
        """Return the next `batch_size` examples from this data set."""
        next_batch_time_logger = logger(self._taskid, "Call to Next Batch",
                                        self._epochs_completed)
        next_batch_time_logger.start_timer()
        start = self._data_index
        self._data_index += batch_size
        end = int(np.min([self._num_examples, self._data_index]))

        #take what is there
        images = self._images[start:end]
        labels = self._labels[start:end]
        normweights = self._normweights[start:end]
        weights = self._weights[start:end]
        psr = self._psr[start:end]

        if self._data_index > self._num_examples:
            #number of examples still needed to fill this batch:
            remaining = self._data_index - self._num_examples

            #first, reset data_index and increase file index:
            self._data_index = 0
            self._file_index += 1

            #check if we are at the end of the file list
            if self._file_index >= self._num_files:
                #epoch is finished
                self._epochs_completed += 1
                #reset file index and shuffle list
                self._file_index = 0

                # Shuffle the full filelist and redistribute the files to the nodes
                if self._global_shuffle:
                    self.global_shuffle()

                # Local shuffle again as done before
                np.random.shuffle(self._filelist)

            #load the next file
            self.load_next_file()
            #assert batch_size <= self._num_examples
            #call recursively
            tmpimages, tmplabels, tmpnormweights, tmpweights, tmppsr = self.next_batch(
                remaining)
            #join
            images = np.concatenate([images, tmpimages], axis=0)
            labels = np.concatenate([labels, tmplabels], axis=0)
            normweights = np.concatenate([normweights, tmpnormweights], axis=0)
            weights = np.concatenate([weights, tmpweights], axis=0)
            psr = np.concatenate([psr, tmppsr], axis=0)

        next_batch_time_logger.end_timer()

        return images, labels, normweights, weights, psr
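A hedged sketch of driving this method for one epoch; `ds` stands in for an instance of the surrounding data-set class:

while ds._epochs_completed == 0:
    images, labels, normweights, weights, psr = ds.next_batch(64)
    # ...feed the batch into the model here...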
Example #6
    def read(self, datafile):
        read_image_timer_logger = logger(
            self.comm_rank, "Parallel Read Images with 4 Threads", -1, True)
        read_image_timer_logger.start_timer()

        path = self.path + '/' + datafile
        #begin_time = time.time()
        #nvtx.RangePush('h5_input', 8)
        data, label, weights, new_minvals, new_maxvals = self.pool.apply(
            _h5_input_subprocess_reader,
            (path, self.channels, self.weights, self.minvals, self.maxvals,
             self.update_on_read, self.dtype, self.comm_rank))
        if self.update_on_read:
            self.minvals = np.minimum(self.minvals, new_minvals)
            self.maxvals = np.maximum(self.maxvals, new_maxvals)
        #nvtx.RangePop()
        #end_time = time.time()
        #print "Time to read %s = %.3f s" % (path, end_time-begin_time)

        read_image_timer_logger.end_timer()
        return data, label, weights
Example #7
    def next_batch(self, batch_size):
        dummy_set_next_batch_time_logger = logger(-1,
                                                  "Dummy DataSet Next Batch")
        dummy_set_next_batch_time_logger.start_timer()

        data = np.reshape(self._random.rand(self._datasize * batch_size),
                          [batch_size] + self._shape)
        labels = np.expand_dims(self._random.random_integers(0, 1, batch_size),
                                1)
        normweights = np.expand_dims(self._random.rand(batch_size), 1)
        weights = normweights
        psr = labels

        #increase data counter and check if epoch finished
        self._data_index += batch_size
        if self._data_index >= self._samples_per_epoch:
            self._data_index = 0
            self._epochs_completed += 1

        dummy_set_next_batch_time_logger.end_timer()
        return data, labels, normweights, weights, psr
Example #8
def load_model(sess, saver, checkpoint_dir, comm_rank=-1):
    load_model_timer_logger = logger(comm_rank, "Load Model", -1, True)
    load_model_timer_logger.start_timer()

    print("Looking for model in {}".format(checkpoint_dir))
    #get list of checkpoints
    checkpoints = [
        x.replace(".index", "") for x in os.listdir(checkpoint_dir)
        if x.startswith("model.ckpt") and x.endswith(".index")
    ]
    checkpoints = sorted([(int(x.split("-")[1]), x) for x in checkpoints],
                         key=lambda tup: tup[0])
    latest_ckpt = os.path.join(checkpoint_dir, checkpoints[-1][1])
    print("Restoring model {}".format(latest_ckpt))
    try:
        saver.restore(sess, latest_ckpt)
        print("Model restoration successful.")
    except Exception:
        print("Loading model failed, starting fresh.")

    load_model_timer_logger.end_timer()
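A minimal usage sketch, assuming the graph's variables already exist and the directory (hypothetical here) contains model.ckpt-<step>.index files:

saver = tf.train.Saver()
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    load_model(sess, saver, "./checkpoints", comm_rank=0)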
Example #9
    def sequential_read(self, datafile):
        read_image_timer_logger = logger(self.comm_rank,
                                         "Sequential Read Images", -1, True)
        read_image_timer_logger.start_timer()

        #data
        #begin_time = time.time()
        with h5.File(self.path + '/' + datafile,
                     "r",
                     driver="core",
                     backing_store=False,
                     libver="latest") as f:
            #get min and max values and update stored values
            if self.update_on_read:
                self.minvals = np.minimum(
                    self.minvals, f['climate']['stats'][self.channels, 0])
                self.maxvals = np.maximum(
                    self.maxvals, f['climate']['stats'][self.channels, 1])
            #get data
            data = f['climate']['data'][self.channels, :, :].astype(np.float32)
            #do min/max normalization
            for c in range(len(self.channels)):
                data[c, :, :] = (data[c, :, :] - self.minvals[c]) / (
                    self.maxvals[c] - self.minvals[c])

            #get label
            label = f['climate']['labels'][...].astype(np.int32)

            #get weights
            weights = np.zeros(label.shape, dtype=np.float32)
            for idx, w in enumerate(self.weights):
                weights[np.where(label == idx)] = w

        #time
        #end_time = time.time()
        #print "Time to read image %.3f s" % (end_time-begin_time)

        read_image_timer_logger.end_timer()

        return data, label, weights
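A standalone numpy illustration of the label-to-weight mapping in the loop above, which assigns one weight per class to every pixel:

import numpy as np

label = np.array([[0, 1], [2, 1]], dtype=np.int32)
class_weights = [0.1, 1.0, 5.0]
weights = np.zeros(label.shape, dtype=np.float32)
for idx, w in enumerate(class_weights):
    weights[np.where(label == idx)] = w
print(weights)  # [[0.1 1. ] [5.  1. ]]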
Example #10
def build_functions(args, variables, network):
    build_functions_time_logger = logger(int(args['task_index']),
                                         "Build Functions")
    build_functions_time_logger.start_timer()
    with args['graph'].as_default():

        #additional variables
        variables['labels_'] = tf.placeholder(
            tf.int32, shape=[args['train_batch_size_per_node'], 1])
        variables['weights_'] = tf.placeholder(
            args["precision"], shape=[args['train_batch_size_per_node'], 1])

        #loss function
        prediction = network[-1]
        tf.add_to_collection('prediction_op', prediction)

        #compute loss, important: use unscaled version!
        loss = tf.losses.sparse_softmax_cross_entropy(
            variables['labels_'], network[-2], weights=variables['weights_'])

        #compute accuracy
        accuracy = tf.metrics.accuracy(variables['labels_'],
                                       tf.round(prediction[:, 1]),
                                       weights=variables['weights_'],
                                       name='accuracy')

        #compute AUC
        auc = tf.metrics.auc(variables['labels_'],
                             prediction[:, 1],
                             weights=variables['weights_'],
                             num_thresholds=5000,
                             curve='ROC',
                             name='AUC')

    build_functions_time_logger.end_timer()
    #return functions
    return variables, prediction, loss, accuracy, auc
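tf.metrics.accuracy and tf.metrics.auc return (value, update_op) pairs, so the accuracy/auc objects returned above are consumed in two steps; a sketch assuming a live session and a prepared feed_dict:

sess.run(tf.local_variables_initializer())  # metric counters are local vars
sess.run(accuracy[1], feed_dict=feed_dict)  # accumulate counts for one batch
acc_value = sess.run(accuracy[0])           # read the running accuracy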
Example #11
def build_cnn_model(args):
    build_cnn_model_time_logger = logger(int(args['task_index']),
                                         "Build CNN Model")
    build_cnn_model_time_logger.start_timer()
    #datatype
    dtype = args["precision"]

    #find out which device to use:
    device = '/cpu:0'
    if args['arch'] == 'gpu':
        device = '/gpu:0'

    #define empty variables dict
    variables = {}

    #rotate input shape depending on data format
    data_format = args['conv_params']['data_format']
    input_shape = args['input_shape']

    #create graph handle
    args['graph'] = tf.Graph()
    #KFAC stuff
    if args["optimizer"] == "KFAC":
        args["opt_args"][
            "layer_collection"] = tf.contrib.kfac.layer_collection.LayerCollection(
            )

    with args['graph'].as_default():

        #create placeholders
        variables['images_'] = tf.placeholder(
            dtype, shape=[args['train_batch_size_per_node']] + input_shape)
        variables['keep_prob_'] = tf.placeholder(dtype)

        #empty network:
        network = []

        #input layer
        network.append(
            tf.reshape(variables['images_'], [-1] + input_shape, name='input'))

        #get all the conv-args stuff:
        activation = args['conv_params']['activation']
        initializer = args['conv_params']['initializer']
        ksize = args['conv_params']['filter_size']
        num_filters = args['conv_params']['num_filters']
        padding = str(args['conv_params']['padding'])

        #conv layers:
        prev_num_filters = args['input_shape'][0]
        if data_format == "NHWC":
            prev_num_filters = args['input_shape'][2]

        for layerid in range(1, args['num_layers'] + 1):

            #create weight-variable
            #with tf.device(device):
            variables['conv' + str(layerid) + '_w'] = tf.Variable(
                initializer([ksize, ksize, prev_num_filters, num_filters],
                            dtype=dtype),
                name='conv' + str(layerid) + '_w',
                dtype=dtype)
            prev_num_filters = num_filters

            #conv unit
            network.append(
                tf.nn.conv2d(network[-1],
                             filter=variables['conv' + str(layerid) + '_w'],
                             strides=[1, 1, 1, 1],
                             padding=padding,
                             data_format=data_format,
                             name='conv' + str(layerid)))

            #batchnorm if desired
            outshape = network[-1].shape[1:]
            if args['batch_norm']:
                #add batchnorm
                #with tf.device(device):
                #mu
                variables['bn' + str(layerid) + '_m'] = tf.Variable(
                    tf.zeros(outshape, dtype=dtype),
                    name='bn' + str(layerid) + '_m',
                    dtype=dtype)
                #sigma
                variables['bn' + str(layerid) + '_s'] = tf.Variable(
                    tf.ones(outshape, dtype=dtype),
                    name='bn' + str(layerid) + '_s',
                    dtype=dtype)
                #gamma
                variables['bn' + str(layerid) + '_g'] = tf.Variable(
                    tf.ones(outshape, dtype=dtype),
                    name='bn' + str(layerid) + '_g',
                    dtype=dtype)
                #beta
                variables['bn' + str(layerid) + '_b'] = tf.Variable(
                    tf.zeros(outshape, dtype=dtype),
                    name='bn' + str(layerid) + '_b',
                    dtype=dtype)
                #add batch norm layer
                network.append(
                    tf.nn.batch_normalization(
                        network[-1],
                        mean=variables['bn' + str(layerid) + '_m'],
                        variance=variables['bn' + str(layerid) + '_s'],
                        offset=variables['bn' + str(layerid) + '_b'],
                        scale=variables['bn' + str(layerid) + '_g'],
                        variance_epsilon=1.e-4,
                        name='bn' + str(layerid)))
            else:
                bshape = (variables['conv' + str(layerid) + '_w'].shape[3])
                variables['conv' + str(layerid) + '_b'] = tf.Variable(
                    tf.zeros(bshape, dtype=dtype),
                    name='conv' + str(layerid) + '_b',
                    dtype=dtype)
                #add bias
                if dtype != tf.float16:
                    network.append(
                        tf.nn.bias_add(network[-1],
                                       variables['conv' + str(layerid) + '_b'],
                                       data_format=data_format))
                else:
                    print(
                        "Warning: bias-add currently not supported for fp16!")

                if args["optimizer"] == "KFAC":
                    args["opt_args"]["layer_collection"].register_conv2d(
                        (variables['conv' + str(layerid) + '_w'],
                         variables['conv' + str(layerid) + '_b']),
                        [1, 1, 1, 1], padding, network[-3], network[-1])

            #add relu unit
            #with tf.device(device):
            network.append(activation(network[-1]))

            #add maxpool
            #with tf.device(device):
            kshape = [1, 1, 2, 2]
            sshape = [1, 1, 2, 2]
            if data_format == "NHWC":
                kshape = [1, 2, 2, 1]
                sshape = [1, 2, 2, 1]
            network.append(
                tf.nn.max_pool(network[-1],
                               ksize=kshape,
                               strides=sshape,
                               padding=args['conv_params']['padding'],
                               data_format=data_format,
                               name='maxpool' + str(layerid)))

            #add dropout
            #with tf.device(device):
            network.append(
                tf.nn.dropout(network[-1],
                              keep_prob=variables['keep_prob_'],
                              name='drop' + str(layerid)))

        if args['scaling_improvements']:
            #add another conv layer with average pooling to the mix
            #with tf.device(device):
            variables['conv' + str(layerid + 1) + '_w'] = tf.Variable(
                initializer([ksize, ksize, prev_num_filters, num_filters],
                            dtype=dtype),
                name='conv' + str(layerid + 1) + '_w',
                dtype=dtype)
            prev_num_filters = num_filters

            #conv unit
            network.append(
                tf.nn.conv2d(network[-1],
                             filter=variables['conv' + str(layerid + 1) +
                                              '_w'],
                             strides=[1, 1, 1, 1],
                             padding=padding,
                             data_format=data_format,
                             name='conv' + str(layerid + 1)))

            #bias
            bshape = (variables['conv' + str(layerid + 1) + '_w'].shape[3])
            variables['conv' + str(layerid + 1) + '_b'] = tf.Variable(
                tf.zeros(bshape, dtype=dtype),
                name='conv' + str(layerid + 1) + '_b',
                dtype=dtype)
            #add bias
            if dtype != tf.float16:
                network.append(
                    tf.nn.bias_add(network[-1],
                                   variables['conv' + str(layerid + 1) + '_b'],
                                   data_format=data_format))
            else:
                print("Warning: bias-add currently snot supported for fp16!")

            if args["optimizer"] == "KFAC":
                args["opt_args"]["layer_collection"].register_conv2d(
                    (variables['conv' + str(layerid + 1) + '_w'],
                     variables['conv' + str(layerid + 1) + '_b']),
                    [1, 1, 1, 1], padding, network[-3], network[-1])

            #add relu unit
            #with tf.device(device):
            network.append(activation(network[-1]))

            #add average-pool
            #with tf.device(device):
            #pool over everything
            imsize = network[-1].shape[2]
            kshape = [1, 1, imsize, imsize]
            sshape = [1, 1, imsize, imsize]
            if data_format == "NHWC":
                kshape = [1, imsize, imsize, 1]
                sshape = [1, imsize, imsize, 1]
            network.append(
                tf.nn.avg_pool(network[-1],
                               ksize=kshape,
                               strides=sshape,
                               padding=args['conv_params']['padding'],
                               data_format=data_format,
                               name='avgpool1'))

        #reshape
        outsize = np.prod(network[-1].shape[1:]).value
        #with tf.device(device):
        network.append(
            tf.reshape(network[-1], shape=[-1, outsize], name='flatten'))

        if not args['scaling_improvements']:
            #now do the MLP
            #fc1
            #with tf.device(device):
            variables['fc1_w'] = tf.Variable(initializer(
                [outsize, args['num_fc_units']], dtype=dtype),
                                             name='fc1_w',
                                             dtype=dtype)
            variables['fc1_b'] = tf.Variable(tf.zeros([args['num_fc_units']],
                                                      dtype=dtype),
                                             name='fc1_b',
                                             dtype=dtype)
            network.append(
                tf.matmul(network[-1], variables['fc1_w']) +
                variables['fc1_b'])
            if args["optimizer"] == "KFAC":
                args["opt_args"]["layer_collection"].register_fully_connected(
                    (variables['fc1_w'], variables['fc1_b']), network[-2],
                    network[-1])

            #add relu unit
            #with tf.device(device):
            network.append(activation(network[-1]))

            #add dropout
            #with tf.device(device):
            network.append(
                tf.nn.dropout(network[-1],
                              keep_prob=variables['keep_prob_'],
                              name='drop' + str(layerid)))
            #fc2
            #with tf.device(device):
            variables['fc2_w'] = tf.Variable(initializer(
                [args['num_fc_units'], 2], dtype=dtype),
                                             name='fc2_w',
                                             dtype=dtype)
            variables['fc2_b'] = tf.Variable(tf.zeros([2], dtype=dtype),
                                             name='fc2_b',
                                             dtype=dtype)
            network.append(
                tf.matmul(network[-1], variables['fc2_w']) +
                variables['fc2_b'])
            if args["optimizer"] == "KFAC":
                args["opt_args"]["layer_collection"].register_fully_connected(
                    (variables['fc2_w'], variables['fc2_b']), network[-2],
                    network[-1])

        else:
            #only one FC layer here
            #with tf.device(device):
            variables['fc1_w'] = tf.Variable(initializer([outsize, 2],
                                                         dtype=dtype),
                                             name='fc1_w',
                                             dtype=dtype)
            variables['fc1_b'] = tf.Variable(tf.zeros([2], dtype=dtype),
                                             name='fc1_b',
                                             dtype=dtype)
            network.append(
                tf.matmul(network[-1], variables['fc1_w']) +
                variables['fc1_b'])
            if args["optimizer"] == "KFAC":
                args["opt_args"]["layer_collection"].register_fully_connected(
                    (variables['fc1_w'], variables['fc1_b']), network[-2],
                    network[-1])

        #register logits for KFAC:
        if args["optimizer"] == "KFAC":
            args["opt_args"][
                "layer_collection"].register_categorical_predictive_distribution(
                    network[-1], name="logits")

        #add softmax
        #with tf.device(device):
        network.append(tf.nn.softmax(network[-1]))
    build_cnn_model_time_logger.end_timer()
    #return the network and variables
    return variables, network
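An illustrative (partial) args dict for the function above; every key mirrors a lookup in the code, but the values are placeholders chosen only for this sketch:

args = {
    'task_index': 0,
    'precision': tf.float32,
    'arch': 'cpu',
    'input_shape': [4, 64, 64],          # CHW, matching NCHW data format
    'train_batch_size_per_node': 32,
    'num_layers': 3,
    'num_fc_units': 256,
    'batch_norm': True,
    'scaling_improvements': False,
    'optimizer': 'ADAM',
    'opt_args': {},
    'conv_params': {
        'data_format': 'NCHW',
        'activation': tf.nn.relu,
        'initializer': tf.keras.initializers.he_normal(),
        'filter_size': 3,
        'num_filters': 64,
        'padding': 'SAME',
    },
}
variables, network = build_cnn_model(args)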
Example #12
def parse_arguments():
    parse_arg_logger = logger(-1, "Parse Arguments")
    parse_arg_logger.start_timer()

    parser = argparse.ArgumentParser()
    parser.add_argument("--config",
                        type=str,
                        help="specify a config file in json format")
    parser.add_argument("--num_tasks",
                        type=int,
                        default=1,
                        help="specify the number of tasks")
    parser.add_argument(
        "--precision",
        type=str,
        default="fp32",
        help="specify the precision. supported are fp32 and fp16")
    parser.add_argument('--dummy_data',
                        action='store_const',
                        const=True,
                        default=False,
                        help='use dummy data instead of real data')
    parser.add_argument("--disable_training",
                        help="Disable training for test purpose",
                        action='store_true')
    parser.add_argument("--enable_tf_timeline",
                        help="Enable Timeline module for tracing TF workflow",
                        action='store_true')
    pargs = parser.parse_args()

    #load the json:
    with open(pargs.config, "r") as f:
        args = json.load(f)

    #set the rest
    args['num_tasks'] = pargs.num_tasks
    args['num_ps'] = 0
    args['dummy_data'] = pargs.dummy_data
    args['disable_training'] = pargs.disable_training
    args['enable_tf_timeline'] = pargs.enable_tf_timeline

    #modify the activations
    if args['conv_params']['activation'] == 'ReLU':
        args['conv_params']['activation'] = tf.nn.relu
    else:
        raise ValueError('Only ReLU is supported as activation')

    #modify the initializers
    if args['conv_params']['initializer'] == 'HE':
        args['conv_params']['initializer'] = tfk.initializers.he_normal()
    else:
        raise ValueError('Only HE is supported as initializer')

    #modify the optimizers
    args['opt_args'] = {"learning_rate": args['learning_rate']}
    if args['optimizer'] == 'KFAC':
        args['opt_func'] = tf.contrib.kfac.optimizer.KfacOptimizer
        args['opt_args']['cov_ema_decay'] = args['cov_ema_decay']
        args['opt_args']['damping'] = args['damping']
        args['opt_args']['momentum'] = args['momentum']
    elif args['optimizer'] == 'ADAM':
        args['opt_func'] = tf.train.AdamOptimizer
    else:
        raise ValueError('Only ADAM and KFAC are supported as optimizer')

    #now, see if all the paths are there
    args['logpath'] = args['outputpath'] + '/logs'
    args['modelpath'] = args['outputpath'] + '/models'

    if not os.path.isdir(args['logpath']):
        print("Creating log directory ", args['logpath'])
        os.makedirs(args['logpath'])
    if not os.path.isdir(args['modelpath']):
        print("Creating model directory ", args['modelpath'])
        os.makedirs(args['modelpath'])
    if not os.path.isdir(args['inputpath']) and not args['dummy_data']:
        raise ValueError(
            "Please specify a valid path with input files in hdf5 format")

    #precision:
    args['precision'] = tf.float32
    if pargs.precision == "fp16":
        args['precision'] = tf.float16

    parse_arg_logger.end_timer()
    return args
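An illustrative, deliberately partial config for parse_arguments, written here as a Python dict dumped to the JSON file the --config flag expects; only keys consumed above are shown, and the values are placeholders:

import json

config = {
    "outputpath": "./output",
    "inputpath": "./data",
    "learning_rate": 1e-4,
    "optimizer": "ADAM",
    "conv_params": {"activation": "ReLU", "initializer": "HE"},
}
with open("config.json", "w") as f:
    json.dump(config, f)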
Example #13
def _h5_input_subprocess_reader(path,
                                channels,
                                weights,
                                minvals,
                                maxvals,
                                update_on_read,
                                dtype,
                                comm_rank=-1):
    #begin_time = time.time()

    # need to send the comm_rank here
    image_reader_timer_logger = logger(comm_rank, "Time to Read Single Image",
                                       -1, True)
    image_reader_timer_logger.start_timer()
    #Edited on July 16 2018 by Jialin Liu:
    #the real HDF5 reads below are replaced with fake in-memory data
    #to avoid any I/O calls to the file system
    dsize = [4, 768, 1152]  # Hard coded size for climate->data
    data = np.random.rand(len(channels), dsize[1], dsize[2])
    #data = f['climate']['data'][channels,:,:]

    # cast data if needed
    if data.dtype != dtype:
        #data = f['climate']['data'][channels,:,:].astype(dtype)
        data = data.astype(dtype)

    #do min/max normalization
    for c in range(len(channels)):
        data[c, :, :] = (data[c, :, :] - minvals[c]) / (maxvals[c] -
                                                        minvals[c])

    # get label dataset shape
    # label_shape = f['climate']['labels'].shape
    label_shape = [768, 1152]  # Hard coded size of climate->labels
    # generate same size of label data in memory
    label = np.random.rand(label_shape[0], label_shape[1])
    #get label
    #label = f['climate']['labels'][...]

    if label.dtype != np.int32:
        label = label.astype(np.int32)
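    # NOTE: np.random.rand draws floats in [0, 1), so the int32 cast above
    # zeroes every label; that is fine for I/O benchmarking, not for accuracy.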

    # with h5.File(path, "r", driver="core", backing_store=False, libver="latest") as f:
    #     #get min and max values and update stored values
    #     if update_on_read:
    #         minvals = np.minimum(minvals, f['climate']['stats'][channels,0])
    #         maxvals = np.maximum(maxvals, f['climate']['stats'][channels,1])

    #     #get data
    #     if 'channels' in f['climate']:
    #         # some channels have been dropped from the file, so map to the
    #         #  actual locations in the file array
    #         channel_list = list(f['climate']['channels'])
    #         channels = [ channel_list.index(c) for c in channels ]
    #     data = f['climate']['data'][channels,:,:]

    #     # cast data if needed
    #     if data.dtype != dtype:
    #         #data = f['climate']['data'][channels,:,:].astype(dtype)
    #         data = data.astype(dtype)

    #     #do min/max normalization
    #     for c in range(len(channels)):
    #         data[c,:,:] = (data[c,:,:]-minvals[c])/(maxvals[c]-minvals[c])

    #     #get label
    #     label = f['climate']['labels'][...]
    #     if label.dtype != np.int32:
    #         label = label.astype(np.int32)

    #get weights - choose per-channel based on the labels
    weights = weights[label]

    #time
    #end_time = time.time()
    #print "Time to read image %.3f s" % (end_time-begin_time)

    image_reader_timer_logger.end_timer()

    return data, label, weights, minvals, maxvals
Example #14
                    break

            nvtx.RangePop()  # Epoch
            nvtx.RangePop()  # Training Loop

            training_loop_timer_logger.end_timer()

    if enable_tf_timeline:
        many_runs_timeline.save('Timeliner_output.json')

    io_training_time_logger.end_timer()
    global_time_logger.end_timer()


if __name__ == '__main__':
    argparse_timer_logger = logger(-1, "Parse Arguments", -1, True)
    argparse_timer_logger.start_timer()

    AP = argparse.ArgumentParser()
    AP.add_argument("--lr", default=1e-4, type=float, help="Learning rate")
    AP.add_argument("--blocks",
                    default=[3, 3, 4, 4, 7, 7, 10],
                    type=int,
                    nargs="*",
                    help="Number of layers per block")
    AP.add_argument("--output",
                    type=str,
                    default='output',
                    help="Defines the location and name of output directory")
    AP.add_argument(
        "--chkpt",
Example #15
def train_loop(sess, train_step, global_step, optlist, args, trainset,
               validationset, disable_training, enable_tf_timeline):
    train_loop_logger = logger(int(args["task_index"]), "Train Loop")
    train_loop_logger.start_timer()

    options = None
    run_metadata = None
    many_runs_timeline = None

    if enable_tf_timeline:
        options = tf.RunOptions(trace_level=tf.RunOptions.FULL_TRACE)
        run_metadata = tf.RunMetadata()
        many_runs_timeline = timeliner()

    #counter stuff
    trainset.reset()
    validationset.reset()

    #restore weights belonging to graph
    epochs_completed = 0
    if not args['restart']:
        last_model = tf.train.latest_checkpoint(args['modelpath'])
        print("Restoring model %s.", last_model)
        model_saver.restore(sess, last_model)

    #losses
    train_loss = 0.
    train_batches = 0
    total_batches = 0
    train_time = 0

    #do training
    while not sess.should_stop():
        train_iteration_logger = logger(int(args['task_index']),
                                        "Training Iteration", epochs_completed)
        train_iteration_logger.start_timer()

        #increment total batch counter
        total_batches += 1

        #get next batch
        images, labels, normweights, _, _ = trainset.next_batch(
            args['train_batch_size_per_node'])
        #set weights to zero
        normweights[:] = 1.
        #set up feed dict:
        feed_dict = {
            variables['images_']: images,
            variables['labels_']: labels,
            variables['weights_']: normweights,
            variables['keep_prob_']: args['dropout_p']
        }

        if not disable_training:
            #update weights
            start_time = time.time()
            if args['create_summary']:
                _, gstep, summary, tmp_loss = sess.run(
                    [train_step, global_step, train_summary, loss_fn],
                    feed_dict=feed_dict,
                    options=options,
                    run_metadata=run_metadata)

                if enable_tf_timeline:
                    fetched_timeline = timeline.Timeline(
                        run_metadata.step_stats)
                    chrome_trace = fetched_timeline.generate_chrome_trace_format(
                    )
                    many_runs_timeline.update_timeline(chrome_trace)
            else:
                _, gstep, tmp_loss = sess.run(
                    [train_step, global_step, loss_fn],
                    feed_dict=feed_dict,
                    options=options,
                    run_metadata=run_metadata)

                if enable_tf_timeline:
                    fetched_timeline = timeline.Timeline(
                        run_metadata.step_stats)
                    chrome_trace = fetched_timeline.generate_chrome_trace_format(
                    )
                    many_runs_timeline.update_timeline(chrome_trace)

            #update kfac parameters
            if optlist:
                sess.run(optlist[0],
                         feed_dict=feed_dict,
                         options=options,
                         run_metadata=run_metadata)

                if enable_tf_timeline:
                    fetched_timeline = timeline.Timeline(
                        run_metadata.step_stats)
                    chrome_trace = fetched_timeline.generate_chrome_trace_format(
                    )
                    many_runs_timeline.update_timeline(chrome_trace)

                if gstep % args["kfac_inv_update_frequency"] == 0:
                    sess.run(optlist[1],
                             feed_dict=feed_dict,
                             options=options,
                             run_metadata=run_metadata)

                    if enable_tf_timeline:
                        fetched_timeline = timeline.Timeline(
                            run_metadata.step_stats)
                        chrome_trace = fetched_timeline.generate_chrome_trace_format(
                        )
                        many_runs_timeline.update_timeline(chrome_trace)

            #timing/loss accounting is only valid when a train step actually ran
            end_time = time.time()
            train_time += end_time - start_time

            #increment train loss and batch number
            train_loss += tmp_loss
            train_batches += 1

            #determine if we give a short update:
            if gstep % args['display_interval'] == 0:
                print(
                    time.time(), "REPORT rank", args["task_index"],
                    "global step %d., average training loss %g (%.3f sec/batch)"
                    % (gstep, train_loss / float(train_batches),
                       train_time / float(train_batches)))

        #check if epoch is done
        if trainset._epochs_completed > epochs_completed:
            epochs_completed = trainset._epochs_completed
            print(
                time.time(), "COMPLETED rank", args["task_index"],
                "epoch %d, average training loss %g (%.3f sec/batch)" %
                (epochs_completed, train_loss / float(train_batches),
                 train_time / float(train_batches)))

            #reset counters
            train_loss = 0.
            train_batches = 0
            train_time = 0

            #compute validation loss:
            #reset variables
            validation_loss = 0.
            validation_batches = 0

            #iterate over batches
            while True:
                #get next batch
                images, labels, normweights, weights, _ = validationset.next_batch(
                    args['validation_batch_size_per_node'])
                #set weights to 1:
                normweights[:] = 1.
                weights[:] = 1.

                if not disable_training:
                    #compute loss
                    if args['create_summary']:
                        summary, tmp_loss = sess.run(
                            [validation_summary, loss_fn],
                            feed_dict={
                                variables['images_']: images,
                                variables['labels_']: labels,
                                variables['weights_']: normweights,
                                variables['keep_prob_']: 1.0
                            })
                    else:
                        tmp_loss = sess.run(
                            [loss_fn],
                            feed_dict={
                                variables['images_']: images,
                                variables['labels_']: labels,
                                variables['weights_']: normweights,
                                variables['keep_prob_']: 1.0
                            })

                    #add loss
                    validation_loss += tmp_loss[0]
                    validation_batches += 1

                    #update accuracy
                    sess.run(accuracy_fn[1],
                             feed_dict={
                                 variables['images_']: images,
                                 variables['labels_']: labels,
                                 variables['weights_']: normweights,
                                 variables['keep_prob_']: 1.0
                             })

                    #update auc
                    sess.run(auc_fn[1],
                             feed_dict={
                                 variables['images_']: images,
                                 variables['labels_']: labels,
                                 variables['weights_']: normweights,
                                 variables['keep_prob_']: 1.0
                             })

                #check if full pass done
                if validationset._epochs_completed > 0:
                    validationset.reset()
                    break

            if validation_batches > 0:
                print(
                    time.time(),
                    "COMPLETED epoch %d, average validation loss %g" %
                    (epochs_completed,
                     validation_loss / float(validation_batches)))
            validation_accuracy = sess.run(accuracy_fn[0])
            print(
                time.time(), "COMPLETED epoch %d, average validation accu %g" %
                (epochs_completed, validation_accuracy))
            validation_auc = sess.run(auc_fn[0])
            print(
                time.time(), "COMPLETED epoch %d, average validation auc %g" %
                (epochs_completed, validation_auc))

        if enable_tf_timeline:
            many_runs_timeline.save('Timeliner_output.json')

        train_iteration_logger.end_timer()

    if enable_tf_timeline:
        many_runs_timeline.save('Timeliner_output.json')

    train_loop_logger.end_timer()
Example #16
def main(input_path, blocks, weights, image_dir, checkpoint_dir, trn_sz,
         learning_rate, loss_type, fs_type, opt_type, batch, batchnorm,
         num_epochs, dtype, chkpt, filter_sz, growth, disable_training,
         enable_tf_timeline):
    options = None
    run_metadata = None
    many_runs_timeline = None

    timeline_trace_fp = open("timeline_trace.pickle", "wb")

    options, run_metadata, many_runs_timeline, min_timeline_step, max_timeline_step = \
        init_timeline_configs(enable_tf_timeline, tf.RunOptions.FULL_TRACE, -1, -1)

    global_time_logger = logger(-1, "Global Total Time", -1, True)
    global_time_logger.start_timer()

    #init horovod

    initialization_timer_logger = logger(-1, "Initialize Horovod", -1, True)
    initialization_timer_logger.start_timer()

    nvtx.RangePush("init horovod", 1)
    comm_rank = 0
    comm_local_rank = 0
    comm_size = 1
    comm_local_size = 1
    if horovod:
        hvd.init()
        comm_rank = hvd.rank()
        comm_local_rank = hvd.local_rank()
        comm_size = hvd.size()
        #not all horovod versions have that implemented
        try:
            comm_local_size = hvd.local_size()
        except Exception:
            comm_local_size = 1
        if comm_rank == 0:
            print("Using distributed computation with Horovod: {} total ranks".
                  format(comm_size, comm_rank))
    nvtx.RangePop()  # init horovod

    initialization_timer_logger.set_rank(int(comm_rank))
    initialization_timer_logger.end_timer()

    global_time_logger.set_rank(int(comm_rank))

    #parameters
    channels = [0, 1, 2, 10]
    per_rank_output = False
    loss_print_interval = 1

    #session config

    initialization_timer_logger.start_timer(comm_rank, "Configure Session")

    sess_config = tf.ConfigProto(
        inter_op_parallelism_threads=6,  #1
        intra_op_parallelism_threads=1,  #6
        log_device_placement=False,
        allow_soft_placement=True)
    sess_config.gpu_options.visible_device_list = str(comm_local_rank)

    initialization_timer_logger.end_timer()

    #get data

    initialization_timer_logger.start_timer(comm_rank, "Get Data")

    training_graph = tf.Graph()
    if comm_rank == 0:
        print("Loading data...")
    trn_data, val_data, tst_data = load_data(input_path, trn_sz, comm_rank)
    if comm_rank == 0:
        print("Shape of trn_data is {}".format(trn_data.shape[0]))
        print("done.")

    initialization_timer_logger.end_timer()

    #print some stats
    if comm_rank == 0:
        print("Learning Rate: {}".format(learning_rate))
        print("Num workers: {}".format(comm_size))
        print("Local batch size: {}".format(batch))
        if dtype == tf.float32:
            print("Precision: {}".format("FP32"))
        else:
            print("Precision: {}".format("FP16"))
        print("Batch normalization: {}".format(batchnorm))
        print("Blocks: {}".format(blocks))
        print("Growth rate: {}".format(growth))
        print("Filter size: {}".format(filter_sz))
        print("Channels: {}".format(channels))
        print("Loss type: {}".format(loss_type))
        print("Loss weights: {}".format(weights))
        print("Optimizer type: {}".format(opt_type))
        print("Num training samples: {}".format(trn_data.shape[0]))
        print("Num validation samples: {}".format(val_data.shape[0]))

    io_training_time_logger = logger(comm_rank, "IO and Training", -1, True)
    io_training_time_logger.start_timer()

    with training_graph.as_default():
        nvtx.RangePush("TF Init", 3)
        #create readers
        trn_reader = h5_input_reader(input_path,
                                     channels,
                                     weights,
                                     dtype,
                                     normalization_file="stats.h5",
                                     update_on_read=False,
                                     comm_rank=comm_rank)
        val_reader = h5_input_reader(input_path,
                                     channels,
                                     weights,
                                     dtype,
                                     normalization_file="stats.h5",
                                     update_on_read=False,
                                     comm_rank=comm_rank)
        #create datasets
        if fs_type == "local":
            trn_dataset = create_dataset(trn_reader,
                                         trn_data,
                                         batch,
                                         num_epochs,
                                         comm_local_size,
                                         comm_local_rank,
                                         dtype,
                                         shuffle=True)
            val_dataset = create_dataset(val_reader,
                                         val_data,
                                         batch,
                                         1,
                                         comm_local_size,
                                         comm_local_rank,
                                         dtype,
                                         shuffle=False)
        else:
            trn_dataset = create_dataset(trn_reader,
                                         trn_data,
                                         batch,
                                         num_epochs,
                                         comm_size,
                                         comm_rank,
                                         dtype,
                                         shuffle=True)
            val_dataset = create_dataset(val_reader,
                                         val_data,
                                         batch,
                                         1,
                                         comm_size,
                                         comm_rank,
                                         dtype,
                                         shuffle=False)

        #create iterators
        handle = tf.placeholder(tf.string,
                                shape=[],
                                name="iterator-placeholder")
        iterator = tf.data.Iterator.from_string_handle(
            handle, (dtype, tf.int32, dtype),
            ((batch, len(channels), image_height, image_width),
             (batch, image_height, image_width),
             (batch, image_height, image_width)))
        next_elem = iterator.get_next()

        #create init handles
        #trn
        trn_iterator = trn_dataset.make_initializable_iterator()
        trn_handle_string = trn_iterator.string_handle()
        trn_init_op = iterator.make_initializer(trn_dataset)
        #val
        val_iterator = val_dataset.make_initializable_iterator()
        val_handle_string = val_iterator.string_handle()
        val_init_op = iterator.make_initializer(val_dataset)

        #set up model
        logit, prediction = create_tiramisu(3,
                                            next_elem[0],
                                            image_height,
                                            image_width,
                                            len(channels),
                                            loss_weights=weights,
                                            nb_layers_per_block=blocks,
                                            p=0.2,
                                            wd=1e-4,
                                            dtype=dtype,
                                            batchnorm=batchnorm,
                                            growth_rate=growth,
                                            filter_sz=filter_sz,
                                            comm_rank=comm_rank)

        #set up loss
        labels_one_hot = tf.cast(tf.contrib.layers.one_hot_encoding(
            next_elem[1], 3),
                                 dtype=dtype)
        loss = None
        if loss_type == "weighted":
            loss = tf.losses.softmax_cross_entropy(
                onehot_labels=labels_one_hot,
                logits=logit,
                weights=next_elem[2])
        elif loss_type == "focal":
            loss = focal_loss(onehot_labels=labels_one_hot,
                              logits=logit,
                              alpha=1.,
                              gamma=2.)
        else:
            raise ValueError("Error, loss type {} not supported.",
                             format(loss_type))
        if horovod:
            loss_avg = hvd.allreduce(tf.cast(loss, tf.float32))
        else:
            loss_avg = tf.identity(loss)

        #set up global step
        global_step = tf.train.get_or_create_global_step()

        #set up optimizer
        if opt_type.startswith("LARC"):
            if comm_rank == 0:
                print("Enabling LARC")
            train_op = get_larc_optimizer(opt_type.split("-")[1],
                                          loss,
                                          global_step,
                                          learning_rate,
                                          LARC_mode="clip",
                                          LARC_eta=0.002,
                                          LARC_epsilon=1. / 16000.)
        else:
            train_op = get_optimizer(opt_type, loss, global_step,
                                     learning_rate)
        #set up streaming metrics
        iou_op, iou_update_op = tf.metrics.mean_iou(labels=next_elem[1],
                                                    predictions=tf.argmax(
                                                        prediction, axis=3),
                                                    num_classes=3,
                                                    weights=None,
                                                    metrics_collections=None,
                                                    updates_collections=None,
                                                    name="iou_score")
        iou_reset_op = tf.variables_initializer([
            i for i in tf.local_variables() if i.name.startswith('iou_score/')
        ])

        if horovod:
            iou_avg = hvd.allreduce(iou_op)
        else:
            iou_avg = tf.identity(iou_op)

        #compute epochs and stuff:
        if fs_type == "local":
            num_samples = trn_data.shape[0] // comm_local_size
        else:
            num_samples = trn_data.shape[0] // comm_size
        #num_steps_per_epoch = num_samples // batch
        num_steps_per_epoch = 10
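        # NOTE: the data-derived step count above is commented out and replaced
        # by a hard-coded value, so each "epoch" below is a fixed 10-step span.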
        num_steps = num_epochs * num_steps_per_epoch
        if per_rank_output:
            print("Rank {} does {} steps per epoch".format(
                comm_rank, num_steps_per_epoch))

        #hooks
        #these hooks are essential; pad the StopAtStepHook with one extra step at the end
        hooks = [tf.train.StopAtStepHook(last_step=num_steps + 1)]
        #bcast init for bcasting the model after start
        init_bcast = hvd.broadcast_global_variables(0)
        #initializers:
        init_op = tf.global_variables_initializer()
        init_local_op = tf.local_variables_initializer()

        #checkpointing
        if comm_rank == 0:
            checkpoint_save_freq = num_steps_per_epoch * 2
            checkpoint_saver = tf.train.Saver(max_to_keep=1000)
            listener = checkpoint_listener(comm_rank, True)
            hooks.append(
                tf.train.CheckpointSaverHook(checkpoint_dir=checkpoint_dir,
                                             save_steps=checkpoint_save_freq,
                                             saver=checkpoint_saver,
                                             listeners=[listener]))
            #create image dir if not exists
            if not os.path.isdir(image_dir):
                os.makedirs(image_dir)

        ##DEBUG
        ##summary
        #if comm_rank == 0:
        #    print("write graph for debugging")
        #    tf.summary.scalar("loss",loss)
        #    summary_op = tf.summary.merge_all()
        #    #hooks.append(tf.train.SummarySaverHook(save_steps=num_steps_per_epoch, summary_writer=summary_writer, summary_op=summary_op))
        #    with tf.Session(config=sess_config) as sess:
        #        sess.run([init_op, init_local_op])
        #        #create iterator handles
        #        trn_handle = sess.run(trn_handle_string)
        #        #init iterators
        #        sess.run(trn_init_op, feed_dict={handle: trn_handle, datafiles: trn_data, labelfiles: trn_labels})
        #        #summary:
        #        sess.run(summary_op, feed_dict={handle: trn_handle})
        #        #summary file writer
        #        summary_writer = tf.summary.FileWriter('./logs', sess.graph)
        ##DEBUG

        #start session
        with tf.train.MonitoredTrainingSession(config=sess_config,
                                               hooks=hooks) as sess:
            #initialize
            sess.run([init_op, init_local_op])

            #restore from checkpoint:
            if comm_rank == 0:
                load_model(sess, checkpoint_saver, checkpoint_dir, comm_rank)
            #broadcast loaded model variables
            sess.run(init_bcast)

            #create iterator handles
            trn_handle, val_handle = sess.run(
                [trn_handle_string, val_handle_string],
                options=options,
                run_metadata=run_metadata)

            update_timeline_in_range(enable_tf_timeline, run_metadata,
                                     many_runs_timeline,
                                     "create_iterator_handle.json")

            #init iterators
            sess.run(trn_init_op,
                     feed_dict={handle: trn_handle},
                     options=options,
                     run_metadata=run_metadata)

            update_timeline_in_range(enable_tf_timeline, run_metadata,
                                     many_runs_timeline,
                                     "init_train_iterator_handle.json")

            sess.run(val_init_op,
                     feed_dict={handle: val_handle},
                     options=options,
                     run_metadata=run_metadata)

            update_timeline_in_range(enable_tf_timeline, run_metadata,
                                     many_runs_timeline,
                                     "init_val_iterator_handle.json")

            nvtx.RangePop()  # TF Init

            # do the training
            epoch = 1
            step = 1
            train_loss = 0.
            nvtx.RangePush("Training Loop", 4)
            nvtx.RangePush("Epoch", epoch)
            start_time = time.time()

            training_loop_timer_logger = logger(comm_rank, "Training Loop", -1,
                                                True)
            training_loop_timer_logger.start_timer()

            train_steps = 0
            while not (sess.should_stop()):
                #training loop
                try:
                    training_iteration_time_logger = logger(
                        comm_rank, "Training Iteration", epoch, True)
                    training_iteration_time_logger.start_timer()

                    nvtx.RangePush("Step", step)

                    if disable_training:
                        train_steps = sess.run([global_step],
                                               feed_dict={handle: trn_handle},
                                               options=options,
                                               run_metadata=run_metadata)

                        update_timeline_in_range(
                            enable_tf_timeline, run_metadata,
                            many_runs_timeline, train_steps[0],
                            "train_" + str(global_step) + ".json",
                            min_timeline_step, max_timeline_step)

                        train_steps_in_epoch = train_steps[
                            0] % num_steps_per_epoch

                        # do the validation phase
                        if train_steps_in_epoch == 0:
                            eval_steps = 0
                            while True:
                                try:
                                    sess.run([next_elem[1]],
                                             feed_dict={handle: val_handle},
                                             options=options,
                                             run_metadata=run_metadata)

                                    update_timeline_in_range(
                                        enable_tf_timeline, run_metadata,
                                        many_runs_timeline,
                                        "val_dict" + str(eval_steps) + ".json")

                                    eval_steps += 1
                                except tf.errors.OutOfRangeError:
                                    sess.run(val_init_op,
                                             feed_dict={handle: val_handle},
                                             options=options,
                                             run_metadata=run_metadata)

                                    update_timeline_in_range(
                                        enable_tf_timeline, run_metadata,
                                        many_runs_timeline, "val_dict_out_" +
                                        str(eval_steps) + ".json")

                                    break

                    else:
                        # construct feed dict
                        _, train_steps, tmp_loss = sess.run(
                            [
                                train_op, global_step,
                                (loss if per_rank_output else loss_avg)
                            ],
                            feed_dict={handle: trn_handle},
                            options=options,
                            run_metadata=run_metadata)

                        update_timeline_in_range(
                            enable_tf_timeline, run_metadata,
                            many_runs_timeline, train_steps,
                            "val_" + str(global_step) + ".json",
                            min_timeline_step, max_timeline_step)

                        if comm_rank == 0:
                            step_trace_fp = open(
                                "train_step_trace_" + str(global_step) +
                                ".pickle", "wb")
                            pickle.dump(run_metadata, step_trace_fp)

                        train_steps_in_epoch = train_steps % num_steps_per_epoch
                        train_loss += tmp_loss
                        nvtx.RangePop()  # Step
                        step += 1

                        #print step report
                        eff_steps = train_steps_in_epoch if (
                            train_steps_in_epoch > 0) else num_steps_per_epoch
                        if (train_steps % loss_print_interval) == 0:
                            if per_rank_output:
                                print(
                                    "REPORT: rank {}, training loss for step {} (of {}) is {}, time {}"
                                    .format(comm_rank, train_steps, num_steps,
                                            train_loss / eff_steps,
                                            time.time() - start_time))
                            else:
                                if comm_rank == 0:
                                    print(
                                        "REPORT: training loss for step {} (of {}) is {}, time {}"
                                        .format(train_steps, num_steps,
                                                train_loss / eff_steps,
                                                time.time() - start_time))

                        #do the validation phase
                        if train_steps_in_epoch == 0:
                            end_time = time.time()
                            #print epoch report
                            train_loss /= num_steps_per_epoch
                            if per_rank_output:
                                print(
                                    "COMPLETED: rank {}, training loss for epoch {} (of {}) is {}, time {} s"
                                    .format(comm_rank, epoch, num_epochs,
                                            train_loss,
                                            time.time() - start_time))
                            else:
                                if comm_rank == 0:
                                    print(
                                        "COMPLETED: training loss for epoch {} (of {}) is {}, time {} s"
                                        .format(epoch, num_epochs, train_loss,
                                                time.time() - start_time))

                            #evaluation loop
                            eval_loss = 0.
                            eval_steps = 0
                            nvtx.RangePush("Eval Loop", 7)
                            timeline_help_count = 0
                            while True:
                                try:
                                    #construct feed dict
                                    _, tmp_loss, val_model_predictions, val_model_labels = sess.run(
                                        [
                                            iou_update_op,
                                            (loss
                                             if per_rank_output else loss_avg),
                                            prediction, next_elem[1]
                                        ],
                                        feed_dict={handle: val_handle},
                                        options=options,
                                        run_metadata=run_metadata)

                                    update_timeline_in_range(
                                        enable_tf_timeline, run_metadata,
                                        many_runs_timeline,
                                        timeline_help_count,
                                        "train_" + str(global_step) + ".json",
                                        min_timeline_step, max_timeline_step)

                                    if comm_rank == 0:
                                        step_trace_fp = open(
                                            "validation_step_trace_" +
                                            str(global_step) + ".pickle", "wb")
                                        pickle.dump(run_metadata,
                                                    step_trace_fp)

                                    timeline_help_count += 1

                                    #print some images
                                    if comm_rank == 0:
                                        if have_imsave:
                                            imsave(
                                                image_dir +
                                                '/test_pred_epoch' +
                                                str(epoch) + '_estep' +
                                                str(eval_steps) + '_rank' +
                                                str(comm_rank) + '.png',
                                                np.argmax(
                                                    val_model_predictions[0,
                                                                          ...],
                                                    axis=2) * 100)
                                            imsave(
                                                image_dir +
                                                '/test_label_epoch' +
                                                str(epoch) + '_estep' +
                                                str(eval_steps) + '_rank' +
                                                str(comm_rank) + '.png',
                                                val_model_labels[0, ...] * 100)
                                            imsave(
                                                image_dir +
                                                '/test_combined_epoch' +
                                                str(epoch) + '_estep' +
                                                str(eval_steps) + '_rank' +
                                                str(comm_rank) + '.png',
                                                colormap[
                                                    val_model_labels[0, ...],
                                                    np.argmax(
                                                        val_model_predictions[
                                                            0, ...],
                                                        axis=2)])
                                        else:
                                            np.save(
                                                image_dir +
                                                '/test_pred_epoch' +
                                                str(epoch) + '_estep' +
                                                str(eval_steps) + '_rank' +
                                                str(comm_rank) + '.npy',
                                                np.argmax(
                                                    val_model_predictions[0,
                                                                          ...],
                                                    axis=2) * 100)
                                            np.save(
                                                image_dir +
                                                '/test_label_epoch' +
                                                str(epoch) + '_estep' +
                                                str(eval_steps) + '_rank' +
                                                str(comm_rank) + '.npy',
                                                val_model_labels[0, ...] * 100)

                                    eval_loss += tmp_loss
                                    eval_steps += 1
                                except tf.errors.OutOfRangeError:
                                    eval_steps = np.max([eval_steps, 1])
                                    eval_loss /= eval_steps
                                    if per_rank_output:
                                        print(
                                            "COMPLETED: rank {}, evaluation loss for epoch {} (of {}) is {}"
                                            .format(comm_rank, epoch,
                                                    num_epochs, eval_loss))
                                    else:
                                        if comm_rank == 0:
                                            print(
                                                "COMPLETED: evaluation loss for epoch {} (of {}) is {}"
                                                .format(
                                                    epoch, num_epochs,
                                                    eval_loss))
                                    if per_rank_output:
                                        iou_score = sess.run(iou_op)

                                        print(
                                            "COMPLETED: rank {}, evaluation IoU for epoch {} (of {}) is {}"
                                            .format(comm_rank, epoch,
                                                    num_epochs, iou_score))
                                    else:
                                        iou_score = sess.run(iou_avg)

                                        if comm_rank == 0:
                                            print(
                                                "COMPLETED: evaluation IoU for epoch {} (of {}) is {}"
                                                .format(
                                                    epoch, num_epochs,
                                                    iou_score))
                                    sess.run(iou_reset_op)

                                    sess.run(val_init_op,
                                             feed_dict={handle: val_handle},
                                             options=options,
                                             run_metadata=run_metadata)

                                    update_timeline_in_range(
                                        enable_tf_timeline, run_metadata,
                                        many_runs_timeline,
                                        "train_" + str(global_step) + ".json")

                                    if comm_rank == 0:
                                        step_trace_fp = open(
                                            "validation_step_trace_out.pickle",
                                            "wb")
                                        pickle.dump(run_metadata,
                                                    step_trace_fp)

                                    break
                            nvtx.RangePop()  # Eval Loop

                    if enable_tf_timeline:
                        many_runs_timeline.save('Timeliner_output.json')

                    # reset counters
                    epoch += 1
                    train_loss = 0.
                    step = 0

                    nvtx.RangePop()  # Epoch
                    nvtx.RangePush("Epoch", epoch)

                    training_iteration_time_logger.end_timer()

                except tf.errors.OutOfRangeError:
                    break

            nvtx.RangePop()  # Epoch
            nvtx.RangePop()  # Training Loop

            training_loop_timer_logger.end_timer()

    if enable_tf_timeline:
        many_runs_timeline.save('Timeliner_output.json')

    io_training_time_logger.end_timer()
    global_time_logger.end_timer()
示例#17
0
def create_tiramisu(nb_classes,
                    img_input,
                    height,
                    width,
                    nc,
                    loss_weights,
                    nb_dense_block=6,
                    growth_rate=16,
                    nb_filter=48,
                    nb_layers_per_block=5,
                    p=None,
                    wd=0.,
                    training=True,
                    batchnorm=False,
                    dtype=tf.float16,
                    filter_sz=3,
                    comm_rank=-1):
    create_tiramisu_timer_logger = logger(comm_rank, "Create Tiramisu", -1,
                                          True)
    create_tiramisu_timer_logger.start_timer()

    if type(nb_layers_per_block) is list or type(nb_layers_per_block) is tuple:
        nb_layers = list(nb_layers_per_block)
    else:
        nb_layers = [nb_layers_per_block] * nb_dense_block

    with tf.variable_scope("tiramisu",
                           custom_getter=float32_variable_storage_getter):

        with tf.variable_scope("conv_input") as scope:
            x = conv(img_input, nb_filter, sz=filter_sz, wd=wd)
            if batchnorm:
                x = tf.layers.batch_normalization(x, axis=1, training=training)
            x = tf.nn.relu(x)
            if p: x = tf.layers.dropout(x, rate=p, training=training)

        with tf.name_scope("down_path") as scope:
            skips, added = down_path(x,
                                     nb_layers,
                                     growth_rate,
                                     p,
                                     wd,
                                     training=training,
                                     bn=batchnorm,
                                     filter_sz=filter_sz)

        with tf.name_scope("up_path") as scope:
            x = up_path(added,
                        reverse(skips[:-1]),
                        reverse(nb_layers[:-1]),
                        growth_rate,
                        p,
                        wd,
                        training=training,
                        bn=batchnorm,
                        filter_sz=filter_sz)

        with tf.name_scope("conv_output") as scope:
            x = conv(x, nb_classes, sz=1, wd=wd)
            if p: x = tf.layers.dropout(x, rate=p, training=training)
            _, f, r, c = x.get_shape().as_list()
        #x = tf.reshape(x,[-1,nb_classes,image_height,image_width]) #nb_classes was last before
        x = tf.transpose(
            x, [0, 2, 3, 1]
        )  #necessary because sparse softmax cross entropy does softmax over last axis

    create_tiramisu_timer_logger.end_timer()
    return x, tf.nn.softmax(x)
    def load_next_file(self):
        load_file_time_logger = logger(self._taskid, "Load File",
                                       self._epochs_completed)
        load_file_time_logger.start_timer()

        file_access_time_logger = logger(self._taskid, "HDF5 File Read",
                                         self._epochs_completed)
        file_access_time_logger.start_timer()

        #only load a new file if there are more than one file in the list:
        if self._num_files > 1 or not self._initialized:
            try:
                with h5.File(self._filelist[self._file_index], 'r') as f:
                    #determine total array size:
                    numentries = f['data'].shape[0]

                    if self._split_file:
                        blocksize = int(
                            np.ceil(numentries / float(self._num_tasks)))
                        start = self._taskid * blocksize
                        end = (self._taskid + 1) * blocksize
                    else:
                        start = 0
                        end = numentries

                    #load the chunk which is needed
                    self._images = f['data'][start:end]
                    self._labels = f['label'][start:end]
                    self._normweights = f['normweight'][start:end]
                    self._weights = f['weight'][start:end]
                    self._psr = f['psr'][start:end]
                    f.close()
            except EnvironmentError:
                raise EnvironmentError("Cannot open file " +
                                       self._filelist[self._file_index])

            file_access_time_logger.end_timer()

            #sanity checks
            assert self._images.shape[0] == self._labels.shape[0], (
                'images.shape: %s labels.shape: %s' %
                (self._images.shape, self_.labels.shape))
            assert self._labels.shape[0] == self._normweights.shape[0], (
                'labels.shape: %s normweights.shape: %s' %
                (self._labels.shape, self._normweights.shape))
            assert self._labels.shape[0] == self._psr.shape[0], (
                'labels.shape: %s psr.shape: %s' %
                (self._labels.shape, self._psr.shape))
            self._initialized = True

            #set number of samples
            self._num_examples = self._labels.shape[0]

            #reshape labels and weights
            self._labels = np.expand_dims(self._labels,
                                          axis=1).astype(np.int32, copy=False)
            self._normweights = np.expand_dims(self._normweights, axis=1)
            self._weights = np.expand_dims(self._weights, axis=1)
            self._psr = np.expand_dims(self._psr, axis=1)

            #transpose images if data format is NHWC
            if self._data_format == "NHWC":
                #transform for NCHW to NHWC
                self._images = np.transpose(self._images, (0, 2, 3, 1))

        #create permutation
        perm = np.arange(self._num_examples)
        np.random.shuffle(perm)
        #shuffle
        self._images = self._images[perm]
        self._labels = self._labels[perm]
        self._normweights = self._normweights[perm]
        self._weights = self._weights[perm]
        self._psr = self._psr[perm]

        load_file_time_logger.end_timer()
            print(
                time.time(), "COMPLETED epoch %d, average validation auc %g" %
                (epochs_completed, validation_auc))

        if enable_tf_timeline:
            many_runs_timeline.save('Timeliner_output.json')

        train_iteration_logger.end_timer()

    if enable_tf_timeline:
        many_runs_timeline.save('Timeliner_output.json')

    train_loop_logger.end_timer()


global_time_logger = logger(-1, "Global Total Time")
global_time_logger.start_timer()

# Parse Parameters

args = parse_arguments()

# Multi-Node Stuff

initialization_time_logger = logger(-1, "Server Initialization")
initialization_time_logger.start_timer()

#decide who will be worker and who will be parameters server
if args['num_tasks'] > 1:
    args['cluster'], args['server'], args['task_index'], args[
        'num_workers'], args['node_type'] = sc.setup_slurm_cluster(