def load_data(input_path, max_files, comm_rank=-1):
    load_data_timer_logger = logger(comm_rank, "Load Data", -1, True)
    load_data_timer_logger.start_timer()

    #look for labels and data files
    files = sorted([x for x in os.listdir(input_path) if x.startswith("data")])
    #we will choose to load only the first p files
    files = files[:max_files]
    #convert to numpy
    files = np.asarray(files)

    #PERMUTATION OF DATA
    np.random.seed(12345)
    shuffle_indices = np.random.permutation(len(files))
    np.save("./shuffle_indices.npy", shuffle_indices)
    files = files[shuffle_indices]

    #Create train/validation/test split
    size = len(files)
    trn_data = files[:int(0.8 * size)]
    tst_data = files[int(0.8 * size):int(0.9 * size)]
    val_data = files[int(0.9 * size):]

    load_data_timer_logger.end_timer()
    return trn_data, val_data, tst_data
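
#Usage sketch for load_data (the path below is hypothetical). Because the RNG
#is seeded with a fixed value, every rank computes the same permutation and
#therefore the same deterministic 80/10/10 train/test/validation split:
#
#  trn, val, tst = load_data("/path/to/hdf5/files", max_files=100, comm_rank=0)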
def create_dataset(h5ir, datafilelist, batchsize, num_epochs, comm_size,
                   comm_rank, dtype, shuffle=False):
    create_dataset_timer_logger = logger(comm_rank, "Create Dataset", -1, True)
    create_dataset_timer_logger.start_timer()

    if comm_size > 1:
        #use an equal number of files per shard, leaving out any leftovers
        per_shard = len(datafilelist) // comm_size
        sublist = datafilelist[0:per_shard * comm_size]
        dataset = tf.data.Dataset.from_tensor_slices(sublist)
        dataset = dataset.shard(comm_size, comm_rank)
    else:
        dataset = tf.data.Dataset.from_tensor_slices(datafilelist)

    if shuffle:
        dataset = dataset.shuffle(buffer_size=100)
    dataset = dataset.map(map_func=lambda dataname: tuple(
        tf.py_func(h5ir.read, [dataname], [dtype, tf.int32, dtype])),
                          num_parallel_calls=4)
    dataset = dataset.prefetch(16)
    #make sure all batches are equal in size
    dataset = dataset.apply(
        tf.contrib.data.batch_and_drop_remainder(batchsize))
    dataset = dataset.repeat(num_epochs)

    create_dataset_timer_logger.end_timer()
    return dataset
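
#Sharding sketch for create_dataset (hypothetical sizes): with 10 files and
#comm_size=4, per_shard = 10 // 4 = 2, so only the first 8 files are kept and
#dataset.shard(4, comm_rank) hands each rank 2 of them; the 2 leftover files
#are dropped so every rank runs the same number of batches, which matters for
#collectives that expect all ranks to step together.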
def global_shuffle(self):
    global_shuffle_time_logger = logger(self._taskid, "Global Shuffle",
                                        self._epochs_completed)
    global_shuffle_time_logger.start_timer()

    self._num_files = len(self._full_filelist)
    shuffled_file_id_list = self._shuffle_rng.permutation(self._num_files)
    start = 0
    end = self._num_files

    shuffled_filelist = []
    for file_id in shuffled_file_id_list:
        shuffled_filelist.append(self._full_filelist[file_id])
    self._full_filelist = shuffled_filelist

    if self._split_filelist:
        self._num_files = int(
            np.floor(len(self._full_filelist) / float(self._num_tasks)))
        start = self._taskid * self._num_files
        end = start + self._num_files
        assert self._num_files > 0, ('filelist is empty')

    self._filelist = self._full_filelist[start:end]
    global_shuffle_time_logger.end_timer()
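
#Design note (assumption): self._shuffle_rng must be seeded identically on
#all tasks; otherwise each task would compute a different permutation of
#_full_filelist and the per-task slices [start:end] would no longer be
#disjoint across tasks when _split_filelist is set.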
def _h5_input_subprocess_reader(path, channels, weights, minvals, maxvals,
                                update_on_read, dtype, comm_rank=-1):
    #begin_time = time.time()
    #need to send the comm_rank here
    image_reader_timer_logger = logger(comm_rank, "Time to Read Single Image",
                                       -1, True)
    image_reader_timer_logger.start_timer()

    with h5.File(path, "r", driver="core", backing_store=False,
                 libver="latest") as f:
        #get min and max values and update stored values
        if update_on_read:
            minvals = np.minimum(minvals, f['climate']['stats'][channels, 0])
            maxvals = np.maximum(maxvals, f['climate']['stats'][channels, 1])
        #get data
        if 'channels' in f['climate']:
            #some channels have been dropped from the file, so map to the
            #actual locations in the file array
            channel_list = list(f['climate']['channels'])
            channels = [channel_list.index(c) for c in channels]
        data = f['climate']['data'][channels, :, :]
        #cast data if needed
        if data.dtype != dtype:
            #data = f['climate']['data'][channels,:,:].astype(dtype)
            data = data.astype(dtype)
        #do min/max normalization
        for c in range(len(channels)):
            data[c, :, :] = (data[c, :, :] - minvals[c]) / (maxvals[c] - minvals[c])
        #get label
        label = f['climate']['labels'][...]
        if label.dtype != np.int32:
            label = label.astype(np.int32)

    #get weights - choose per-channel based on the labels
    weights = weights[label]

    #time
    #end_time = time.time()
    #print "Time to read image %.3f s" % (end_time-begin_time)
    image_reader_timer_logger.end_timer()
    return data, label, weights, minvals, maxvals
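
#Normalization note: each channel c is mapped to roughly [0, 1] via
#(x - minvals[c]) / (maxvals[c] - minvals[c]). With update_on_read the
#running extrema are widened from the per-file 'stats' dataset and handed
#back to the caller, which lets the owning reader track global min/max
#across all files it has seen (see read() below).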
def next_batch(self, batch_size):
    """Return the next `batch_size` examples from this data set."""
    next_batch_time_logger = logger(self._taskid, "Call to Next Batch",
                                    self._epochs_completed)
    next_batch_time_logger.start_timer()

    start = self._data_index
    self._data_index += batch_size
    end = int(np.min([self._num_examples, self._data_index]))

    #take what is there
    images = self._images[start:end]
    labels = self._labels[start:end]
    normweights = self._normweights[start:end]
    weights = self._weights[start:end]
    psr = self._psr[start:end]

    if self._data_index > self._num_examples:
        #number of examples still missing from this batch:
        remaining = self._data_index - self._num_examples
        #first, reset data_index and increase file index:
        self._data_index = 0
        self._file_index += 1
        #check if we are at the end of the file list
        if self._file_index >= self._num_files:
            #epoch is finished
            self._epochs_completed += 1
            #reset file index and shuffle list
            self._file_index = 0
            #shuffle the full filelist and redistribute the files to the nodes
            if self._global_shuffle:
                self.global_shuffle()
            #local shuffle again as done before
            np.random.shuffle(self._filelist)
        #load the next file
        self.load_next_file()
        #assert batch_size <= self._num_examples
        #call recursively to fetch the remaining examples
        tmpimages, tmplabels, tmpnormweights, tmpweights, tmppsr = \
            self.next_batch(remaining)
        #join
        images = np.concatenate([images, tmpimages], axis=0)
        labels = np.concatenate([labels, tmplabels], axis=0)
        normweights = np.concatenate([normweights, tmpnormweights], axis=0)
        weights = np.concatenate([weights, tmpweights], axis=0)
        psr = np.concatenate([psr, tmppsr], axis=0)

    next_batch_time_logger.end_timer()
    return images, labels, normweights, weights, psr
def read(self, datafile):
    read_image_timer_logger = logger(
        self.comm_rank, "Parallel Read Images with 4 Threads", -1, True)
    read_image_timer_logger.start_timer()

    path = self.path + '/' + datafile
    #begin_time = time.time()
    #nvtx.RangePush('h5_input', 8)
    data, label, weights, new_minvals, new_maxvals = self.pool.apply(
        _h5_input_subprocess_reader,
        (path, self.channels, self.weights, self.minvals, self.maxvals,
         self.update_on_read, self.dtype, self.comm_rank))
    if self.update_on_read:
        self.minvals = np.minimum(self.minvals, new_minvals)
        self.maxvals = np.maximum(self.maxvals, new_maxvals)
    #nvtx.RangePop()
    #end_time = time.time()
    #print "Time to read %s = %.3f s" % (path, end_time-begin_time)

    read_image_timer_logger.end_timer()
    return data, label, weights
def next_batch(self, batch_size):
    dummy_set_next_batch_time_logger = logger(-1, "Dummy DataSet Next Batch")
    dummy_set_next_batch_time_logger.start_timer()

    data = np.reshape(self._random.rand(self._datasize * batch_size),
                      [batch_size] + self._shape)
    labels = np.expand_dims(self._random.random_integers(0, 1, batch_size), 1)
    normweights = np.expand_dims(self._random.rand(batch_size), 1)
    weights = normweights
    psr = labels

    #increase data counter and check if epoch finished
    self._data_index += batch_size
    if self._data_index >= self._samples_per_epoch:
        self._data_index = 0
        self._epochs_completed += 1

    dummy_set_next_batch_time_logger.end_timer()
    return data, labels, normweights, weights, psr
def load_model(sess, saver, checkpoint_dir, comm_rank=-1):
    load_model_timer_logger = logger(comm_rank, "Load Model", -1, True)
    load_model_timer_logger.start_timer()

    print("Looking for model in {}".format(checkpoint_dir))
    #get list of checkpoints
    checkpoints = [
        x.replace(".index", "") for x in os.listdir(checkpoint_dir)
        if x.startswith("model.ckpt") and x.endswith(".index")
    ]
    checkpoints = sorted([(int(x.split("-")[1]), x) for x in checkpoints],
                         key=lambda tup: tup[0])
    latest_ckpt = os.path.join(checkpoint_dir, checkpoints[-1][1])
    print("Restoring model {}".format(latest_ckpt))
    try:
        saver.restore(sess, latest_ckpt)
        print("Model restoration successful.")
    except Exception:
        print("Loading model failed, starting fresh.")

    load_model_timer_logger.end_timer()
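
#Usage sketch for load_model (hypothetical directory layout): the directory
#is expected to contain TF checkpoint files such as "model.ckpt-1200.index";
#the integer after the dash is parsed as the global step and the highest one
#is restored. Note that an empty directory would raise an IndexError at
#checkpoints[-1] before the try/except is ever reached.
#
#  load_model(sess, checkpoint_saver, "/path/to/checkpoints", comm_rank=0)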
def sequential_read(self, datafile):
    read_image_timer_logger = logger(self.comm_rank, "Sequential Read Images",
                                     -1, True)
    read_image_timer_logger.start_timer()

    #data
    #begin_time = time.time()
    with h5.File(self.path + '/' + datafile, "r", driver="core",
                 backing_store=False, libver="latest") as f:
        #get min and max values and update stored values
        if self.update_on_read:
            self.minvals = np.minimum(self.minvals,
                                      f['climate']['stats'][self.channels, 0])
            self.maxvals = np.maximum(self.maxvals,
                                      f['climate']['stats'][self.channels, 1])
        #get data
        data = f['climate']['data'][self.channels, :, :].astype(np.float32)
        #do min/max normalization
        for c in range(len(self.channels)):
            data[c, :, :] = (data[c, :, :] - self.minvals[c]) / (
                self.maxvals[c] - self.minvals[c])
        #get label
        label = f['climate']['labels'][...].astype(np.int32)

    #get weights
    weights = np.zeros(label.shape, dtype=np.float32)
    for idx, w in enumerate(self.weights):
        weights[np.where(label == idx)] = w

    #time
    #end_time = time.time()
    #print "Time to read image %.3f s" % (end_time-begin_time)
    read_image_timer_logger.end_timer()
    return data, label, weights
def build_functions(args, variables, network):
    build_functions_time_logger = logger(int(args['task_index']),
                                         "Build Functions")
    build_functions_time_logger.start_timer()

    with args['graph'].as_default():
        #additional variables
        variables['labels_'] = tf.placeholder(
            tf.int32, shape=[args['train_batch_size_per_node'], 1])
        variables['weights_'] = tf.placeholder(
            args["precision"], shape=[args['train_batch_size_per_node'], 1])

        #loss function
        prediction = network[-1]
        tf.add_to_collection('prediction_op', prediction)

        #compute loss, important: use unscaled version!
        loss = tf.losses.sparse_softmax_cross_entropy(
            variables['labels_'], network[-2], weights=variables['weights_'])

        #compute accuracy
        accuracy = tf.metrics.accuracy(variables['labels_'],
                                       tf.round(prediction[:, 1]),
                                       weights=variables['weights_'],
                                       name='accuracy')

        #compute AUC
        auc = tf.metrics.auc(variables['labels_'],
                             prediction[:, 1],
                             weights=variables['weights_'],
                             num_thresholds=5000,
                             curve='ROC',
                             name='AUC')

    build_functions_time_logger.end_timer()

    #return functions
    return variables, prediction, loss, accuracy, auc
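
#Design note: the loss is fed network[-2] (the pre-softmax logits) rather
#than prediction (network[-1], the softmax output), because
#tf.losses.sparse_softmax_cross_entropy applies softmax internally; feeding
#already-normalized probabilities would apply it twice.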
def build_cnn_model(args):
    build_cnn_model_time_logger = logger(int(args['task_index']),
                                         "Build CNN Model")
    build_cnn_model_time_logger.start_timer()

    #datatype
    dtype = args["precision"]

    #find out which device to use:
    device = '/cpu:0'
    if args['arch'] == 'gpu':
        device = '/gpu:0'

    #define empty variables dict
    variables = {}

    #rotate input shape depending on data format
    data_format = args['conv_params']['data_format']
    input_shape = args['input_shape']

    #create graph handle
    args['graph'] = tf.Graph()

    #KFAC stuff
    if args["optimizer"] == "KFAC":
        args["opt_args"]["layer_collection"] = \
            tf.contrib.kfac.layer_collection.LayerCollection()

    with args['graph'].as_default():
        #create placeholders
        variables['images_'] = tf.placeholder(
            dtype, shape=[args['train_batch_size_per_node']] + input_shape)
        variables['keep_prob_'] = tf.placeholder(dtype)

        #empty network:
        network = []

        #input layer
        network.append(
            tf.reshape(variables['images_'], [-1] + input_shape,
                       name='input'))

        #get all the conv-args stuff:
        activation = args['conv_params']['activation']
        initializer = args['conv_params']['initializer']
        ksize = args['conv_params']['filter_size']
        num_filters = args['conv_params']['num_filters']
        padding = str(args['conv_params']['padding'])

        #conv layers:
        prev_num_filters = args['input_shape'][0]
        if data_format == "NHWC":
            prev_num_filters = args['input_shape'][2]

        for layerid in range(1, args['num_layers'] + 1):
            #create weight-variable
            #with tf.device(device):
            variables['conv' + str(layerid) + '_w'] = tf.Variable(
                initializer([ksize, ksize, prev_num_filters, num_filters],
                            dtype=dtype),
                name='conv' + str(layerid) + '_w',
                dtype=dtype)
            prev_num_filters = num_filters

            #conv unit
            network.append(
                tf.nn.conv2d(network[-1],
                             filter=variables['conv' + str(layerid) + '_w'],
                             strides=[1, 1, 1, 1],
                             padding=padding,
                             data_format=data_format,
                             name='conv' + str(layerid)))

            #batchnorm if desired
            outshape = network[-1].shape[1:]
            if args['batch_norm']:
                #add batchnorm
                #with tf.device(device):
                #mu
                variables['bn' + str(layerid) + '_m'] = tf.Variable(
                    tf.zeros(outshape, dtype=dtype),
                    name='bn' + str(layerid) + '_m',
                    dtype=dtype)
                #sigma
                variables['bn' + str(layerid) + '_s'] = tf.Variable(
                    tf.ones(outshape, dtype=dtype),
                    name='bn' + str(layerid) + '_s',
                    dtype=dtype)
                #gamma
                variables['bn' + str(layerid) + '_g'] = tf.Variable(
                    tf.ones(outshape, dtype=dtype),
                    name='bn' + str(layerid) + '_g',
                    dtype=dtype)
                #beta
                variables['bn' + str(layerid) + '_b'] = tf.Variable(
                    tf.zeros(outshape, dtype=dtype),
                    name='bn' + str(layerid) + '_b',
                    dtype=dtype)
                #add batch norm layer
                network.append(
                    tf.nn.batch_normalization(
                        network[-1],
                        mean=variables['bn' + str(layerid) + '_m'],
                        variance=variables['bn' + str(layerid) + '_s'],
                        offset=variables['bn' + str(layerid) + '_b'],
                        scale=variables['bn' + str(layerid) + '_g'],
                        variance_epsilon=1.e-4,
                        name='bn' + str(layerid)))
            else:
                bshape = (variables['conv' + str(layerid) + '_w'].shape[3])
                variables['conv' + str(layerid) + '_b'] = tf.Variable(
                    tf.zeros(bshape, dtype=dtype),
                    name='conv' + str(layerid) + '_b',
                    dtype=dtype)
                #add bias
                if dtype != tf.float16:
                    network.append(
                        tf.nn.bias_add(network[-1],
                                       variables['conv' + str(layerid) + '_b'],
                                       data_format=data_format))
                else:
                    print("Warning: bias-add currently not supported for fp16!")

            if args["optimizer"] == "KFAC":
                args["opt_args"]["layer_collection"].register_conv2d(
                    (variables['conv' + str(layerid) + '_w'],
                     variables['conv' + str(layerid) + '_b']),
                    [1, 1, 1, 1], padding, network[-3], network[-1])

            #add relu unit
            #with tf.device(device):
            network.append(activation(network[-1]))

            #add maxpool
            #with tf.device(device):
            kshape = [1, 1, 2, 2]
            sshape = [1, 1, 2, 2]
            if data_format == "NHWC":
                kshape = [1, 2, 2, 1]
                sshape = [1, 2, 2, 1]
            network.append(
                tf.nn.max_pool(network[-1],
                               ksize=kshape,
                               strides=sshape,
                               padding=args['conv_params']['padding'],
                               data_format=data_format,
                               name='maxpool' + str(layerid)))

            #add dropout
            #with tf.device(device):
            network.append(
                tf.nn.dropout(network[-1],
                              keep_prob=variables['keep_prob_'],
                              name='drop' + str(layerid)))

        if args['scaling_improvements']:
            #add another conv layer with average pooling to the mix
            #with tf.device(device):
            variables['conv' + str(layerid + 1) + '_w'] = tf.Variable(
                initializer([ksize, ksize, prev_num_filters, num_filters],
                            dtype=dtype),
                name='conv' + str(layerid + 1) + '_w',
                dtype=dtype)
            prev_num_filters = num_filters

            #conv unit
            network.append(
                tf.nn.conv2d(network[-1],
                             filter=variables['conv' + str(layerid + 1) + '_w'],
                             strides=[1, 1, 1, 1],
                             padding=padding,
                             data_format=data_format,
                             name='conv' + str(layerid + 1)))

            #bias
            bshape = (variables['conv' + str(layerid + 1) + '_w'].shape[3])
            variables['conv' + str(layerid + 1) + '_b'] = tf.Variable(
                tf.zeros(bshape, dtype=dtype),
                name='conv' + str(layerid + 1) + '_b',
                dtype=dtype)
            #add bias
            if dtype != tf.float16:
                network.append(
                    tf.nn.bias_add(network[-1],
                                   variables['conv' + str(layerid + 1) + '_b'],
                                   data_format=data_format))
            else:
                print("Warning: bias-add currently not supported for fp16!")

            if args["optimizer"] == "KFAC":
                args["opt_args"]["layer_collection"].register_conv2d(
                    (variables['conv' + str(layerid + 1) + '_w'],
                     variables['conv' + str(layerid + 1) + '_b']),
                    [1, 1, 1, 1], padding, network[-3], network[-1])

            #add relu unit
            #with tf.device(device):
            network.append(activation(network[-1]))

            #add average-pool
            #with tf.device(device):
            #pool over everything
            imsize = network[-1].shape[2]
            kshape = [1, 1, imsize, imsize]
            sshape = [1, 1, imsize, imsize]
            if data_format == "NHWC":
                kshape = [1, imsize, imsize, 1]
                sshape = [1, imsize, imsize, 1]
            network.append(
                tf.nn.avg_pool(network[-1],
                               ksize=kshape,
                               strides=sshape,
                               padding=args['conv_params']['padding'],
                               data_format=data_format,
                               name='avgpool1'))

        #reshape
        outsize = np.prod(network[-1].shape[1:]).value
        #with tf.device(device):
        network.append(
            tf.reshape(network[-1], shape=[-1, outsize], name='flatten'))

        if not args['scaling_improvements']:
            #now do the MLP
            #fc1
            #with tf.device(device):
            variables['fc1_w'] = tf.Variable(
                initializer([outsize, args['num_fc_units']], dtype=dtype),
                name='fc1_w',
                dtype=dtype)
            variables['fc1_b'] = tf.Variable(
                tf.zeros([args['num_fc_units']], dtype=dtype),
                name='fc1_b',
                dtype=dtype)
            network.append(
                tf.matmul(network[-1], variables['fc1_w']) +
                variables['fc1_b'])

            if args["optimizer"] == "KFAC":
                args["opt_args"]["layer_collection"].register_fully_connected(
                    (variables['fc1_w'], variables['fc1_b']),
                    network[-2], network[-1])

            #add relu unit
            #with tf.device(device):
            network.append(activation(network[-1]))

            #add dropout
            #with tf.device(device):
            network.append(
                tf.nn.dropout(network[-1],
                              keep_prob=variables['keep_prob_'],
                              name='drop' + str(layerid)))

            #fc2
            #with tf.device(device):
            variables['fc2_w'] = tf.Variable(
                initializer([args['num_fc_units'], 2], dtype=dtype),
                name='fc2_w',
                dtype=dtype)
            variables['fc2_b'] = tf.Variable(
                tf.zeros([2], dtype=dtype), name='fc2_b', dtype=dtype)
            network.append(
                tf.matmul(network[-1], variables['fc2_w']) +
                variables['fc2_b'])

            if args["optimizer"] == "KFAC":
                args["opt_args"]["layer_collection"].register_fully_connected(
                    (variables['fc2_w'], variables['fc2_b']),
                    network[-2], network[-1])
        else:
            #only one FC layer here
            #with tf.device(device):
            variables['fc1_w'] = tf.Variable(
                initializer([outsize, 2], dtype=dtype),
                name='fc1_w',
                dtype=dtype)
            variables['fc1_b'] = tf.Variable(
                tf.zeros([2], dtype=dtype), name='fc1_b', dtype=dtype)
            network.append(
                tf.matmul(network[-1], variables['fc1_w']) +
                variables['fc1_b'])

            if args["optimizer"] == "KFAC":
                args["opt_args"]["layer_collection"].register_fully_connected(
                    (variables['fc1_w'], variables['fc1_b']),
                    network[-2], network[-1])

        #register logits for KFAC:
        if args["optimizer"] == "KFAC":
            args["opt_args"]["layer_collection"] \
                .register_categorical_predictive_distribution(network[-1],
                                                              name="logits")

        #add softmax
        #with tf.device(device):
        network.append(tf.nn.softmax(network[-1]))

    build_cnn_model_time_logger.end_timer()

    #return the network and variables
    return variables, network
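
#Design note: `network` is used as a stack of ops, so network[-1] is always
#the output of the most recently appended layer; this is why the KFAC
#registration calls can pass layer inputs and outputs via relative indices
#such as network[-3] and network[-1] instead of naming tensors explicitly.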
def parse_arguments():
    parse_arg_logger = logger(-1, "Parse Arguments")
    parse_arg_logger.start_timer()

    parser = argparse.ArgumentParser()
    parser.add_argument("--config",
                        type=str,
                        help="specify a config file in json format")
    parser.add_argument("--num_tasks",
                        type=int,
                        default=1,
                        help="specify the number of tasks")
    parser.add_argument(
        "--precision",
        type=str,
        default="fp32",
        help="specify the precision. supported are fp32 and fp16")
    parser.add_argument('--dummy_data',
                        action='store_const',
                        const=True,
                        default=False,
                        help='use dummy data instead of real data')
    parser.add_argument("--disable_training",
                        help="Disable training for test purpose",
                        action='store_true')
    parser.add_argument("--enable_tf_timeline",
                        help="Enable Timeline module for tracing TF workflow",
                        action='store_true')
    pargs = parser.parse_args()

    #load the json:
    with open(pargs.config, "r") as f:
        args = json.load(f)

    #set the rest
    args['num_tasks'] = pargs.num_tasks
    args['num_ps'] = 0
    args['dummy_data'] = pargs.dummy_data
    args['disable_training'] = pargs.disable_training
    args['enable_tf_timeline'] = pargs.enable_tf_timeline

    #modify the activations
    if args['conv_params']['activation'] == 'ReLU':
        args['conv_params']['activation'] = tf.nn.relu
    else:
        raise ValueError('Only ReLU is supported as activation')

    #modify the initializers
    if args['conv_params']['initializer'] == 'HE':
        args['conv_params']['initializer'] = tfk.initializers.he_normal()
    else:
        raise ValueError('Only HE is supported as initializer')

    #modify the optimizers
    args['opt_args'] = {"learning_rate": args['learning_rate']}
    if args['optimizer'] == 'KFAC':
        args['opt_func'] = tf.contrib.kfac.optimizer.KfacOptimizer
        args['opt_args']['cov_ema_decay'] = args['cov_ema_decay']
        args['opt_args']['damping'] = args['damping']
        args['opt_args']['momentum'] = args['momentum']
    elif args['optimizer'] == 'ADAM':
        args['opt_func'] = tf.train.AdamOptimizer
    else:
        raise ValueError('Only ADAM and KFAC are supported as optimizer')

    #now, see if all the paths are there
    args['logpath'] = args['outputpath'] + '/logs'
    args['modelpath'] = args['outputpath'] + '/models'
    if not os.path.isdir(args['logpath']):
        print("Creating log directory ", args['logpath'])
        os.makedirs(args['logpath'])
    if not os.path.isdir(args['modelpath']):
        print("Creating model directory ", args['modelpath'])
        os.makedirs(args['modelpath'])
    if not os.path.isdir(args['inputpath']) and not args['dummy_data']:
        raise ValueError(
            "Please specify a valid path with input files in hdf5 format")

    #precision:
    args['precision'] = tf.float32
    if pargs.precision == "fp16":
        args['precision'] = tf.float16

    parse_arg_logger.end_timer()
    return args
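
#Minimal config sketch (hypothetical values; only keys that parse_arguments
#reads are shown, further model keys such as input_shape are consumed later
#by build_cnn_model):
#
#  {
#    "inputpath": "/path/to/hdf5",
#    "outputpath": "./run0",
#    "learning_rate": 1.0e-4,
#    "optimizer": "ADAM",
#    "conv_params": {"activation": "ReLU", "initializer": "HE",
#                    "filter_size": 3, "num_filters": 64,
#                    "padding": "SAME", "data_format": "NHWC"}
#  }
#
#invoked as, e.g. (script name hypothetical):
#  python hep_classifier.py --config config.json --num_tasks 4 --precision fp16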
def _h5_input_subprocess_reader(path, channels, weights, minvals, maxvals,
                                update_on_read, dtype, comm_rank=-1):
    #begin_time = time.time()
    #need to send the comm_rank here
    image_reader_timer_logger = logger(comm_rank, "Time to Read Single Image",
                                       -1, True)
    image_reader_timer_logger.start_timer()

    #Edited on July 16 2018, by Jialin Liu
    #Replacing the h5py reads (kept commented out below) with fake I/O,
    #to avoid I/O calls to file systems
    dsize = [4, 768, 1152]  #hard-coded shape of climate->data
    data = np.random.rand(len(channels), dsize[1], dsize[2])
    #data = f['climate']['data'][channels,:,:]
    #cast data if needed
    if data.dtype != dtype:
        #data = f['climate']['data'][channels,:,:].astype(dtype)
        data = data.astype(dtype)
    #do min/max normalization
    for c in range(len(channels)):
        data[c, :, :] = (data[c, :, :] - minvals[c]) / (maxvals[c] - minvals[c])

    #get label dataset shape
    #label_shape = f['climate']['labels'].shape
    label_shape = [768, 1152]  #hard-coded shape of climate->labels
    #generate same size of label data in memory
    label = np.random.rand(label_shape[0], label_shape[1])
    #get label
    #label = f['climate']['labels'][...]
    if label.dtype != np.int32:
        label = label.astype(np.int32)

    #original h5py-based reader, disabled in favor of the fake I/O above:
    #with h5.File(path, "r", driver="core", backing_store=False, libver="latest") as f:
    #    #get min and max values and update stored values
    #    if update_on_read:
    #        minvals = np.minimum(minvals, f['climate']['stats'][channels,0])
    #        maxvals = np.maximum(maxvals, f['climate']['stats'][channels,1])
    #    #get data
    #    if 'channels' in f['climate']:
    #        #some channels have been dropped from the file, so map to the
    #        #actual locations in the file array
    #        channel_list = list(f['climate']['channels'])
    #        channels = [channel_list.index(c) for c in channels]
    #    data = f['climate']['data'][channels,:,:]
    #    #cast data if needed
    #    if data.dtype != dtype:
    #        data = data.astype(dtype)
    #    #do min/max normalization
    #    for c in range(len(channels)):
    #        data[c,:,:] = (data[c,:,:]-minvals[c])/(maxvals[c]-minvals[c])
    #    #get label
    #    label = f['climate']['labels'][...]
    #    if label.dtype != np.int32:
    #        label = label.astype(np.int32)

    #get weights - choose per-channel based on the labels
    weights = weights[label]

    #time
    #end_time = time.time()
    #print "Time to read image %.3f s" % (end_time-begin_time)
    image_reader_timer_logger.end_timer()
    return data, label, weights, minvals, maxvals
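
#Note on the fake labels above: np.random.rand draws floats in [0, 1), so
#the int32 cast collapses essentially all generated labels to 0. That is
#acceptable here because the stated purpose of this variant is to benchmark
#the pipeline without file-system I/O, not to produce meaningful losses.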
if __name__ == '__main__':
    argparse_timer_logger = logger(-1, "Parse Arguments", -1, True)
    argparse_timer_logger.start_timer()

    AP = argparse.ArgumentParser()
    AP.add_argument("--lr", default=1e-4, type=float, help="Learning rate")
    AP.add_argument("--blocks",
                    default=[3, 3, 4, 4, 7, 7, 10],
                    type=int,
                    nargs="*",
                    help="Number of layers per block")
    AP.add_argument("--output",
                    type=str,
                    default='output',
                    help="Defines the location and name of output directory")
    AP.add_argument(
        "--chkpt",
def train_loop(sess, train_step, global_step, optlist, args, trainset,
               validationset, disable_training, enable_tf_timeline):
    train_loop_logger = logger(int(args["task_index"]), "Train Loop")
    train_loop_logger.start_timer()

    options = None
    run_metadata = None
    many_runs_timeline = None
    if enable_tf_timeline:
        options = tf.RunOptions(trace_level=tf.RunOptions.FULL_TRACE)
        run_metadata = tf.RunMetadata()
        many_runs_timeline = timeliner()

    #counter stuff
    trainset.reset()
    validationset.reset()

    #restore weights belonging to graph
    epochs_completed = 0
    if not args['restart']:
        last_model = tf.train.latest_checkpoint(args['modelpath'])
        print("Restoring model {}.".format(last_model))
        model_saver.restore(sess, last_model)

    #losses
    train_loss = 0.
    train_batches = 0
    total_batches = 0
    train_time = 0

    #do training
    while not sess.should_stop():
        train_iteration_logger = logger(int(args['task_index']),
                                        "Training Iteration",
                                        epochs_completed)
        train_iteration_logger.start_timer()

        #increment total batch counter
        total_batches += 1

        #get next batch
        images, labels, normweights, _, _ = trainset.next_batch(
            args['train_batch_size_per_node'])
        #set weights to one
        normweights[:] = 1.

        #set up feed dict:
        feed_dict = {
            variables['images_']: images,
            variables['labels_']: labels,
            variables['weights_']: normweights,
            variables['keep_prob_']: args['dropout_p']
        }

        if not disable_training:
            #update weights
            start_time = time.time()
            if args['create_summary']:
                _, gstep, summary, tmp_loss = sess.run(
                    [train_step, global_step, train_summary, loss_fn],
                    feed_dict=feed_dict,
                    options=options,
                    run_metadata=run_metadata)
                if enable_tf_timeline:
                    fetched_timeline = timeline.Timeline(
                        run_metadata.step_stats)
                    chrome_trace = fetched_timeline.generate_chrome_trace_format()
                    many_runs_timeline.update_timeline(chrome_trace)
            else:
                _, gstep, tmp_loss = sess.run(
                    [train_step, global_step, loss_fn],
                    feed_dict=feed_dict,
                    options=options,
                    run_metadata=run_metadata)
                if enable_tf_timeline:
                    fetched_timeline = timeline.Timeline(
                        run_metadata.step_stats)
                    chrome_trace = fetched_timeline.generate_chrome_trace_format()
                    many_runs_timeline.update_timeline(chrome_trace)

            #update kfac parameters
            if optlist:
                sess.run(optlist[0],
                         feed_dict=feed_dict,
                         options=options,
                         run_metadata=run_metadata)
                if enable_tf_timeline:
                    fetched_timeline = timeline.Timeline(
                        run_metadata.step_stats)
                    chrome_trace = fetched_timeline.generate_chrome_trace_format()
                    many_runs_timeline.update_timeline(chrome_trace)
                if gstep % args["kfac_inv_update_frequency"] == 0:
                    sess.run(optlist[1],
                             feed_dict=feed_dict,
                             options=options,
                             run_metadata=run_metadata)
                    if enable_tf_timeline:
                        fetched_timeline = timeline.Timeline(
                            run_metadata.step_stats)
                        chrome_trace = fetched_timeline.generate_chrome_trace_format()
                        many_runs_timeline.update_timeline(chrome_trace)

            end_time = time.time()
            train_time += end_time - start_time

            #increment train loss and batch number
            train_loss += tmp_loss
            train_batches += 1

            #determine if we give a short update:
            if gstep % args['display_interval'] == 0:
                print(
                    time.time(), "REPORT rank", args["task_index"],
                    "global step %d., average training loss %g (%.3f sec/batch)"
                    % (gstep, train_loss / float(train_batches),
                       train_time / float(train_batches)))

        #check if epoch is done
        if trainset._epochs_completed > epochs_completed:
            epochs_completed = trainset._epochs_completed
            print(
                time.time(), "COMPLETED rank", args["task_index"],
                "epoch %d, average training loss %g (%.3f sec/batch)" %
                (epochs_completed, train_loss / float(train_batches),
                 train_time / float(train_batches)))

            #reset counters
            train_loss = 0.
            train_batches = 0
            train_time = 0

            #compute validation loss:
            #reset variables
            validation_loss = 0.
            validation_batches = 0

            #iterate over batches
            while True:
                #get next batch
                images, labels, normweights, weights, _ = validationset.next_batch(
                    args['validation_batch_size_per_node'])
                #set weights to 1:
                normweights[:] = 1.
                weights[:] = 1.

                if not disable_training:
                    #compute loss
                    if args['create_summary']:
                        summary, tmp_loss = sess.run(
                            [validation_summary, loss_fn],
                            feed_dict={
                                variables['images_']: images,
                                variables['labels_']: labels,
                                variables['weights_']: normweights,
                                variables['keep_prob_']: 1.0
                            })
                    else:
                        tmp_loss = sess.run(
                            loss_fn,
                            feed_dict={
                                variables['images_']: images,
                                variables['labels_']: labels,
                                variables['weights_']: normweights,
                                variables['keep_prob_']: 1.0
                            })

                    #add loss
                    validation_loss += tmp_loss
                    validation_batches += 1

                    #update accuracy
                    sess.run(accuracy_fn[1],
                             feed_dict={
                                 variables['images_']: images,
                                 variables['labels_']: labels,
                                 variables['weights_']: normweights,
                                 variables['keep_prob_']: 1.0
                             })

                    #update auc
                    sess.run(auc_fn[1],
                             feed_dict={
                                 variables['images_']: images,
                                 variables['labels_']: labels,
                                 variables['weights_']: normweights,
                                 variables['keep_prob_']: 1.0
                             })

                #check if full pass done
                if validationset._epochs_completed > 0:
                    validationset.reset()
                    break

            print(
                time.time(),
                "COMPLETED epoch %d, average validation loss %g" %
                (epochs_completed,
                 validation_loss / float(validation_batches)))
            validation_accuracy = sess.run(accuracy_fn[0])
            print(
                time.time(),
                "COMPLETED epoch %d, average validation accu %g" %
                (epochs_completed, validation_accuracy))
            validation_auc = sess.run(auc_fn[0])
            print(
                time.time(),
                "COMPLETED epoch %d, average validation auc %g" %
                (epochs_completed, validation_auc))

        if enable_tf_timeline:
            many_runs_timeline.save('Timeliner_output.json')

        train_iteration_logger.end_timer()

    if enable_tf_timeline:
        many_runs_timeline.save('Timeliner_output.json')

    train_loop_logger.end_timer()
def main(input_path, blocks, weights, image_dir, checkpoint_dir, trn_sz,
         learning_rate, loss_type, fs_type, opt_type, batch, batchnorm,
         num_epochs, dtype, chkpt, filter_sz, growth, disable_training,
         enable_tf_timeline):
    options = None
    run_metadata = None
    many_runs_timeline = None
    timeline_trace_fp = open("timeline_trace.pickle", "wb")

    options, run_metadata, many_runs_timeline, min_timeline_step, max_timeline_step = \
        init_timeline_configs(enable_tf_timeline, tf.RunOptions.FULL_TRACE,
                              -1, -1)

    global_time_logger = logger(-1, "Global Total Time", -1, True)
    global_time_logger.start_timer()

    #init horovod
    initialization_timer_logger = logger(-1, "Initialize Horovod", -1, True)
    initialization_timer_logger.start_timer()
    nvtx.RangePush("init horovod", 1)
    comm_rank = 0
    comm_local_rank = 0
    comm_size = 1
    comm_local_size = 1
    if horovod:
        hvd.init()
        comm_rank = hvd.rank()
        comm_local_rank = hvd.local_rank()
        comm_size = hvd.size()
        #not all horovod versions have that implemented
        try:
            comm_local_size = hvd.local_size()
        except AttributeError:
            comm_local_size = 1
        if comm_rank == 0:
            print("Using distributed computation with Horovod: {} total ranks"
                  .format(comm_size))
    nvtx.RangePop()  # init horovod
    initialization_timer_logger.set_rank(int(comm_rank))
    initialization_timer_logger.end_timer()
    global_time_logger.set_rank(int(comm_rank))

    #parameters
    channels = [0, 1, 2, 10]
    per_rank_output = False
    loss_print_interval = 1

    #session config
    initialization_timer_logger.start_timer(comm_rank, "Configure Session")
    sess_config = tf.ConfigProto(
        inter_op_parallelism_threads=6,  #1
        intra_op_parallelism_threads=1,  #6
        log_device_placement=False,
        allow_soft_placement=True)
    sess_config.gpu_options.visible_device_list = str(comm_local_rank)
    initialization_timer_logger.end_timer()

    #get data
    initialization_timer_logger.start_timer(comm_rank, "Get Data")
    training_graph = tf.Graph()
    if comm_rank == 0:
        print("Loading data...")
    trn_data, val_data, tst_data = load_data(input_path, trn_sz, comm_rank)
    if comm_rank == 0:
        print("Shape of trn_data is {}".format(trn_data.shape[0]))
        print("done.")
    initialization_timer_logger.end_timer()

    #print some stats
    if comm_rank == 0:
        print("Learning Rate: {}".format(learning_rate))
        print("Num workers: {}".format(comm_size))
        print("Local batch size: {}".format(batch))
        if dtype == tf.float32:
            print("Precision: {}".format("FP32"))
        else:
            print("Precision: {}".format("FP16"))
        print("Batch normalization: {}".format(batchnorm))
        print("Blocks: {}".format(blocks))
        print("Growth rate: {}".format(growth))
        print("Filter size: {}".format(filter_sz))
        print("Channels: {}".format(channels))
        print("Loss type: {}".format(loss_type))
        print("Loss weights: {}".format(weights))
        print("Optimizer type: {}".format(opt_type))
        print("Num training samples: {}".format(trn_data.shape[0]))
        print("Num validation samples: {}".format(val_data.shape[0]))

    io_training_time_logger = logger(comm_rank, "IO and Training", -1, True)
    io_training_time_logger.start_timer()

    with training_graph.as_default():
        nvtx.RangePush("TF Init", 3)
        #create readers
        trn_reader = h5_input_reader(input_path,
                                     channels,
                                     weights,
                                     dtype,
                                     normalization_file="stats.h5",
                                     update_on_read=False,
                                     comm_rank=comm_rank)
        val_reader = h5_input_reader(input_path,
                                     channels,
                                     weights,
                                     dtype,
                                     normalization_file="stats.h5",
                                     update_on_read=False,
                                     comm_rank=comm_rank)
        #create datasets
        if fs_type == "local":
            trn_dataset = create_dataset(trn_reader, trn_data, batch,
                                         num_epochs, comm_local_size,
                                         comm_local_rank, dtype, shuffle=True)
            val_dataset = create_dataset(val_reader, val_data, batch, 1,
                                         comm_local_size, comm_local_rank,
                                         dtype, shuffle=False)
        else:
            trn_dataset = create_dataset(trn_reader, trn_data, batch,
                                         num_epochs, comm_size, comm_rank,
                                         dtype, shuffle=True)
            val_dataset = create_dataset(val_reader, val_data, batch, 1,
                                         comm_size, comm_rank, dtype,
                                         shuffle=False)

        #create iterators
        handle = tf.placeholder(tf.string,
                                shape=[],
                                name="iterator-placeholder")
        iterator = tf.data.Iterator.from_string_handle(
            handle, (dtype, tf.int32, dtype),
            ((batch, len(channels), image_height, image_width),
             (batch, image_height, image_width),
             (batch, image_height, image_width)))
        next_elem = iterator.get_next()

        #create init handles
        #trn
        trn_iterator = trn_dataset.make_initializable_iterator()
        trn_handle_string = trn_iterator.string_handle()
        trn_init_op = iterator.make_initializer(trn_dataset)
        #val
        val_iterator = val_dataset.make_initializable_iterator()
        val_handle_string = val_iterator.string_handle()
        val_init_op = iterator.make_initializer(val_dataset)

        #set up model
        logit, prediction = create_tiramisu(3,
                                            next_elem[0],
                                            image_height,
                                            image_width,
                                            len(channels),
                                            loss_weights=weights,
                                            nb_layers_per_block=blocks,
                                            p=0.2,
                                            wd=1e-4,
                                            dtype=dtype,
                                            batchnorm=batchnorm,
                                            growth_rate=growth,
                                            filter_sz=filter_sz,
                                            comm_rank=comm_rank)

        #set up loss
        labels_one_hot = tf.cast(tf.contrib.layers.one_hot_encoding(
            next_elem[1], 3), dtype=dtype)
        loss = None
        if loss_type == "weighted":
            loss = tf.losses.softmax_cross_entropy(
                onehot_labels=labels_one_hot,
                logits=logit,
                weights=next_elem[2])
        elif loss_type == "focal":
            loss = focal_loss(onehot_labels=labels_one_hot,
                              logits=logit,
                              alpha=1.,
                              gamma=2.)
        else:
            raise ValueError(
                "Error, loss type {} not supported.".format(loss_type))
        if horovod:
            loss_avg = hvd.allreduce(tf.cast(loss, tf.float32))
        else:
            loss_avg = tf.identity(loss)

        #set up global step
        global_step = tf.train.get_or_create_global_step()

        #set up optimizer
        if opt_type.startswith("LARC"):
            if comm_rank == 0:
                print("Enabling LARC")
            train_op = get_larc_optimizer(opt_type.split("-")[1],
                                          loss,
                                          global_step,
                                          learning_rate,
                                          LARC_mode="clip",
                                          LARC_eta=0.002,
                                          LARC_epsilon=1. / 16000.)
        else:
            train_op = get_optimizer(opt_type, loss, global_step,
                                     learning_rate)

        #set up streaming metrics
        iou_op, iou_update_op = tf.metrics.mean_iou(
            labels=next_elem[1],
            predictions=tf.argmax(prediction, axis=3),
            num_classes=3,
            weights=None,
            metrics_collections=None,
            updates_collections=None,
            name="iou_score")
        iou_reset_op = tf.variables_initializer([
            i for i in tf.local_variables()
            if i.name.startswith('iou_score/')
        ])
        if horovod:
            iou_avg = hvd.allreduce(iou_op)
        else:
            iou_avg = tf.identity(iou_op)

        #compute epochs and stuff:
        if fs_type == "local":
            num_samples = trn_data.shape[0] // comm_local_size
        else:
            num_samples = trn_data.shape[0] // comm_size
        #num_steps_per_epoch = num_samples // batch
        num_steps_per_epoch = 10
        num_steps = num_epochs * num_steps_per_epoch
        if per_rank_output:
            print("Rank {} does {} steps per epoch".format(
                comm_rank, num_steps_per_epoch))

        #hooks
        #these hooks are essential: regularize the step hook by adding one
        #additional step at the end
        hooks = [tf.train.StopAtStepHook(last_step=num_steps + 1)]

        #bcast init for bcasting the model after start
        init_bcast = hvd.broadcast_global_variables(0)

        #initializers:
        init_op = tf.global_variables_initializer()
        init_local_op = tf.local_variables_initializer()

        #checkpointing
        if comm_rank == 0:
            checkpoint_save_freq = num_steps_per_epoch * 2
            checkpoint_saver = tf.train.Saver(max_to_keep=1000)
            listener = checkpoint_listener(comm_rank, True)
            hooks.append(
                tf.train.CheckpointSaverHook(checkpoint_dir=checkpoint_dir,
                                             save_steps=checkpoint_save_freq,
                                             saver=checkpoint_saver,
                                             listeners=[listener]))
            #create image dir if not exists
            if not os.path.isdir(image_dir):
                os.makedirs(image_dir)

        ##DEBUG
        ##summary
        #if comm_rank == 0:
        #    print("write graph for debugging")
        #    tf.summary.scalar("loss", loss)
        #    summary_op = tf.summary.merge_all()
        #    #hooks.append(tf.train.SummarySaverHook(save_steps=num_steps_per_epoch, summary_writer=summary_writer, summary_op=summary_op))
        #    with tf.Session(config=sess_config) as sess:
        #        sess.run([init_op, init_local_op])
        #        #create iterator handles
        #        trn_handle = sess.run(trn_handle_string)
        #        #init iterators
        #        sess.run(trn_init_op, feed_dict={handle: trn_handle, datafiles: trn_data, labelfiles: trn_labels})
        #        #summary:
        #        sess.run(summary_op, feed_dict={handle: trn_handle})
        #        #summary file writer
        #        summary_writer = tf.summary.FileWriter('./logs', sess.graph)
        ##DEBUG

        #start session
        with tf.train.MonitoredTrainingSession(config=sess_config,
                                               hooks=hooks) as sess:
            #initialize
            sess.run([init_op, init_local_op])
            #restore from checkpoint:
            if comm_rank == 0:
                load_model(sess, checkpoint_saver, checkpoint_dir, comm_rank)
            #broadcast loaded model variables
            sess.run(init_bcast)
            #create iterator handles
            trn_handle, val_handle = sess.run(
                [trn_handle_string, val_handle_string],
                options=options,
                run_metadata=run_metadata)
            update_timeline_in_range(enable_tf_timeline, run_metadata,
                                     many_runs_timeline,
                                     "create_iterator_handle.json")
            #init iterators
            sess.run(trn_init_op,
                     feed_dict={handle: trn_handle},
                     options=options,
                     run_metadata=run_metadata)
            update_timeline_in_range(enable_tf_timeline, run_metadata,
                                     many_runs_timeline,
                                     "init_train_iterator_handle.json")
            sess.run(val_init_op,
                     feed_dict={handle: val_handle},
                     options=options,
                     run_metadata=run_metadata)
            update_timeline_in_range(enable_tf_timeline, run_metadata,
                                     many_runs_timeline,
                                     "init_val_iterator_handle.json")

            nvtx.RangePop()  # TF Init

            #do the training
            epoch = 1
            step = 1
            train_loss = 0.
nvtx.RangePush("Training Loop", 4) nvtx.RangePush("Epoch", epoch) start_time = time.time() training_loop_timer_logger = logger(comm_rank, "Training Loop", -1, True) training_loop_timer_logger.start_timer() train_steps = 0 while not (sess.should_stop()): #training loop try: training_iteration_time_logger = logger( comm_rank, "Training Iteration", epoch, True) training_iteration_time_logger.start_timer() nvtx.RangePush("Step", step) if disable_training: train_steps = sess.run([global_step], feed_dict={handle: trn_handle}, options=options, run_metadata=run_metadata) update_timeline_in_range( enable_tf_timeline, run_metadata, many_runs_timeline, train_steps[0], "train_" + str(global_step) + ".json", min_timeline_step, max_timeline_step) train_steps_in_epoch = train_steps[ 0] % num_steps_per_epoch # do the validation phase if train_steps_in_epoch == 0: eval_steps = 0 while True: try: sess.run([next_elem[1]], feed_dict={handle: val_handle}, options=options, run_metadata=run_metadata) update_timeline_in_range( enable_tf_timeline, run_metadata, many_runs_timeline, "val_dict" + str(eval_steps) + ".json") eval_steps += 1 except tf.errors.OutOfRangeError: sess.run(val_init_op, feed_dict={handle: val_handle}, options=options, run_metadata=run_metadata) update_timeline_in_range( enable_tf_timeline, run_metadata, many_runs_timeline, "val_dict_out_" + str(eval_steps) + ".json") break else: # construct feed dict _, train_steps, tmp_loss = sess.run( [ train_op, global_step, (loss if per_rank_output else loss_avg) ], feed_dict={handle: trn_handle}, options=options, run_metadata=run_metadata) update_timeline_in_range( enable_tf_timeline, run_metadata, many_runs_timeline, train_steps, "val_" + str(global_step) + ".json", min_timeline_step, max_timeline_step) if comm_rank == 0: step_trace_fp = open( "train_step_trace_" + str(global_step) + ".pickle", "wb") pickle.dump(run_metadata, step_trace_fp) train_steps_in_epoch = train_steps % num_steps_per_epoch train_loss += tmp_loss nvtx.RangePop() # Step step += 1 #print step report eff_steps = train_steps_in_epoch if ( train_steps_in_epoch > 0) else num_steps_per_epoch if (train_steps % loss_print_interval) == 0: if per_rank_output: print( "REPORT: rank {}, training loss for step {} (of {}) is {}, time {}" .format(comm_rank, train_steps, num_steps, train_loss / eff_steps, time.time() - start_time)) else: if comm_rank == 0: print( "REPORT: training loss for step {} (of {}) is {}, time {}" .format(train_steps, num_steps, train_loss / eff_steps, time.time() - start_time)) #do the validation phase if train_steps_in_epoch == 0: end_time = time.time() #print epoch report train_loss /= num_steps_per_epoch if per_rank_output: print( "COMPLETED: rank {}, training loss for epoch {} (of {}) is {}, time {} s" .format(comm_rank, epoch, num_epochs, train_loss, time.time() - start_time)) else: if comm_rank == 0: print( "COMPLETED: training loss for epoch {} (of {}) is {}, time {} s" .format(epoch, num_epochs, train_loss, time.time() - start_time)) #evaluation loop eval_loss = 0. 
                            eval_steps = 0
                            nvtx.RangePush("Eval Loop", 7)
                            timeline_help_count = 0
                            while True:
                                try:
                                    #construct feed dict
                                    _, tmp_loss, val_model_predictions, val_model_labels = sess.run(
                                        [
                                            iou_update_op,
                                            (loss if per_rank_output else loss_avg),
                                            prediction, next_elem[1]
                                        ],
                                        feed_dict={handle: val_handle},
                                        options=options,
                                        run_metadata=run_metadata)
                                    update_timeline_in_range(
                                        enable_tf_timeline, run_metadata,
                                        many_runs_timeline,
                                        timeline_help_count,
                                        "train_" + str(global_step) + ".json",
                                        min_timeline_step, max_timeline_step)

                                    if comm_rank == 0:
                                        with open(
                                                "validation_step_trace_" +
                                                str(global_step) + ".pickle",
                                                "wb") as step_trace_fp:
                                            pickle.dump(run_metadata,
                                                        step_trace_fp)

                                    timeline_help_count += 1

                                    #print some images
                                    if comm_rank == 0:
                                        if have_imsave:
                                            imsave(
                                                image_dir + '/test_pred_epoch' +
                                                str(epoch) + '_estep' +
                                                str(eval_steps) + '_rank' +
                                                str(comm_rank) + '.png',
                                                np.argmax(
                                                    val_model_predictions[0, ...],
                                                    axis=2) * 100)
                                            imsave(
                                                image_dir + '/test_label_epoch' +
                                                str(epoch) + '_estep' +
                                                str(eval_steps) + '_rank' +
                                                str(comm_rank) + '.png',
                                                val_model_labels[0, ...] * 100)
                                            imsave(
                                                image_dir +
                                                '/test_combined_epoch' +
                                                str(epoch) + '_estep' +
                                                str(eval_steps) + '_rank' +
                                                str(comm_rank) + '.png',
                                                colormap[val_model_labels[0, ...],
                                                         np.argmax(
                                                             val_model_predictions[0, ...],
                                                             axis=2)])
                                        else:
                                            np.save(
                                                image_dir + '/test_pred_epoch' +
                                                str(epoch) + '_estep' +
                                                str(eval_steps) + '_rank' +
                                                str(comm_rank) + '.npy',
                                                np.argmax(
                                                    val_model_predictions[0, ...],
                                                    axis=2) * 100)
                                            np.save(
                                                image_dir + '/test_label_epoch' +
                                                str(epoch) + '_estep' +
                                                str(eval_steps) + '_rank' +
                                                str(comm_rank) + '.npy',
                                                val_model_labels[0, ...] * 100)

                                    eval_loss += tmp_loss
                                    eval_steps += 1
                                except tf.errors.OutOfRangeError:
                                    eval_steps = np.max([eval_steps, 1])
                                    eval_loss /= eval_steps
                                    if per_rank_output:
                                        print(
                                            "COMPLETED: rank {}, evaluation loss for epoch {} (of {}) is {}"
                                            .format(comm_rank, epoch,
                                                    num_epochs, eval_loss))
                                    else:
                                        if comm_rank == 0:
                                            print(
                                                "COMPLETED: evaluation loss for epoch {} (of {}) is {}"
                                                .format(epoch, num_epochs,
                                                        eval_loss))
                                    if per_rank_output:
                                        iou_score = sess.run(iou_op)
                                        print(
                                            "COMPLETED: rank {}, evaluation IoU for epoch {} (of {}) is {}"
                                            .format(comm_rank, epoch,
                                                    num_epochs, iou_score))
                                    else:
                                        iou_score = sess.run(iou_avg)
                                        if comm_rank == 0:
                                            print(
                                                "COMPLETED: evaluation IoU for epoch {} (of {}) is {}"
                                                .format(epoch, num_epochs,
                                                        iou_score))
                                    sess.run(iou_reset_op)
                                    sess.run(val_init_op,
                                             feed_dict={handle: val_handle},
                                             options=options,
                                             run_metadata=run_metadata)
                                    update_timeline_in_range(
                                        enable_tf_timeline, run_metadata,
                                        many_runs_timeline,
                                        "train_" + str(global_step) + ".json")
                                    if comm_rank == 0:
                                        with open(
                                                "validation_step_trace_out.pickle",
                                                "wb") as step_trace_fp:
                                            pickle.dump(run_metadata,
                                                        step_trace_fp)
                                    break
                            nvtx.RangePop()  # Eval Loop

                            if enable_tf_timeline:
                                many_runs_timeline.save('Timeliner_output.json')

                            #reset counters
                            epoch += 1
                            train_loss = 0.
                            step = 0

                            nvtx.RangePop()  # Epoch
                            nvtx.RangePush("Epoch", epoch)

                    training_iteration_time_logger.end_timer()
                except tf.errors.OutOfRangeError:
                    break

            nvtx.RangePop()  # Epoch
            nvtx.RangePop()  # Training Loop
            training_loop_timer_logger.end_timer()

    if enable_tf_timeline:
        many_runs_timeline.save('Timeliner_output.json')

    io_training_time_logger.end_timer()
    global_time_logger.end_timer()
def create_tiramisu(nb_classes, img_input, height, width, nc, loss_weights,
                    nb_dense_block=6, growth_rate=16, nb_filter=48,
                    nb_layers_per_block=5, p=None, wd=0., training=True,
                    batchnorm=False, dtype=tf.float16, filter_sz=3,
                    comm_rank=-1):
    create_tiramisu_timer_logger = logger(comm_rank, "Create Tiramisu", -1,
                                          True)
    create_tiramisu_timer_logger.start_timer()

    if type(nb_layers_per_block) is list or type(nb_layers_per_block) is tuple:
        nb_layers = list(nb_layers_per_block)
    else:
        nb_layers = [nb_layers_per_block] * nb_dense_block

    with tf.variable_scope("tiramisu",
                           custom_getter=float32_variable_storage_getter):
        with tf.variable_scope("conv_input") as scope:
            x = conv(img_input, nb_filter, sz=filter_sz, wd=wd)
            if batchnorm:
                x = tf.layers.batch_normalization(x, axis=1,
                                                  training=training)
            x = tf.nn.relu(x)
            if p:
                x = tf.layers.dropout(x, rate=p, training=training)
        with tf.name_scope("down_path") as scope:
            skips, added = down_path(x, nb_layers, growth_rate, p, wd,
                                     training=training, bn=batchnorm,
                                     filter_sz=filter_sz)
        with tf.name_scope("up_path") as scope:
            x = up_path(added, reverse(skips[:-1]), reverse(nb_layers[:-1]),
                        growth_rate, p, wd, training=training, bn=batchnorm,
                        filter_sz=filter_sz)
        with tf.name_scope("conv_output") as scope:
            x = conv(x, nb_classes, sz=1, wd=wd)
            if p:
                x = tf.layers.dropout(x, rate=p, training=training)
            _, f, r, c = x.get_shape().as_list()
        #x = tf.reshape(x,[-1,nb_classes,image_height,image_width]) #nb_classes was last before
        #necessary because sparse softmax cross entropy does softmax over last axis
        x = tf.transpose(x, [0, 2, 3, 1])

    create_tiramisu_timer_logger.end_timer()
    return x, tf.nn.softmax(x)
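
#Layout note: the network is built channels-first (NCHW), so the final
#transpose to [0, 2, 3, 1] moves the class dimension to the last axis, as
#required by tf.nn.softmax and the softmax cross-entropy losses, which
#operate over the last dimension. A call mirroring the one in main():
#
#  logit, prediction = create_tiramisu(3, next_elem[0], image_height,
#                                      image_width, len(channels),
#                                      loss_weights=weights,
#                                      nb_layers_per_block=blocks,
#                                      p=0.2, wd=1e-4, dtype=dtype)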
def load_next_file(self):
    load_file_time_logger = logger(self._taskid, "Load File",
                                   self._epochs_completed)
    load_file_time_logger.start_timer()

    file_access_time_logger = logger(self._taskid, "HDF5 File Read",
                                     self._epochs_completed)
    file_access_time_logger.start_timer()

    #only load a new file if there is more than one file in the list:
    if self._num_files > 1 or not self._initialized:
        try:
            with h5.File(self._filelist[self._file_index], 'r') as f:
                #determine total array size:
                numentries = f['data'].shape[0]

                if self._split_file:
                    blocksize = int(
                        np.ceil(numentries / float(self._num_tasks)))
                    start = self._taskid * blocksize
                    end = (self._taskid + 1) * blocksize
                else:
                    start = 0
                    end = numentries

                #load the chunk which is needed
                self._images = f['data'][start:end]
                self._labels = f['label'][start:end]
                self._normweights = f['normweight'][start:end]
                self._weights = f['weight'][start:end]
                self._psr = f['psr'][start:end]
        except EnvironmentError:
            raise EnvironmentError("Cannot open file " +
                                   self._filelist[self._file_index])

        file_access_time_logger.end_timer()

        #sanity checks
        assert self._images.shape[0] == self._labels.shape[0], (
            'images.shape: %s labels.shape: %s' %
            (self._images.shape, self._labels.shape))
        assert self._labels.shape[0] == self._normweights.shape[0], (
            'labels.shape: %s normweights.shape: %s' %
            (self._labels.shape, self._normweights.shape))
        assert self._labels.shape[0] == self._psr.shape[0], (
            'labels.shape: %s psr.shape: %s' %
            (self._labels.shape, self._psr.shape))

        self._initialized = True

        #set number of samples
        self._num_examples = self._labels.shape[0]

        #reshape labels and weights
        self._labels = np.expand_dims(self._labels,
                                      axis=1).astype(np.int32, copy=False)
        self._normweights = np.expand_dims(self._normweights, axis=1)
        self._weights = np.expand_dims(self._weights, axis=1)
        self._psr = np.expand_dims(self._psr, axis=1)

        #transpose images if data format is NHWC
        if self._data_format == "NHWC":
            #transform from NCHW to NHWC
            self._images = np.transpose(self._images, (0, 2, 3, 1))

    #create permutation
    perm = np.arange(self._num_examples)
    np.random.shuffle(perm)

    #shuffle
    self._images = self._images[perm]
    self._labels = self._labels[perm]
    self._normweights = self._normweights[perm]
    self._weights = self._weights[perm]
    self._psr = self._psr[perm]

    load_file_time_logger.end_timer()
global_time_logger = logger(-1, "Global Total Time")
global_time_logger.start_timer()

#Parse Parameters
args = parse_arguments()

#Multi-Node Stuff
initialization_time_logger = logger(-1, "Server Initialization")
initialization_time_logger.start_timer()

#decide who will be worker and who will be parameters server
if args['num_tasks'] > 1:
    args['cluster'], args['server'], args['task_index'], args[
        'num_workers'], args['node_type'] = sc.setup_slurm_cluster(