def invoke_dataset_from_config(config: Config, required: Union[str, list, tuple] = None):
    """
    Initializes datasets from config.

    Imports the specified data reader and instantiates it with parameters from the config.

    :param config: config
    :param required: string, list or tuple specifying which datasets have to be loaded (e.g. ["train", "val"])
    :return: initialized data readers
    """
    # Initialize data readers if specified
    readers = {}
    if config.has_value("dataset"):
        def to_list(value):
            if value is None:
                result = []
            elif isinstance(value, str):
                result = [value]
            else:
                result = list(value)
            return result

        dataset = config.dataset
        required = to_list(required)
        try:
            reader_class = import_object(dataset["reader"])
            reader_args = inspect.signature(reader_class).parameters.keys()
            datasets = [key for key in dataset.keys() if key not in reader_args and key != "reader"]
            global_args = [key for key in dataset.keys() if key not in datasets and key != "reader"]

            # Check for required datasets before loading anything
            missing = [d for d in required if d not in datasets]
            if len(missing) > 0:
                raise Exception("Missing required dataset(s) {}".format(missing))

            # Read "global" parameters shared by all datasets
            global_pars = {}
            for key in global_args:
                value = dataset[key]
                global_pars[key] = value
                if isinstance(value, str) and "import::" in value:
                    global_pars[key] = import_object(value[len("import::"):])
                if key == "transforms":
                    global_pars[key] = Compose([invoke_functional_with_params(t) for t in value])

            # Read dataset-specific parameters
            for dset in datasets:
                # Inspect parameters and resolve them if necessary
                for key, value in dataset[dset].items():
                    if isinstance(value, str) and "import::" in value:
                        dataset[dset][key] = import_object(value[len("import::"):])
                    if key == "transforms":
                        dataset[dset][key] = Compose([invoke_functional_with_params(t) for t in value])
                print("Loading dataset '{}'...".format(dset))
                readers[dset] = reader_class(**{**global_pars, **dataset[dset]})
        except (AttributeError, TypeError) as e:
            print("Unable to import '{}'".format(e))
            raise e
    return readers
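
# Hedged sketch: a minimal example of the "dataset" config section this function expects. The
# reader class path, parameter names, and transform entries below are hypothetical; they only
# illustrate the resolution rules implemented above (values prefixed with "import::" are resolved
# via import_object, "transforms" lists are wrapped in Compose, keys matching the reader's
# constructor arguments act as global parameters, and all remaining keys define named datasets).
#
#   "dataset": {
#       "reader": "mypackage.readers.SegmentationReader",          # hypothetical reader class
#       "transforms": [{"ToTensor": {}}],                          # global parameter, composed
#       "train": {"data_path": "import::mypackage.paths.TRAIN"},   # resolved via import_object
#       "val": {"data_path": "/data/val"}
#   }
#
# With such a config, invoke_dataset_from_config(config, required=["train", "val"]) returns one
# instantiated reader per dataset name, e.g. readers["train"] and readers["val"].
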
def main(_):
    config = Config()
    np.random.seed(config.get_value("random_seed", 12345))

    # PARAMETERS
    n_epochs = config.get_value("epochs", 100)
    batchsize = config.get_value("batchsize", 8)
    n_classes = config.get_value("n_classes", 13)
    dropout = config.get_value("dropout", 0.25)  # TODO
    num_threads = config.get_value("num_threads", 5)
    initial_val = config.get_value("initial_val", True)

    # READER, LOADER
    readers = invoke_dataset_from_config(config)
    reader_train = readers["train"]
    reader_val = readers["val"]
    train_loader = torch.utils.data.DataLoader(reader_train, batch_size=config.batchsize, shuffle=True,
                                               num_workers=num_threads)
    val_loader = torch.utils.data.DataLoader(reader_val, batch_size=1, shuffle=False, num_workers=num_threads)

    # CONFIG
    tell = TeLLSession(config=config, model_params={"shape": reader_train.shape})

    # Get some members from the session for easier usage
    session = tell.tf_session
    model = tell.model
    workspace, config = tell.workspace, tell.config

    prediction = tf.sigmoid(model.output)
    prediction_val = tf.reduce_mean(tf.sigmoid(model.output), axis=0, keepdims=True)

    # LOSS
    if hasattr(model, "loss"):
        loss = model.loss()
    else:
        with tf.name_scope("Loss_per_Class"):
            loss = 0
            for i in range(n_classes):
                loss_batch = tf.nn.sigmoid_cross_entropy_with_logits(logits=model.output[:, i],
                                                                     labels=model.y_[:, i])
                loss_mean = tf.reduce_mean(loss_batch)
                loss += loss_mean

    # Validation loss after patching
    if hasattr(model, "loss"):
        loss_val = model.loss()
    else:
        with tf.name_scope("Loss_per_Class_Patching"):
            loss_val = 0
            for i in range(n_classes):
                loss_batch = tf.nn.sigmoid_cross_entropy_with_logits(
                    logits=tf.reduce_mean(model.output[:, i], axis=0, keepdims=True),
                    labels=model.y_[:, i])
                loss_mean = tf.reduce_mean(loss_batch)
                loss_val += loss_mean

    # REGULARIZATION
    reg_penalty = regularize(layers=model.layers, l1=config.l1, l2=config.l2,
                             regularize_weights=True, regularize_biases=True)

    # LEARNING RATE (SCHEDULE)
    # If a learning rate schedule is defined, always use MomentumOptimizer and pass the learning rate to the optimizer
    lrs_plateau = False
    if config.get_value("lrs", None) is not None:
        lr_sched_type = config.lrs["type"]
        if lr_sched_type == "plateau":
            lrs_plateau = True
            learning_rate = tf.placeholder(tf.float32, [], name='learning_rate')
            lrs_learning_rate = config.get_value("optimizer_params")["learning_rate"]
            lrs_n_bad_epochs = 0  # counter for the plateau LRS
            lrs_patience = config.lrs["patience"]
            lrs_factor = config.lrs["factor"]
            lrs_threshold = config.lrs["threshold"]
            lrs_mode = config.lrs["mode"]
            lrs_best = -np.inf if lrs_mode == "max" else np.inf
            lrs_is_better = lambda old, new: (new > old * (1 + lrs_threshold)) if lrs_mode == "max" \
                else (new < old * (1 - lrs_threshold))
    else:
        # If no LRS is defined, the default optimizer is used with its configured learning rate
        learning_rate = None

    # LOAD WEIGHTS and get list of trainables if specified
    assign_loaded_variables = None
    trainables = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES)
    if config.get_value("checkpoint", None) is not None:
        with Timer(name="Loading Checkpoint", verbose=True):
            assign_loaded_variables, trainables = tell.load_weights(config.get_value("checkpoint", None),
                                                                    config.get_value("freeze", False),
                                                                    config.get_value("exclude_weights", None),
                                                                    config.get_value("exclude_freeze", None))

    # Update step
    if len(trainables) > 0:
        update, gradients, gradient_name_dict = update_step(loss + reg_penalty, config, tell,
                                                            lr=learning_rate, trainables=trainables)

    # INITIALIZE Tensorflow VARIABLES
    step = tell.initialize_tf_variables().global_step

    # ASSIGN LOADED WEIGHTS (overriding initializations) if available
    if assign_loaded_variables is not None:
        session.run(assign_loaded_variables)

    # -------------------------------------------------------------------------
    # Start training
    # -------------------------------------------------------------------------
    try:
        n_mbs = len(train_loader)
        epoch = int((step * batchsize) / (n_mbs * batchsize))
        epochs = range(epoch, n_epochs)

        if len(trainables) == 0:
            validate(val_loader, n_classes, session, loss_val, prediction_val, model, workspace, step,
                     batchsize, tell)
            return

        print("Epoch: {}/{} (step: {}, nmbs: {}, batchsize: {})".format(epoch + 1, n_epochs, step, n_mbs, batchsize))

        for ep in epochs:
            if ep == 0 and initial_val:
                f1 = validate(val_loader, n_classes, session, loss_val, prediction_val, model, workspace,
                              step, batchsize, tell)
            else:
                if config.has_value("lrs_best") and config.has_value("lrs_learning_rate") \
                        and config.has_value("lrs_n_bad_epochs"):
                    f1 = config.get_value("lrs_f1")
                    lrs_best = config.get_value("lrs_best")
                    lrs_learning_rate = config.get_value("lrs_learning_rate")
                    lrs_n_bad_epochs = config.get_value("lrs_n_bad_epochs")
                else:
                    f1 = 0

            # LRS "plateau": update the scheduler
            if lrs_plateau:
                if lrs_is_better(lrs_best, f1):
                    lrs_best = f1
                    lrs_n_bad_epochs = 0
                else:
                    lrs_n_bad_epochs += 1
                # Update the learning rate
                if lrs_n_bad_epochs > lrs_patience:
                    lrs_learning_rate = max(lrs_learning_rate * lrs_factor, 0)
                    lrs_n_bad_epochs = 0

            with tqdm(total=len(train_loader), desc="Training [{}/{}]".format(ep + 1, len(epochs))) as pbar:
                for mbi, mb in enumerate(train_loader):
                    # LRS "plateau": feed the current learning rate
                    if lrs_plateau:
                        feed_dict = {model.X: mb['input'].numpy(),
                                     model.y_: mb['target'].numpy(),
                                     model.dropout: dropout,
                                     learning_rate: lrs_learning_rate}
                    else:
                        feed_dict = {model.X: mb['input'].numpy(),
                                     model.y_: mb['target'].numpy(),
                                     model.dropout: dropout}

                    # TRAINING
                    pred, loss_train, _ = session.run([prediction, loss, update], feed_dict=feed_dict)

                    # Update status
                    pbar.set_description_str("Training [{}/{}] Loss: {:.4f}".format(ep + 1, len(epochs), loss_train))
                    pbar.update()
                    step += 1

            validate(val_loader, n_classes, session, loss_val, prediction_val, model, workspace, step,
                     batchsize, tell)
    except AbortRun:
        print("Aborting...")
    finally:
        tell.close(global_step=step, save_checkpoint=True)
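
# Hedged sketch of an entry point: the unused "_" argument of main() matches the TF1 tf.app.run
# convention, so the script is presumably launched as shown below. This guard is an assumption
# and not part of the original listing.
if __name__ == "__main__":
    tf.app.run(main=main)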
def __init__(self, config: Config = None, summaries: list = ["training"], model_params=None):
    """
    Take care of initializing a TeLL environment.

    Creates the working directory, instantiates the network architecture, and configures tensorflow and
    tensorboard. Furthermore takes care of resuming runs from an existing workspace.

    :param config: Config
        config object or None; if None, the config will be initialized from command line parameters
    :param summaries: list
        List of names for summary writers; by default one writer named "training" is opened
    :param model_params:
        Optional dictionary of parameters unpacked and passed to the model upon initialization if not None

    :returns:
    tf_session: Tensorflow session
    tf_saver: Tensorflow checkpoint saver
    tf_summaries: dictionary containing tensorflow summary writers, accessible via the names passed upon creation
    model: TeLL model
    step: current global step (0 for new runs, otherwise the step stored in the checkpoint file)
    workspace: TeLL workspace instance
    config: TeLL config object
    """
    if config is None:
        config = Config()

    # Setup working dir
    workspace = Workspace(config.working_dir, config.specs, config.restore)
    print("TeLL workspace: {}".format(workspace.working_dir))

    # Import configured architecture
    architecture = config.import_architecture()

    # Set GPU
    os.environ["CUDA_VISIBLE_DEVICES"] = str(config.get_value("cuda_gpu", "0"))

    # Some Tensorflow configuration
    tf_config = tf.ConfigProto(
        inter_op_parallelism_threads=config.get_value("inter_op_parallelism_threads", 1),
        intra_op_parallelism_threads=config.get_value("intra_op_parallelism_threads", 1),
        log_device_placement=config.get_value("log_device_placement", False)
    )
    tf_config.gpu_options.allow_growth = config.get_value("tf_allow_growth", True)

    # Start Tensorflow session
    print("Starting session...")
    tf_session = tf.Session(config=tf_config)

    # Set tensorflow random seed
    set_seed(config.get_value("random_seed", 12345))

    #
    # Init Tensorboard
    #
    print("Initializing summaries...")
    summary_instances = {}
    for summary in summaries:
        summary_instances[summary] = tf.summary.FileWriter(os.path.join(workspace.get_tensorboard_dir(), summary),
                                                           tf_session.graph)

    # Initialize Model
    if model_params is None:
        model = architecture(config=config)
    else:
        model = architecture(config=config, **model_params)

    # Print number of trainable parameters
    trainable_vars = np.sum([np.prod(t.get_shape()) for t in tf.trainable_variables()])
    print("Number of trainable variables: {}".format(trainable_vars))

    with tf.name_scope("TeLL") as tell_namescope:
        # Store global step in checkpoint
        tf_global_step = tf.Variable(initial_value=tf.constant(0, dtype=tf.int64), name="tell_global_step",
                                     dtype=tf.int64, trainable=False)

        # Define placeholder and operation to dynamically update tf_global_step with a python integer
        global_step_placeholder = tf.placeholder_with_default(tf_global_step, shape=tf_global_step.get_shape())
        set_global_step = tf_global_step.assign(global_step_placeholder)

        #
        # Add ops to save and restore all the variables
        #
        tf_saver = tf.train.Saver(max_to_keep=config.get_value("max_checkpoints", 10), sharded=False)

    # Expose members
    self.tf_session = tf_session
    self.tf_saver = tf_saver
    self.tf_summaries = summary_instances

    if config.has_value("optimizer"):
        if isinstance(config.optimizer, list):
            self.tf_optimizer = [getattr(tf.train, config.optimizer[i])(**config.optimizer_params[i])
                                 for i in range(len(config.optimizer))]
        else:
            self.tf_optimizer = getattr(tf.train, config.optimizer)(**config.optimizer_params)
    else:
        raise Exception("Missing required parameter 'optimizer' "
                        "(either specify it in your config or on the command line)")

    self.model = model
    self.workspace = workspace
    self.config = config
    self.global_step = 0
    self.__global_step_placeholder = global_step_placeholder
    self.__global_step_update = set_global_step
    self.__tell_namescope = tell_namescope
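
    # Hedged usage sketch (mirrors main() above; the model_params key "shape" and the chosen
    # optimizer settings are illustrative, not prescribed by this class):
    #
    #   config excerpt:
    #       "optimizer": "AdamOptimizer",               # resolved via getattr(tf.train, ...)
    #       "optimizer_params": {"learning_rate": 1e-3}
    #
    #   tell = TeLLSession(config=Config(), summaries=["training"],
    #                      model_params={"shape": reader_train.shape})
    #   session, model = tell.tf_session, tell.model
    #   step = tell.initialize_tf_variables().global_step
    #   ...  # build losses and update ops, run training
    #   tell.close(global_step=step, save_checkpoint=True)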