def __init__(self, model: str, weights_path: str = None, target_weights_path: str = None, train: bool = True):
    super().__init__()
    self.daemon = True
    self.terminate = False
    self.__training = train
    self.__halt_training = False
    self.__last_logged_episode = None

    # Inference model; the training and target models are only built in training mode
    self.__model = create_model(model, target_weights_path, False)
    self.__train_model = None
    self.__target_model = None
    logger.info("Model created")
    if not self.__model:
        logger.error("Unable to create models")
        sys.exit(1)

    if train:
        self.__train_model = create_model(model, weights_path, train)
        self.__target_model = create_model(model, target_weights_path, False)
        logger.info("Training models created")
        if not self.__target_model or not self.__train_model:
            logger.error("Unable to create training models")
            sys.exit(1)
        # If only the training weights are given, copy them into the target model
        if weights_path and not target_weights_path:
            self.__override_weights()
            logger.info("Weights synchronized")

    self.__replay_memory = deque(maxlen=settings.REPLAY_MEMORY_SIZE)
    self.__target_update_counter = 0

    if not os.path.exists(f"logs/{self.__model.name}"):
        os.makedirs(f"logs/{self.__model.name}")
    self.tensorboard = TensorBoardCustom(log_dir=f"logs/{self.__model.name}")
    plot_model(self.__model, f"{self.__model.name}.png", show_shapes=True, expand_nested=True)
    logger.info("Trainer initialized")
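# The constructor above calls a private __override_weights() helper whose body is
# not shown in this section. A minimal sketch of what it plausibly does, assuming
# the models are Keras models: copy the freshly loaded training network's weights
# into the target and inference networks (the usual DQN synchronization step).
def __override_weights(self):
    weights = self.__train_model.get_weights()
    self.__target_model.set_weights(weights)
    self.__model.set_weights(weights)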
def model_test_mode(args, feeder, hparams, global_step):
    with tf.variable_scope('VAD_model', reuse=tf.AUTO_REUSE) as scope:
        model_name = args.model
        model = create_model(model_name or args.model, hparams)
        model.initialize(feeder.eval_inputs, feeder.eval_targets,
                         global_step=global_step, is_training=False, is_evaluating=True)
        model.add_loss()
        return model
def model_train_mode(args, feeder, hparams, global_step):
    with tf.variable_scope('VAD_model', reuse=tf.AUTO_REUSE) as scope:
        model_name = args.model
        model = create_model(model_name or args.model, hparams)
        model.initialize(feeder.inputs, feeder.targets, global_step=global_step, is_training=True)
        model.add_loss()
        model.add_optimizer(global_step)
        stats = add_train_stats(model, hparams)
        return model, stats
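# The two helpers above only build the graph. A minimal sketch of how they are
# typically wired together in a TF1-style training script follows; the `feeder`
# object, the model attributes (loss, optimize) and `args.train_steps` are
# illustrative assumptions, not taken from the original source.
global_step = tf.Variable(0, name='global_step', trainable=False)
train_model, train_stats = model_train_mode(args, feeder, hparams, global_step)
eval_model = model_test_mode(args, feeder, hparams, global_step)

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    for _ in range(args.train_steps):
        step, loss, _ = sess.run([global_step, train_model.loss, train_model.optimize])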
def main():
    if TRAIN:
        clf = models.create_model(CLASSIFIER, MODEL_PARAMS)

        print("Loading data...")
        images = data.load_images(limit=LIMIT)
        labels = data.load_labels(limit=LIMIT)

        print("Params:")
        box_size = get_box_parameters(labels)[1:3]
        box_size = box_size[0] - 10, box_size[1] - 10
        print(" ", clf.__class__.__name__)
        print(" ", MODEL_PARAMS)
        print(" ", LIMIT, "images,", NEG_SIZE, "negatives")
        print("  box_size:", box_size)
        print(" ", VECTORIZATION_PARAMS)

        print("Generating negative set...")
        negatives = generate_negative_set(images, labels, set_size=NEG_SIZE, save=SAVE_NEGATIVES)
        all_labels = np.concatenate([labels, negatives])

        print("Creating train & validation sets with negatives...")
        train_labels, valid_labels = data.train_valid_sets(len(images), all_labels, TRAIN_RATE)

        print("Training...")
        models.train(clf, images, box_size, train_labels, **VECTORIZATION_PARAMS)

        if SAVE_MODEL:
            import pdb; pdb.set_trace()
            print("Saving all...")
            to_save = [clf, images, box_size, train_labels, valid_labels]
            with open('./temp_model.pickle', 'wb') as model_file:
                pickle.dump(to_save, model_file)
    else:
        print("Loading all...")
        with open('./temp.pickle', 'rb') as model_file:
            clf, images, box_size, train_labels, valid_labels, predictions = pickle.load(model_file)

    # print("Predicting and validating on test examples...")
    # scores, results = models.predict_and_validate(clf, images, box_size, valid_labels, **VECTORIZATION_PARAMS)

    print("\nPredicting with windows...")
    valid_indexes = np.unique(valid_labels[:, 0]) - 1
    predictions = models.predict(clf, images, box_size, **VECTORIZATION_PARAMS, only=valid_indexes)

    print("\nPredicting with windows and validating...")
    results = validation.rate_predictions(predictions, valid_labels)

    print("Test now!")
    import pdb; pdb.set_trace()
def train(log_dir, args):
    save_dir = os.path.join(log_dir, 'pretrained')
    valid_dir = os.path.join(log_dir, 'valid-dir')
    tensorboard_dir = os.path.join(log_dir, 'events', time_string())
    os.makedirs(save_dir, exist_ok=True)
    os.makedirs(valid_dir, exist_ok=True)
    os.makedirs(tensorboard_dir, exist_ok=True)
    checkpoint_path = os.path.join(save_dir, 'model')
    input_path = os.path.join(args.base_dir, args.training_input)
    valid_path = os.path.join(args.base_dir, args.validation_input)
    log('Checkpoint path: {}'.format(checkpoint_path))
    log('Loading training data from: {}'.format(input_path))
    log('Using model: {}'.format(args.model))

    # Start by setting a seed for repeatability
    tf.random.set_seed(args.random_seed)

    # To find out which devices your operations and tensors are assigned to
    tf.debugging.set_log_device_placement(False)
    gpus = tf.config.experimental.list_physical_devices('GPU')
    for gpu in gpus:
        tf.config.experimental.set_memory_growth(gpu, True)

    # Set up data feeder
    from datasets.feeder import dataset
    train_dataset, valid_dataset, train_steps, valid_steps = dataset(input_path, args)

    # Track the model
    train_summary_writer = tf.summary.create_file_writer(tensorboard_dir)
    valid_summary_writer = tf.summary.create_file_writer(tensorboard_dir)

    # Metrics to measure the loss of the model
    train_loss = tf.keras.metrics.Mean(name='train_loss')
    train_ler = tf.keras.metrics.Mean(name='train_ler')
    train_acc = tf.keras.metrics.SparseCategoricalAccuracy(name='train_acc')
    valid_loss = tf.keras.metrics.Mean(name='valid_loss')
    valid_ler = tf.keras.metrics.Mean(name='valid_ler')
    valid_acc = tf.keras.metrics.SparseCategoricalAccuracy(name='valid_acc')

    # Set up model
    speech_model = create_model(args.model, save_dir, args)

    # Optimizer
    learning_rate = WarmUpSchedule(args.hidden_dim)
    opt = Adam(learning_rate=learning_rate,
               beta_1=args.adam_beta1,
               beta_2=args.adam_beta2,
               epsilon=args.adam_epsilon)

    temp_learning_rate = WarmUpSchedule(args.hidden_dim)
    plt.plot(temp_learning_rate(tf.range(50000, dtype=tf.float32)))
    plt.ylabel("Learning Rate")
    plt.xlabel("Train Step")
    plt.show()

    checkpoint = tf.train.Checkpoint(step=tf.Variable(1), optimizer=opt, las=speech_model.model)
    manager = tf.train.CheckpointManager(checkpoint, directory=save_dir, max_to_keep=5)
    checkpoint_state = tf.train.get_checkpoint_state(save_dir)
    if checkpoint_state and checkpoint_state.model_checkpoint_path:
        log('Loading checkpoint {}'.format(checkpoint_state.model_checkpoint_path), slack=True)
        checkpoint.restore(manager.latest_checkpoint)
    else:
        log('No model to load at {}'.format(save_dir), slack=True)
        log('Starting new training!', slack=True)

    eval_best_loss = np.inf

    summary_list = list()
    speech_model.model.summary(line_length=180, print_fn=lambda x: summary_list.append(x))
    for summary in summary_list:
        log(summary)

    # Book keeping
    patience_count = 0
    time_window = ValueWindow(100)

    train_step_signature = [
        tf.TensorSpec(shape=(None, None, args.num_mels), dtype=tf.float32),
        tf.TensorSpec(shape=(None, None), dtype=tf.int32)
    ]

    @tf.function(input_signature=train_step_signature)
    def train_step(inp, tar):
        # Teacher-forced step: feed the shifted target sequence to the decoder
        tar_inp = tar[:, :-1]
        tar_real = tar[:, 1:]
        with tf.GradientTape() as tape:
            predictions, _, _ = speech_model.model(inp, tar_inp, True)
            loss = loss_function(tar_real, predictions)
        gradients = tape.gradient(loss, speech_model.model.trainable_variables)
        opt.apply_gradients(zip(gradients, speech_model.model.trainable_variables))
        tar_weight = tf.cast(tf.logical_not(tf.math.equal(tar_real, 0)), tf.int32)
        tar_len = tf.reduce_sum(tar_weight, axis=-1)
        ler = label_error_rate(tar_real, predictions, tar_len)
        train_loss(loss)
        train_ler(ler)
        train_acc(tar_real, predictions, sample_weight=tar_weight)

    @tf.function(input_signature=train_step_signature)
    def train_step_non_teacher(inp, tar):
        # Non-teacher-forced step: decode one token at a time, feeding back predictions
        loss = 0.
        output = tf.expand_dims(tar[:, 0], axis=1)
        with tf.GradientTape() as tape:
            for t in range(1, tf.shape(tar)[1]):
                predictions, _, _ = speech_model.model(inp, output, True)
                loss += loss_function(tar[:, t], predictions[:, -1, :])
                tar_weight = tf.cast(tf.logical_not(tf.math.equal(tar[:, t], 0)), tf.int32)
                train_acc(tar[:, t], predictions[:, -1, :], sample_weight=tar_weight)
                # select the last word from the seq_len dimension
                predictions = predictions[:, -1:, :]  # (batch_size, 1, vocab_size)
                predicted_id = tf.cast(tf.argmax(predictions, axis=-1), tf.int32)
                # concatenate the predicted_id to the output which is given to the decoder as its input
                output = tf.concat([output, predicted_id], axis=-1)
            batch_loss = loss / tf.cast(tf.shape(tar)[1] - 1, dtype=tf.float32)
        gradients = tape.gradient(batch_loss, speech_model.model.trainable_variables)
        opt.apply_gradients(zip(gradients, speech_model.model.trainable_variables))
        tar_len = tf.reduce_sum(tf.cast(tf.logical_not(tf.math.equal(tar[:, 1:], 0)), tf.int32), axis=-1)
        ler = label_error_rate(tar[:, 1:], predictions, tar_len)
        train_loss(batch_loss)
        train_ler(ler)

    log('Speech Recognition training set to a maximum of {} epochs'.format(args.train_epochs))
    # checkpoint.step.assign(1)

    # Train
    for epoch in range(args.train_epochs):
        # show the current epoch number
        log("[INFO] starting epoch {}/{}...".format(1 + epoch, args.train_epochs))
        epoch_start = time.time()
        train_loss.reset_states()
        train_ler.reset_states()
        train_acc.reset_states()
        valid_loss.reset_states()
        valid_ler.reset_states()
        valid_acc.reset_states()

        # loop over the data in batch size increments
        for (batch, (input, label)) in enumerate(train_dataset):
            start_time = time.time()

            # take a step
            use_teacher_forcing = random.random() < args.teacher_forcing_ratio
            if use_teacher_forcing:
                train_step(input, label)
            else:
                train_step_non_teacher(input, label)

            # book keeping
            time_window.append(time.time() - start_time)
            message = '[Epoch {:.3f}] [Step {:7d}] [{:.3f} sec/step, loss={:.5f}, ler={:.5f}, acc={:.5f}]'.format(
                epoch + (batch / train_steps), int(checkpoint.step), time_window.average,
                train_loss.result(), train_ler.result(), train_acc.result())
            log(message)
            checkpoint.step.assign_add(1)

            if train_loss.result() > 1e15 or np.isnan(train_loss.result()):
                log('Loss exploded to {:.5f} at step {}'.format(train_loss.result(), int(checkpoint.step)))
                raise Exception('Loss exploded')

            if int(checkpoint.step) % 1000 == 0:
                with train_summary_writer.as_default():
                    tf.summary.scalar('train_loss', train_loss.result(), step=int(checkpoint.step))
                    tf.summary.scalar('train_ler', train_ler.result(), step=int(checkpoint.step))
                    tf.summary.scalar('train_acc', train_acc.result(), step=int(checkpoint.step))

        if (1 + epoch) % args.eval_interval == 0:
            # Run eval and save eval stats
            log('\nRunning evaluation ({} steps) at step {}'.format(valid_steps, int(checkpoint.step)))
            for (batch, (input, label)) in enumerate(valid_dataset):
                # take a step
                valid_logit, align = valid_step_teacher_forcing(
                    input, label, speech_model, valid_loss, valid_ler, valid_acc)
                if batch % (valid_steps // 10) == 0:
                    decoded = np.argmax(valid_logit, axis=-1)
                    decoded = ''.join([index_token[x] for x in decoded])
                    original = ''.join([index_token[x] for x in label.numpy()[0][1:]])
                    log('Original: %s' % original)
                    log('Decoded: %s' % decoded)
                    plot_alignment(align, os.path.join(
                        valid_dir, 'step-{}-align-{}.png'.format(int(checkpoint.step), batch)))

            log('Eval loss & ler for global step {}: {:.3f}, {:.3f}'.format(
                int(checkpoint.step), valid_loss.result(), valid_ler.result()))
            with valid_summary_writer.as_default():
                tf.summary.scalar('valid_loss', valid_loss.result(), step=int(checkpoint.step))
                tf.summary.scalar('valid_ler', valid_ler.result(), step=int(checkpoint.step))
                tf.summary.scalar('valid_acc', valid_acc.result(), step=int(checkpoint.step))

            # Save model and current global step
            save_path = manager.save()
            log("Saved checkpoint for step {}: {}".format(int(checkpoint.step), save_path))

            if valid_loss.result() < eval_best_loss:
                # Save model and current global step
                save_path = manager.save()
                log("Saved checkpoint for step {}: {}".format(int(checkpoint.step), save_path))
                log('Validation loss improved from {:.2f} to {:.2f}'.format(eval_best_loss, valid_loss.result()))
                eval_best_loss = valid_loss.result()
                patience_count = 0
            else:
                patience_count += 1
                log('Patience: {} times'.format(patience_count))
                if patience_count == args.patience:
                    log('Validation loss has not improved for {} evaluations, early stopping'.format(args.patience))
                    log('Training complete after {} global steps!'.format(int(checkpoint.step)), slack=True)
                    return save_dir

        elapsed = (time.time() - epoch_start) / 60.0
        log("one epoch took {:.4} minutes".format(elapsed))

    log('Speech Recognition training complete after {} epochs!'.format(args.train_epochs), slack=True)
    return save_dir
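# Both training loops in this section instantiate (and plot) a WarmUpSchedule
# learning-rate schedule whose definition is not included here. A plausible sketch,
# assuming it follows the standard Transformer schedule (linear warm-up followed by
# inverse square-root decay); the default of 4000 warm-up steps is an assumption.
class WarmUpSchedule(tf.keras.optimizers.schedules.LearningRateSchedule):

    def __init__(self, d_model, warmup_steps=4000):
        super().__init__()
        self.d_model = tf.cast(d_model, tf.float32)
        self.warmup_steps = warmup_steps

    def __call__(self, step):
        # lr = d_model^-0.5 * min(step^-0.5, step * warmup_steps^-1.5)
        step = tf.cast(step, tf.float32)
        arg1 = tf.math.rsqrt(step)
        arg2 = step * (self.warmup_steps ** -1.5)
        return tf.math.rsqrt(self.d_model) * tf.math.minimum(arg1, arg2)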
from modules.negative_set import generate_negative_set, get_box_parameters
from modules import data, models
import numpy as np

from config import (PREDICTION_PATH, MODEL_PATH, TRAIN_PATH, LABEL_PATH,
                    CLASSIFIER, MODEL_PARAMS, BOX_SIZE, KW_PARAMS, GRAY, NEG_SIZE)

clf = models.create_model(CLASSIFIER, MODEL_PARAMS)

print("Loading data...")
images = data.load_images(path=TRAIN_PATH, gray=GRAY)
labels = data.load_labels(path=LABEL_PATH)

print("Generating negative set...")
negatives = generate_negative_set(images, labels, set_size=NEG_SIZE)
all_labels = np.concatenate([labels, negatives])

print("Training...")
models.train(clf, images, BOX_SIZE, all_labels, **KW_PARAMS)

print("Saving model...")
data.save_model(clf, path=MODEL_PATH)
print(f"\nModel trained and saved in {MODEL_PATH}!")
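# Hypothetical inference-side counterpart to the training script above, shown only
# to illustrate how the saved model would be used. data.load_model() is an assumed
# mirror of data.save_model(); the remaining helpers and config names appear
# elsewhere in this section.
from modules import data, models
from config import MODEL_PATH, TRAIN_PATH, BOX_SIZE, KW_PARAMS, GRAY

clf = data.load_model(path=MODEL_PATH)  # assumed counterpart of data.save_model
images = data.load_images(path=TRAIN_PATH, gray=GRAY)
predictions = models.predict(clf, images, BOX_SIZE, **KW_PARAMS)
print(f"{len(predictions)} predictions computed")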
def train(log_dir, args):
    save_dir = os.path.join(log_dir, 'pretrained')
    valid_dir = os.path.join(log_dir, 'valid-dir')
    tensorboard_dir = os.path.join(log_dir, 'events', time_string())
    os.makedirs(save_dir, exist_ok=True)
    os.makedirs(valid_dir, exist_ok=True)
    os.makedirs(tensorboard_dir, exist_ok=True)
    checkpoint_path = os.path.join(save_dir, 'model')
    input_path = os.path.join(args.base_dir, args.training_input)
    valid_path = os.path.join(args.base_dir, args.validation_input)
    log('Checkpoint path: {}'.format(checkpoint_path))
    log('Loading training data from: {}'.format(input_path))
    log('Using model: {}'.format(args.model))

    # Start by setting a seed for repeatability
    tf.random.set_seed(args.random_seed)

    # To find out which devices your operations and tensors are assigned to
    tf.debugging.set_log_device_placement(False)
    gpus = tf.config.experimental.list_physical_devices('GPU')
    for gpu in gpus:
        tf.config.experimental.set_memory_growth(gpu, True)

    # Set up data feeder
    from datasets.feeder import dataset
    train_dataset, valid_dataset, train_steps, valid_steps = dataset(input_path, args)

    # Track the model
    train_summary_writer = tf.summary.create_file_writer(tensorboard_dir)
    valid_summary_writer = tf.summary.create_file_writer(tensorboard_dir)

    # Metrics to measure the loss of the model
    train_loss = tf.keras.metrics.Mean(name='train_loss')
    valid_loss = tf.keras.metrics.Mean(name='valid_loss')
    train_ler = tf.keras.metrics.Mean(name='train_ler')
    valid_ler = tf.keras.metrics.Mean(name='valid_ler')

    # Set up model
    speech_model = create_model(args.model, save_dir, args)

    summary_list = list()
    speech_model.model.summary(line_length=180, print_fn=lambda x: summary_list.append(x))
    for summary in summary_list:
        log(summary)
    tf.keras.utils.plot_model(speech_model.model, os.path.join(log_dir, 'model.png'), show_shapes=True)

    learning_rate = WarmUpSchedule(args.num_units_per_lstm)
    opt = Adam(learning_rate,
               beta_1=args.adam_beta1,
               beta_2=args.adam_beta2,
               epsilon=args.adam_epsilon)

    temp_learning_rate = WarmUpSchedule(args.num_units_per_lstm, int(train_steps * 5))
    plt.plot(temp_learning_rate(tf.range(50000, dtype=tf.float32)))
    plt.ylabel("Learning Rate")
    plt.xlabel("Train Step")
    plt.show()

    checkpoint = tf.train.Checkpoint(step=tf.Variable(1), optimizer=opt, encoder=speech_model.model)
    manager = tf.train.CheckpointManager(checkpoint, directory=save_dir, max_to_keep=5)
    checkpoint_state = tf.train.get_checkpoint_state(save_dir)
    if checkpoint_state and checkpoint_state.model_checkpoint_path:
        log('Loading checkpoint {}'.format(checkpoint_state.model_checkpoint_path), slack=True)
        checkpoint.restore(manager.latest_checkpoint)
    else:
        log('No model to load at {}'.format(save_dir), slack=True)
        log('Starting new training!', slack=True)

    eval_best_loss = np.inf

    # Book keeping
    patience_count = 0
    time_window = ValueWindow(100)

    log('Speech Recognition training set to a maximum of {} epochs'.format(args.train_epochs))

    def create_lengths(input, label):
        # Derive sequence lengths from the zero-padded inputs and labels
        input_lengths = tf.reduce_sum(tf.cast(tf.logical_not(tf.math.equal(input, 0)), tf.int32), axis=-2)
        label_lengths = tf.reduce_sum(tf.cast(tf.logical_not(tf.math.equal(label, 0)), tf.int32), axis=-1)
        return input_lengths[:, 0], label_lengths

    train_step_signature = [
        tf.TensorSpec(shape=(None, None, 80), dtype=tf.float32),
        tf.TensorSpec(shape=(None, None), dtype=tf.int32)
    ]

    @tf.function(input_signature=train_step_signature)
    def train_step(input, label):
        input_len, label_len = create_lengths(input, label)
        with tf.GradientTape() as tape:
            logit = speech_model.model(input, training=True)
            loss = ctc_loss(label, logit, input_len, label_len)
        grads = tape.gradient(loss, speech_model.model.trainable_variables)
        if args.clip_gradients:
            clipped_grads, _ = tf.clip_by_global_norm(grads, args.clip_gradients)
        else:
            clipped_grads = grads
        opt.apply_gradients(zip(clipped_grads, speech_model.model.trainable_variables))
        ler = ctc_label_error_rate(label, logit, input_len, label_len)
        train_loss.update_state(loss)
        train_ler.update_state(ler)

    # Train
    for epoch in range(args.train_epochs):
        # show the current epoch number
        log("[INFO] starting epoch {}/{}...".format(1 + epoch, args.train_epochs))
        epoch_start = time.time()
        train_loss.reset_states()
        train_ler.reset_states()
        valid_loss.reset_states()
        valid_ler.reset_states()

        # loop over the data in batch size increments
        for (batch, (input, label)) in enumerate(train_dataset):
            start_time = time.time()

            # take a step
            train_step(input, label)

            # book keeping
            time_window.append(time.time() - start_time)
            message = '[Epoch {:.3f}] [Step {:7d}] [{:.3f} sec/step, loss={:.5f}, ler={:.5f}]'.format(
                epoch + (batch / train_steps), int(checkpoint.step), time_window.average,
                train_loss.result(), train_ler.result())
            log(message)
            checkpoint.step.assign_add(1)

            if train_loss.result() > 1e15 or np.isnan(train_loss.result()):
                log('Loss exploded to {:.5f} at step {}'.format(train_loss.result(), int(checkpoint.step)))
                raise Exception('Loss exploded')

            if int(checkpoint.step) % 1000 == 0:
                with train_summary_writer.as_default():
                    tf.summary.scalar('train_loss', train_loss.result(), step=int(checkpoint.step))
                    tf.summary.scalar('train_ler', train_ler.result(), step=int(checkpoint.step))

        if (1 + epoch) % args.eval_interval == 0:
            # Run eval and save eval stats
            log('\nRunning evaluation at epoch {}'.format(epoch))
            for (batch, (input, label)) in enumerate(valid_dataset):
                input_len, label_len = create_lengths(input, label)
                # take a step
                valid_logit = valid_step(input, label, input_len, label_len, speech_model, valid_loss, valid_ler)
                if batch % (valid_steps // 10) == 0:
                    decoded = greedy_decode(
                        tf.expand_dims(valid_logit[0], axis=0),
                        tf.expand_dims(input_len[0], axis=-1)[tf.newaxis, ...])
                    decoded = ''.join([index_token[x] for x in decoded])
                    original = ''.join([index_token[x] for x in label.numpy()[0]])
                    log('Original: %s' % original)
                    log('Decoded: %s' % decoded)

            log('Eval loss & ler for global step {}: {:.3f}, {:.3f}'.format(
                int(checkpoint.step), valid_loss.result(), valid_ler.result()))
            with valid_summary_writer.as_default():
                tf.summary.scalar('valid_loss', valid_loss.result(), step=int(checkpoint.step))
                tf.summary.scalar('valid_ler', valid_ler.result(), step=int(checkpoint.step))

            # Save model and current global step
            save_path = manager.save()
            log("Saved checkpoint for step {}: {}".format(int(checkpoint.step), save_path))

            if valid_loss.result() < eval_best_loss:
                # Save model and current global step
                save_path = manager.save()
                log("Saved checkpoint for step {}: {}".format(int(checkpoint.step), save_path))
                log('Validation loss improved from {:.2f} to {:.2f}'.format(eval_best_loss, valid_loss.result()))
                eval_best_loss = valid_loss.result()
                patience_count = 0
            else:
                patience_count += 1
                log('Patience: {} times'.format(patience_count))
                if patience_count == args.patience:
                    log('Validation loss has not improved for {} evaluations, early stopping'.format(args.patience))
                    log('Training complete after {} global steps!'.format(int(checkpoint.step)), slack=True)
                    return save_dir

        elapsed = (time.time() - epoch_start) / 60.0
        log("one epoch took {:.4} minutes".format(elapsed))

    log('Speech Recognition training complete after {} epochs!'.format(args.train_epochs), slack=True)
    return save_dir
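# The CTC training step above relies on ctc_loss and ctc_label_error_rate helpers
# that are not defined in this section. A minimal sketch of the loss helper, built
# directly on tf.nn.ctc_loss and assuming batch-major logits, zero-padded dense
# labels and class 0 as the CTC blank (all assumptions about the surrounding project):
def ctc_loss(labels, logits, input_lengths, label_lengths):
    loss = tf.nn.ctc_loss(labels=labels,
                          logits=logits,
                          label_length=label_lengths,
                          logit_length=input_lengths,
                          logits_time_major=False,
                          blank_index=0)
    # Average the per-utterance losses over the batch
    return tf.reduce_mean(loss)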