def __init__(self,
                 model: str,
                 weights_path: str = None,
                 target_weights_path: str = None,
                 train: bool = True):
        super().__init__()
        self.daemon = True
        self.terminate = False
        self.__training = train
        self.__halt_training = False

        self.__last_logged_episode = None

        self.__model = create_model(model, target_weights_path, False)
        self.__train_model = None
        self.__target_model = None

        logger.info("Model created")
        if not self.__model:
            logger.error("Unable to create models")
            sys.exit(1)

        if train:
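            # Training mode additionally builds an online (train) model, a target model,
            # a replay buffer and a TensorBoard writer; inference only needs the single
            # model created above.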
            self.__train_model = create_model(model, weights_path, train)
            self.__target_model = create_model(model, target_weights_path,
                                               False)
            logger.info("Training models created")

            if not self.__target_model or not self.__train_model:
                logger.error("Unable to create training models")
                sys.exit(1)

            if weights_path and not target_weights_path:
                self.__override_weights()
                logger.info("Weights synchronized")

            self.__replay_memory = deque(maxlen=settings.REPLAY_MEMORY_SIZE)
            self.__target_update_counter = 0

            os.makedirs(f"logs/{self.__model.name}", exist_ok=True)
            self.tensorboard = TensorBoardCustom(
                log_dir=f"logs/{self.__model.name}")

        plot_model(self.__model,
                   f"{self.__model.name}.png",
                   show_shapes=True,
                   expand_nested=True)
        logger.info("Trainer initialized")
Example #2
def model_test_mode(args, feeder, hparams, global_step):
	with tf.variable_scope('VAD_model', reuse=tf.AUTO_REUSE) as scope:
		model_name = args.model
		model = create_model(model_name, hparams)
		model.initialize(feeder.eval_inputs, feeder.eval_targets, global_step=global_step, is_training=False, is_evaluating=True)
		model.add_loss()
		return model
Example #3
def model_train_mode(args, feeder, hparams, global_step):
	with tf.variable_scope('VAD_model', reuse=tf.AUTO_REUSE) as scope:
		model_name = args.model
		model = create_model(model_name, hparams)
		model.initialize(feeder.inputs, feeder.targets, global_step=global_step, is_training=True)
		model.add_loss()
		model.add_optimizer(global_step)
		stats = add_train_stats(model, hparams)
		return model, stats
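A sketch of how these two helpers are typically wired together by the surrounding TF1-style training script; feeder, hparams and args are assumed to come from that script:

# Hypothetical wiring; feeder, hparams and args are provided by the surrounding script.
global_step = tf.Variable(0, name='global_step', trainable=False)
model, stats = model_train_mode(args, feeder, hparams, global_step)
eval_model = model_test_mode(args, feeder, hparams, global_step)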
Example #4
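# NOTE: this snippet assumes module-level imports and config constants defined
# earlier in the original script (numpy as np, pickle, the project's data, models
# and validation modules, plus TRAIN, LIMIT, NEG_SIZE, MODEL_PARAMS, and so on).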
def main():
	if TRAIN:
		clf = models.create_model(CLASSIFIER, MODEL_PARAMS)

		print("Loading data...")
		images = data.load_images(limit=LIMIT)
		labels = data.load_labels(limit=LIMIT)
		
		print("Params:")
		box_size = get_box_parameters(labels)[1:3]
		box_size = box_size[0] - 10, box_size[1] - 10
		print(" ", clf.__class__.__name__)
		print(" ", MODEL_PARAMS)
		print(" ", LIMIT, "images,", NEG_SIZE, "negatives")
		print("  box_size:", box_size)
		print(" ", VECTORIZATION_PARAMS)


		print("Generating negative set...")
		negatives = generate_negative_set(images, labels, set_size=NEG_SIZE, save=SAVE_NEGATIVES)
		all_labels = np.concatenate([labels, negatives])

		print("Creating train & validation sets with negatives...")
		train_labels, valid_labels = data.train_valid_sets(len(images), all_labels, TRAIN_RATE)

		print("Training...")
		models.train(clf, images, box_size, train_labels, **VECTORIZATION_PARAMS)

		if SAVE_MODEL:
			print("Saving model and training data...")
			to_save = [clf, images, box_size, train_labels, valid_labels]
			with open('./temp_model.pickle', 'wb') as model_file:
				pickle.dump(to_save, model_file)

	else:
		print("Loading model and training data...")
		with open('./temp_model.pickle', 'rb') as model_file:
			clf, images, box_size, train_labels, valid_labels = pickle.load(model_file)


	# print("Predicting and validate on test examples...")
	# scores, results = models.predict_and_validate(clf, images, box_size, valid_labels, **VECTORIZATION_PARAMS)

	print("\nPredicting with windows...")
	valid_indexes = np.unique(valid_labels[:,0]) - 1
	predictions = models.predict(clf, images, box_size, **VECTORIZATION_PARAMS, only=valid_indexes)

	print("\nPredicting with windows and validate...")
	results = validation.rate_predictions(predictions, valid_labels)

	print("Test now !")
	import pdb; pdb.set_trace()
Example #5
def train(log_dir, args):
    save_dir = os.path.join(log_dir, 'pretrained')
    valid_dir = os.path.join(log_dir, 'valid-dir')
    tensorboard_dir = os.path.join(log_dir, 'events', time_string())
    os.makedirs(save_dir, exist_ok=True)
    os.makedirs(valid_dir, exist_ok=True)
    os.makedirs(tensorboard_dir, exist_ok=True)

    checkpoint_path = os.path.join(save_dir, 'model')
    input_path = os.path.join(args.base_dir, args.training_input)
    valid_path = os.path.join(args.base_dir, args.validation_input)

    log('Checkpoint path: {}'.format(checkpoint_path))
    log('Loading training data from: {}'.format(input_path))
    log('Using model: {}'.format(args.model))

    # Start by setting a seed for repeatability
    tf.random.set_seed(args.random_seed)

    # To find out which devices your operations and tensors are assigned to
    tf.debugging.set_log_device_placement(False)

    gpus = tf.config.experimental.list_physical_devices('GPU')
    for gpu in gpus:
        tf.config.experimental.set_memory_growth(gpu, True)

    # Set up data feeder
    from datasets.feeder import dataset
    train_dataset, valid_dataset, train_steps, valid_steps = dataset(
        input_path, args)

    # Track the model
    train_summary_writer = tf.summary.create_file_writer(tensorboard_dir)
    valid_summary_writer = tf.summary.create_file_writer(tensorboard_dir)

    # metrics to measure the loss of the model
    train_loss = tf.keras.metrics.Mean(name='train_loss')
    train_ler = tf.keras.metrics.Mean(name='train_ler')
    train_acc = tf.keras.metrics.SparseCategoricalAccuracy(name='train_acc')
    valid_loss = tf.keras.metrics.Mean(name='valid_loss')
    valid_ler = tf.keras.metrics.Mean(name='valid_ler')
    valid_acc = tf.keras.metrics.SparseCategoricalAccuracy(name='valid_acc')

    # Set up model
    speech_model = create_model(args.model, save_dir, args)

    # Optimizer
    learning_rate = WarmUpSchedule(args.hidden_dim)
    opt = Adam(learning_rate=learning_rate,
               beta_1=args.adam_beta1,
               beta_2=args.adam_beta2,
               epsilon=args.adam_epsilon)

    temp_learning_rate = WarmUpSchedule(args.hidden_dim)
    plt.plot(temp_learning_rate(tf.range(50000, dtype=tf.float32)))
    plt.ylabel("Learning Rate")
    plt.xlabel("Train Step")
    plt.show()
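    # The plot above is purely diagnostic: it previews the warmup schedule, and with
    # the default interactive matplotlib backend plt.show() blocks until the window is closed.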

    checkpoint = tf.train.Checkpoint(step=tf.Variable(1),
                                     optimizer=opt,
                                     las=speech_model.model)
    manager = tf.train.CheckpointManager(checkpoint,
                                         directory=save_dir,
                                         max_to_keep=5)

    checkpoint_state = tf.train.get_checkpoint_state(save_dir)
    if checkpoint_state and checkpoint_state.model_checkpoint_path:
        log('Loading checkpoint {}'.format(
            checkpoint_state.model_checkpoint_path),
            slack=True)
        checkpoint.restore(manager.latest_checkpoint)
    else:
        log('No model to load at {}'.format(save_dir), slack=True)
        log('Starting new training!', slack=True)
    eval_best_loss = np.inf

    summary_list = list()
    speech_model.model.summary(line_length=180,
                               print_fn=lambda x: summary_list.append(x))
    for summary in summary_list:
        log(summary)

    # Book keeping
    patience_count = 0
    time_window = ValueWindow(100)

    train_step_signature = [
        tf.TensorSpec(shape=(None, None, args.num_mels), dtype=tf.float32),
        tf.TensorSpec(shape=(None, None), dtype=tf.int32)
    ]
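    # The shape-agnostic (None-dimension) signature above lets tf.function trace the
    # training step once instead of retracing for every new padded batch shape.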

    @tf.function(input_signature=train_step_signature)
    def train_step(inp, tar):
        tar_inp = tar[:, :-1]
        tar_real = tar[:, 1:]

        with tf.GradientTape() as tape:
            predictions, _, _ = speech_model.model(inp, tar_inp, True)
            loss = loss_function(tar_real, predictions)

        gradients = tape.gradient(loss, speech_model.model.trainable_variables)
        opt.apply_gradients(
            zip(gradients, speech_model.model.trainable_variables))

        tar_weight = tf.cast(tf.logical_not(tf.math.equal(tar_real, 0)),
                             tf.int32)
        tar_len = tf.reduce_sum(tar_weight, axis=-1)
        ler = label_error_rate(tar_real, predictions, tar_len)

        train_loss(loss)
        train_ler(ler)
        train_acc(tar_real, predictions, sample_weight=tar_weight)

    @tf.function(input_signature=train_step_signature)
    def train_step_non_teacher(inp, tar):
        loss = 0.
        output = tf.expand_dims(tar[:, 0], axis=1)
        with tf.GradientTape() as tape:
            for t in range(1, tf.shape(tar)[1]):
                predictions, _, _ = speech_model.model(inp, output, True)

                loss += loss_function(tar[:, t], predictions[:, -1, :])
                tar_weight = tf.cast(
                    tf.logical_not(tf.math.equal(tar[:, t], 0)), tf.int32)
                train_acc(tar[:, t],
                          predictions[:, -1, :],
                          sample_weight=tar_weight)
                # select the last word from the seq_len dimension
                predictions = predictions[:,
                                          -1:, :]  # (batch_size, 1, vocab_size)
                predicted_id = tf.cast(tf.argmax(predictions, axis=-1),
                                       tf.int32)
                # concatentate the predicted_id to the output which is given to the decoder
                # as its input.
                output = tf.concat([output, predicted_id], axis=-1)

        batch_loss = (loss / tf.cast(tf.shape(tar)[1] - 1, dtype=tf.float32))
        gradients = tape.gradient(batch_loss,
                                  speech_model.model.trainable_variables)
        opt.apply_gradients(
            zip(gradients, speech_model.model.trainable_variables))

        tar_len = tf.reduce_sum(tf.cast(
            tf.logical_not(tf.math.equal(tar[:, 1:], 0)), tf.int32),
                                axis=-1)
        ler = label_error_rate(tar[:, 1:], predictions, tar_len)

        train_loss(batch_loss)
        train_ler(ler)

    log('Speech Recognition training set to a maximum of {} epochs'.format(
        args.train_epochs))

    # checkpoint.step.assign(1)
    # Train
    for epoch in range(args.train_epochs):
        # show the current epoch number
        log("[INFO] starting epoch {}/{}...".format(1 + epoch,
                                                    args.train_epochs))
        epochStart = time.time()

        train_loss.reset_states()
        train_ler.reset_states()
        train_acc.reset_states()
        valid_loss.reset_states()
        valid_ler.reset_states()
        valid_acc.reset_states()

        # loop over the data in batch size increments
        for (batch, (input, label)) in enumerate(train_dataset):
            start_time = time.time()
            # take a step
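            # Scheduled sampling: with probability args.teacher_forcing_ratio feed the
            # ground-truth tokens, otherwise feed back the model's own predictions.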
            use_teacher_forcing = random.random() < args.teacher_forcing_ratio
            if use_teacher_forcing:
                train_step(input, label)
                # train_step_teacher_forcing(input, label, speech_model, train_loss, train_ler, args, opt)
            else:
                train_step_non_teacher(input, label)
                # train_step_non_teacher_forcing(input, label, speech_model, train_loss, train_ler, args, opt)
            # book keeping
            time_window.append(time.time() - start_time)
            message = '[Epoch {:.3f}] [Step {:7d}] [{:.3f} sec/step, loss={:.5f}, ler={:.5f}, acc={:.5f}]'.format(
                epoch + (batch / train_steps), int(checkpoint.step),
                time_window.average, train_loss.result(), train_ler.result(),
                train_acc.result())

            log(message)
            checkpoint.step.assign_add(1)

            if train_loss.result() > 1e15 or np.isnan(train_loss.result()):
                log('Loss exploded to {:.5f} at step {}'.format(
                    train_loss.result(), int(checkpoint.step)))
                raise Exception('Loss exploded')

            if int(checkpoint.step) % 1000 == 0:
                with train_summary_writer.as_default():
                    tf.summary.scalar('train_loss',
                                      train_loss.result(),
                                      step=int(checkpoint.step))
                    tf.summary.scalar('train_ler',
                                      train_ler.result(),
                                      step=int(checkpoint.step))
                    tf.summary.scalar('train_acc',
                                      train_acc.result(),
                                      step=int(checkpoint.step))

        if (1 + epoch) % args.eval_interval == 0:
            # Run eval and save eval stats
            log('\nRunning evaluation ({} steps) at step {}'.format(
                valid_steps, int(checkpoint.step)))
            for (batch, (input, label)) in enumerate(valid_dataset):
                # take a step
                valid_logit, align = valid_step_teacher_forcing(
                    input, label, speech_model, valid_loss, valid_ler,
                    valid_acc)
                if batch % (valid_steps // 10) == 0:
                    decoded = np.argmax(valid_logit, axis=-1)
                    decoded = ''.join([index_token[x] for x in decoded])
                    original = ''.join(
                        [index_token[x] for x in label.numpy()[0][1:]])
                    log('Original: %s' % original)
                    log('Decoded: %s' % decoded)
                    plot_alignment(
                        align,
                        os.path.join(
                            valid_dir, 'step-{}-align-{}.png'.format(
                                int(checkpoint.step), batch)))

            log('Eval loss & ler for global step {}: {:.3f}, {:.3f}'.format(
                int(checkpoint.step), valid_loss.result(), valid_ler.result()))
            with valid_summary_writer.as_default():
                tf.summary.scalar('valid_loss',
                                  valid_loss.result(),
                                  step=int(checkpoint.step))
                tf.summary.scalar('valid_ler',
                                  valid_ler.result(),
                                  step=int(checkpoint.step))
                tf.summary.scalar('valid_acc',
                                  valid_acc.result(),
                                  step=int(checkpoint.step))

            # Save model and current global step
            save_path = manager.save()
            log("Saved checkpoint for step {}: {}".format(
                int(checkpoint.step), save_path))

            if valid_loss.result() < eval_best_loss:
                # Save model and current global step
                save_path = manager.save()
                log("Saved checkpoint for step {}: {}".format(
                    int(checkpoint.step), save_path))
                log('Validation loss improved from {:.2f} to {:.2f}'.format(
                    eval_best_loss, valid_loss.result()))
                eval_best_loss = valid_loss.result()
                patience_count = 0
            else:
                patience_count += 1
                log('Patience: {} times'.format(patience_count))
                if patience_count == args.patience:
                    log('Validation loss has not been improved for {} times, early stopping'
                        .format(args.patience))
                    log('Training complete after {} global steps!'.format(
                        int(checkpoint.step)),
                        slack=True)
                    return save_dir

        elapsed = (time.time() - epochStart) / 60.0
        log("one epoch took {:.4} minutes".format(elapsed))

    log('Separation training complete after {} epochs!'.format(
        args.train_epochs),
        slack=True)

    return save_dir
Example #6
from modules.negative_set import generate_negative_set, get_box_parameters
from modules import data, models
import numpy as np
from config import (PREDICTION_PATH, MODEL_PATH, TRAIN_PATH, LABEL_PATH,
                    CLASSIFIER, MODEL_PARAMS, BOX_SIZE, KW_PARAMS, GRAY,
                    NEG_SIZE)

clf = models.create_model(CLASSIFIER, MODEL_PARAMS)

print("Loading data...")
images = data.load_images(path=TRAIN_PATH, gray=GRAY)
labels = data.load_labels(path=LABEL_PATH)

print("Generating negative set...")
negatives = generate_negative_set(images, labels, set_size=NEG_SIZE)
all_labels = np.concatenate([labels, negatives])

print("Training...")
models.train(clf, images, BOX_SIZE, all_labels, **KW_PARAMS)

print("Saving model...")
data.save_model(clf, path=MODEL_PATH)

print(f"\nModel trained and saved in {MODEL_PATH} !")
Example #7
def train(log_dir, args):
    save_dir = os.path.join(log_dir, 'pretrained')
    valid_dir = os.path.join(log_dir, 'valid-dir')
    tensorboard_dir = os.path.join(log_dir, 'events', time_string())
    os.makedirs(save_dir, exist_ok=True)
    os.makedirs(valid_dir, exist_ok=True)
    os.makedirs(tensorboard_dir, exist_ok=True)

    checkpoint_path = os.path.join(save_dir, 'model')
    input_path = os.path.join(args.base_dir, args.training_input)
    valid_path = os.path.join(args.base_dir, args.validation_input)

    log('Checkpoint path: {}'.format(checkpoint_path))
    log('Loading training data from: {}'.format(input_path))
    log('Using model: {}'.format(args.model))

    # Start by setting a seed for repeatability
    tf.random.set_seed(args.random_seed)

    # To find out which devices your operations and tensors are assigned to
    tf.debugging.set_log_device_placement(False)

    gpus = tf.config.experimental.list_physical_devices('GPU')
    for gpu in gpus:
        tf.config.experimental.set_memory_growth(gpu, True)

    # Set up data feeder
    from datasets.feeder import dataset
    train_dataset, valid_dataset, train_steps, valid_steps = dataset(
        input_path, args)

    # Track the model
    train_summary_writer = tf.summary.create_file_writer(tensorboard_dir)
    valid_summary_writer = tf.summary.create_file_writer(tensorboard_dir)

    # metrics to measure the loss of the model
    train_loss = tf.keras.metrics.Mean(name='train_loss')
    valid_loss = tf.keras.metrics.Mean(name='valid_loss')
    train_ler = tf.keras.metrics.Mean(name='train_ler')
    valid_ler = tf.keras.metrics.Mean(name='valid_ler')

    # Set up model
    speech_model = create_model(args.model, save_dir, args)

    summary_list = list()
    speech_model.model.summary(line_length=180,
                               print_fn=lambda x: summary_list.append(x))
    for summary in summary_list:
        log(summary)

    tf.keras.utils.plot_model(speech_model.model,
                              os.path.join(log_dir, 'model.png'),
                              show_shapes=True)

    learning_rate = WarmUpSchedule(args.num_units_per_lstm)
    opt = Adam(learning_rate,
               beta_1=args.adam_beta1,
               beta_2=args.adam_beta2,
               epsilon=args.adam_epsilon)

    temp_learning_rate = WarmUpSchedule(args.num_units_per_lstm,
                                        int(train_steps * 5))
    plt.plot(temp_learning_rate(tf.range(50000, dtype=tf.float32)))
    plt.ylabel("Learning Rate")
    plt.xlabel("Train Step")
    plt.show()

    checkpoint = tf.train.Checkpoint(step=tf.Variable(1),
                                     optimizer=opt,
                                     encoder=speech_model.model)
    manager = tf.train.CheckpointManager(checkpoint,
                                         directory=save_dir,
                                         max_to_keep=5)

    checkpoint_state = tf.train.get_checkpoint_state(save_dir)
    if checkpoint_state and checkpoint_state.model_checkpoint_path:
        log('Loading checkpoint {}'.format(
            checkpoint_state.model_checkpoint_path),
            slack=True)
        checkpoint.restore(manager.latest_checkpoint)
    else:
        log('No model to load at {}'.format(save_dir), slack=True)
        log('Starting new training!', slack=True)
    eval_best_loss = np.inf

    # Book keeping
    patience_count = 0
    time_window = ValueWindow(100)

    log('Speech Recognition training set to a maximum of {} epochs'.format(
        args.train_epochs))

    def create_lengths(input, label):
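        # Count non-padding (zero-valued) input frames and label tokens to recover
        # per-example lengths; the CTC loss below needs both.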
        input_lengths = tf.reduce_sum(tf.cast(
            tf.logical_not(tf.math.equal(input, 0)), tf.int32),
                                      axis=-2)
        label_lengths = tf.reduce_sum(tf.cast(
            tf.logical_not(tf.math.equal(label, 0)), tf.int32),
                                      axis=-1)
        return input_lengths[:, 0], label_lengths

    train_step_signature = [
        tf.TensorSpec(shape=(None, None, 80), dtype=tf.float32),
        tf.TensorSpec(shape=(None, None), dtype=tf.int32)
    ]

    @tf.function(input_signature=train_step_signature)
    def train_step(input, label):
        input_len, label_len = create_lengths(input, label)
        with tf.GradientTape() as tape:
            logit = speech_model.model(input, training=True)
            loss = ctc_loss(label, logit, input_len, label_len)
        grads = tape.gradient(loss, speech_model.model.trainable_variables)
        if args.clip_gradients:
            clipped_grads, _ = tf.clip_by_global_norm(grads,
                                                      args.clip_gradients)
        else:
            clipped_grads = grads
        opt.apply_gradients(
            zip(clipped_grads, speech_model.model.trainable_variables))
        ler = ctc_label_error_rate(label, logit, input_len, label_len)
        train_loss.update_state(loss)
        train_ler.update_state(ler)

    # Train
    for epoch in range(args.train_epochs):
        # show the current epoch number
        log("[INFO] starting epoch {}/{}...".format(1 + epoch,
                                                    args.train_epochs))
        epochStart = time.time()

        train_loss.reset_states()
        train_ler.reset_states()
        valid_loss.reset_states()
        valid_ler.reset_states()

        # loop over the data in batch size increments
        for (batch, (input, label)) in enumerate(train_dataset):
            start_time = time.time()
            # take a step
            train_step(input, label)
            # book keeping
            time_window.append(time.time() - start_time)
            message = '[Epoch {:.3f}] [Step {:7d}] [{:.3f} sec/step, loss={:.5f}, ler={:.5f}]'.format(
                epoch + (batch / train_steps), int(checkpoint.step),
                time_window.average, train_loss.result(), train_ler.result())

            log(message)
            checkpoint.step.assign_add(1)

            if train_loss.result() > 1e15 or np.isnan(train_loss.result()):
                log('Loss exploded to {:.5f} at step {}'.format(
                    train_loss.result(), int(checkpoint.step)))
                raise Exception('Loss exploded')

            if int(checkpoint.step) % 1000 == 0:
                with train_summary_writer.as_default():
                    tf.summary.scalar('train_loss',
                                      train_loss.result(),
                                      step=int(checkpoint.step))
                    tf.summary.scalar('train_ler',
                                      train_ler.result(),
                                      step=int(checkpoint.step))

        if (1 + epoch) % args.eval_interval == 0:
            # Run eval and save eval stats
            log('\nRunning evaluation at epoch {}'.format(epoch))
            for (batch, (input, label)) in enumerate(valid_dataset):
                input_len, label_len = create_lengths(input, label)
                # take a step
                valid_logit = valid_step(input, label, input_len, label_len,
                                         speech_model, valid_loss, valid_ler)
                if batch % (valid_steps // 10) == 0:
                    decoded = greedy_decode(
                        tf.expand_dims(valid_logit[0], axis=0),
                        tf.expand_dims(input_len[0], axis=-1)[tf.newaxis, ...])
                    decoded = ''.join([index_token[x] for x in decoded])
                    original = ''.join(
                        [index_token[x] for x in label.numpy()[0]])
                    log('Original: %s' % original)
                    log('Decoded: %s' % decoded)

            log('Eval loss & ler for global step {}: {:.3f}, {:.3f}'.format(
                int(checkpoint.step), valid_loss.result(), valid_ler.result()))

            with valid_summary_writer.as_default():
                tf.summary.scalar('valid_loss',
                                  valid_loss.result(),
                                  step=int(checkpoint.step))
                tf.summary.scalar('valid_ler',
                                  valid_ler.result(),
                                  step=int(checkpoint.step))

            # Save model and current global step
            save_path = manager.save()
            log("Saved checkpoint for step {}: {}".format(
                int(checkpoint.step), save_path))

            if valid_loss.result() < eval_best_loss:
                # Save model and current global step
                save_path = manager.save()
                log("Saved checkpoint for step {}: {}".format(
                    int(checkpoint.step), save_path))
                log('Validation loss improved from {:.2f} to {:.2f}'.format(
                    eval_best_loss, valid_loss.result()))
                eval_best_loss = valid_loss.result()
                patience_count = 0
            else:
                patience_count += 1
                log('Patience: {} times'.format(patience_count))
                if patience_count == args.patience:
                    log('Validation loss has not been improved for {} times, early stopping'
                        .format(args.patience))
                    log('Training complete after {} global steps!'.format(
                        int(checkpoint.step)),
                        slack=True)
                    return save_dir

        elapsed = (time.time() - epochStart) / 60.0
        log("one epoch took {:.4} minutes".format(elapsed))

    log('Separation training complete after {} epochs!'.format(
        args.train_epochs),
        slack=True)

    return save_dir