def __init__(self, data_path):
    """
    Prepares the Motor dataset location. See base class.

    The folder is created through create_dir and, when is_downloaded
    reports the data as missing, the file at DATA_URL is retrieved into
    the path given by get_data_path.

    Args:
        data_path: Location handed to the base class and used to hold
            the downloaded data.
    """
    super(MotorSerialize, self).__init__(data_path)
    create_dir(data_path)

    # Nothing left to do when the data is already in place
    if is_downloaded(data_path):
        return

    logger.info('Downloading Motor dataset ...')
    destination = get_data_path(data_path)
    urllib.request.urlretrieve(DATA_URL, destination)
def __init__(self, model_dir, models, outputs, loss_fn,
             l1_reg=None, l2_reg=None, clip_gradient=None):
    """
    Stores the model configuration and validates models and loss.

    Args:
        model_dir: Folder for the model. It is created via create_dir.
        models: Collection of models. The names they report through
            get_name() must all be different.
        outputs: Kept as-is in the instance.
        loss_fn: Loss to use. Must be an instance of LossFn.
        l1_reg: Kept as-is in the instance (None by default).
        l2_reg: Kept as-is in the instance (None by default).
        clip_gradient: Kept as-is in the instance (None by default).

    Raises:
        ValueError: If two models share a name or if loss_fn is not a
            LossFn instance.
    """
    self._model_dir = model_dir
    create_dir(model_dir)

    self._models = models
    self._outputs = outputs
    self._l1_reg = l1_reg
    self._l2_reg = l2_reg
    self._clip_grad = clip_gradient

    # Names must not repeat among the provided models
    names = [model.get_name() for model in models]
    distinct_names = np.unique(names)
    if len(distinct_names) != len(models):
        raise ValueError('Models provided should have unique names')

    # Report every model received
    for model in self._models:
        logger.info('Provided model "%s"' % model.get_name())

    # Reject anything that is not a proper loss before storing it
    if not isinstance(loss_fn, LossFn):
        raise ValueError("Loss must be a subclass of LossFn")
    self._loss_fn = loss_fn
def serialize(self, output_folder, train_ratio, val_ratio, num_threads,
              train_shards, val_shards, test_shards):
    """
    Serializes the data into a Tensorflow recommended Example proto
    format.

    Args:
        output_folder: Output folder for the record files. It must not
            exist beforehand.
        train_ratio: Ratio of instances in the training data. If
            original dataset already split, this is not used.
        val_ratio: Ratio of instances in the validation data.
        num_threads: Threads to use.
        train_shards: Number of files the training set will be split
            in. Must be divisible by the number of threads.
        val_shards: Number of slices the validation set will be split
            in. Must be divisible by the number of threads.
        test_shards: Number of slices the testing set will be split in.
            Must be divisible by the number of threads.

    Raises:
        ValueError: If a ratio lies outside [0, 1], if both ratios add
            up to 1 or more, or if the output folder already exists.
    """
    logger.info("Trying to create dataset into %s" % output_folder)

    # Chained comparisons also reject NaN ratios
    if not 0.0 <= train_ratio <= 1.0:
        raise ValueError('Training ratio must be in interval [0, 1]')
    if not 0.0 <= val_ratio <= 1.0:
        raise ValueError('Validation ratio must be in interval [0, 1]')
    # The sum must stay strictly below 1
    if train_ratio + val_ratio >= 1.0:
        raise ValueError('Training and validation ratio exceed 1')

    if os.path.exists(output_folder):
        raise ValueError('Dataset already exists!')
    create_dir(output_folder)

    # Read dataset
    self.settings.initialize()
    try:
        # Split according to validation preferences
        logger.info('Splitting into training and validation')
        train, val, test = \
            self.settings.get_validation_indices(train_ratio, val_ratio)

        # Create training, validation and test files, in that order
        self._store_dataset(train, output_folder, train_shards,
                            num_threads, do.DataMode.TRAINING)
        self._store_dataset(val, output_folder, val_shards,
                            num_threads, do.DataMode.VALIDATION)
        self._store_dataset(test, output_folder, test_shards,
                            num_threads, do.DataMode.TEST)

        # Store settings
        self._store_options(output_folder,
                            n_training_instances=len(train),
                            train_ratio=train_ratio,
                            val_ratio=val_ratio)
    finally:
        # Free resources, if any, even when a step above failed
        self.settings.finalize()
def __init__(self, data_path):
    """
    Prepares the Australian dataset location. See base class.

    Args:
        data_path: Location handed to the base class and used to hold
            the downloaded data.
    """
    super(AusSerialize, self).__init__(data_path)
    create_dir(data_path)

    # Fetch the data only when it is not there yet
    already_present = is_downloaded(data_path)
    if already_present:
        return

    logger.info('Downloading Australian dataset ...')
    download(DATA_URL, get_data_path(data_path))
def __init__(self, data_path):
    """
    Prepares the CIFAR dataset location. See base class.

    Args:
        data_path: Location handed to the base class; it is also the
            argument given to download when the data is missing.
    """
    super(Cifar10Serialize, self).__init__(data_path)
    create_dir(data_path)

    # Skip the download when the data is already available
    if is_downloaded(data_path):
        return

    logger.info('Downloading CIFAR dataset ...')
    download(data_path)
def __init__(self, data_path):
    """
    Prepares the Sonar dataset location. See base class.

    Args:
        data_path: Location handed to the base class and used to hold
            the downloaded data.
    """
    super(SonarSerialize, self).__init__(data_path)
    create_dir(data_path)

    # On-demand download: only when the data does not exist
    needs_download = not is_downloaded(data_path)
    if needs_download:
        logger.info('Downloading Sonar dataset ...')
        target = get_data_path(data_path)
        urllib.request.urlretrieve(DATA_URL, target)
def __init__(self, data_path):
    """
    Prepares the Fashion MNIST dataset location. See base class.

    When the data is missing, the training pair of URLs is downloaded
    first and the test pair afterwards.

    Args:
        data_path: Location handed to the base class and passed to
            download as the last argument.
    """
    super(FashionMnistSerialize, self).__init__(data_path)
    create_dir(data_path)

    # On-demand download: nothing to fetch if it already exists
    if is_downloaded(data_path):
        return

    logger.info('Downloading Fashion MNIST dataset ...')
    url_pairs = (
        (TRAIN_DATA_URL, TRAIN_LABELS_URL),
        (TEST_DATA_URL, TEST_LABELS_URL),
    )
    for data_url, labels_url in url_pairs:
        download(data_url, labels_url, data_path)
def serialize_folds(self, output_folder, train_ratio, n_folds, num_threads,
                    test_shards, files_per_fold=1):
    """
    Serializes the data into a Tensorflow recommended Example proto
    format using N folds. Each training fold is stored through
    _store_dataset tagged as 'training_fold_<index>'.

    Args:
        output_folder: Output folder for the record files. It must not
            exist beforehand.
        train_ratio: Ratio of instances in the training set. The rest
            will be included in the test set.
        n_folds: Number of folds the training instances are split
            into. Must be at least 1.
        num_threads: Number of threads to use.
        test_shards: Number of files to use for testing.
        files_per_fold: Number of files for each training fold.

    Raises:
        ValueError: If n_folds is lower than 1 or if the output folder
            already exists.
    """
    logger.info("Trying to create folded dataset into %s" % output_folder)

    # Fail before touching the disk: otherwise the split below would
    # raise after the folder has been created, blocking any retry with
    # 'Dataset already exists!'
    if n_folds < 1:
        raise ValueError('Number of folds must be at least 1')

    if os.path.exists(output_folder):
        raise ValueError('Dataset already exists!')
    create_dir(output_folder)

    # Read dataset
    self.settings.initialize()
    try:
        # Split between training and test (no validation subset)
        train, _, test = \
            self.settings.get_validation_indices(train_ratio, 0.0)

        # Split training into folds and store each one
        idx_per_fold = np.array_split(train, n_folds)
        for fold, fold_indices in enumerate(idx_per_fold):
            self._store_dataset(fold_indices,
                                output_folder,
                                files_per_fold,
                                num_threads,
                                'training_fold_%d' % fold)

        # Save test dataset
        self._store_dataset(test, output_folder, test_shards,
                            num_threads, do.DataMode.TEST)

        # Store settings. The training size is reported as all folds
        # but one.
        fold_size = len(train) // n_folds
        self._store_options(output_folder,
                            train_ratio=train_ratio,
                            fold_size=fold_size,
                            n_training_instances=fold_size * (n_folds - 1),
                            n_folds=n_folds)
    finally:
        # Free resources, if any, even when a step above failed
        self.settings.finalize()
def __init__(self, data_path):
    """
    Prepares the Quantum dataset location. See base class.

    This dataset is not downloaded automatically: when is_downloaded
    reports it as missing, an error explains how to obtain it.

    Args:
        data_path: Location handed to the base class where the
            extracted content is expected.

    Raises:
        RuntimeError: If the dataset is not found in data_path.
    """
    super(QuantumSerialize, self).__init__(data_path)
    create_dir(data_path)

    if is_downloaded(data_path):
        return

    # Build the instructions once, then abort
    instructions = (
        'This dataset has been extracted from the KDD Cup 2004. '
        'In order to process, please proceed to request a copy in '
        'http://osmot.cs.cornell.edu/kddcup/datasets.html. After '
        'downloading it, place the extracted content into '
        '%s and repeat this operation'
    ) % data_path
    raise RuntimeError(instructions)
def train_and_validate(self, dataset, batch_size, validate_steps,
                       validate_interval, patience=2, steps=None,
                       max_steps=None, track_summaries=1, metrics=None,
                       gpu_frac=1.0, workers=1, mem_dequeing=3,
                       keep_models=True, log_steps=10):
    """
    Trains the network using validation and early stop. Training stops
    when the number of successive increases of validation loss reaches
    the patience value.

    Args:
        See train_network for other arguments.
        validate_steps: Number of validation steps.
        validate_interval: Training steps between validation phases.
        patience: Maximum successive number of validation loss
            increase that we want to allow.
        metrics: Metrics forwarded to training and evaluation. None
            (default) is treated as an empty list.
        keep_models: Whether to keep the best model or erase it to
            save space. Default is True.

    Returns:
        Tuple (step, validation loss) of the best validation phase.
        It is (0, np.inf) if no validation phase was run.

    Raises:
        ValueError: If validate_interval is bigger than steps or
            max_steps.
    """
    # Avoid sharing a mutable default list between calls
    metrics = [] if metrics is None else metrics

    if (steps is not None and validate_interval > steps) or \
            (max_steps is not None and validate_interval > max_steps):
        raise ValueError('Number of steps must be bigger than validation' +
                         ' interval')

    # Create folder for training
    training_path = os.path.join(self._model_dir, 'training')
    create_dir(training_path)

    # Create folder for validation
    validation_path = os.path.join(self._model_dir, 'validation')
    create_dir(validation_path)

    # Training outer loop
    initial_counter = None
    consecutive, counter = 0, 0
    best = (0, np.inf)
    end = False
    while not end and consecutive < patience:

        # Stop once another full interval would go past the requested
        # number of steps.
        # NOTE(review): the remaining (steps - counter) steps are not
        # trained, as in the original flow -- confirm this is intended.
        if steps is not None and initial_counter is not None \
                and counter + validate_interval > steps:
            break
        train_steps = validate_interval

        # Train network
        step, _, _ = self._train_network(
            dataset,
            logs_path=self._model_dir,
            batch_size=batch_size,
            steps=train_steps,
            max_steps=max_steps,
            metrics=metrics,
            track_summaries=track_summaries,
            track_models=None,
            gpu_frac=gpu_frac,
            workers=workers,
            mem_dequeing=mem_dequeing,
            log_steps=log_steps)

        # Set initial step when starting training
        if initial_counter is None:
            initial_counter = step - train_steps

        # Validate using training (results are only logged)
        self._eval_network(dataset,
                           logs_path=training_path,
                           batch_size=batch_size,
                           data_mode=DataMode.TRAINING,
                           metrics=metrics,
                           steps=validate_steps,
                           gpu_frac=gpu_frac,
                           workers=workers,
                           mem_dequeing=mem_dequeing)

        # Validate using validation
        val_loss, _ = self._eval_network(dataset,
                                         logs_path=validation_path,
                                         batch_size=batch_size,
                                         data_mode=DataMode.VALIDATION,
                                         metrics=metrics,
                                         steps=validate_steps,
                                         gpu_frac=gpu_frac,
                                         workers=workers,
                                         mem_dequeing=mem_dequeing)

        # Check if max step reached
        if max_steps is not None and step >= max_steps:
            end = True

        if val_loss > best[1]:
            # Count first so the message reports the current streak
            consecutive += 1
            logger.warning(
                'Validation loss at step %d (%f) is worse than '
                'best (%f) at step %d for %d time/s in a row.'
                % (step, val_loss, best[1], best[0], consecutive))
        else:
            logger.warning(
                'Validation results improved at step %d. '
                'Previous best was %f at step %d and new best is %f.'
                % (step, best[1], best[0], val_loss))
            consecutive = 0
            best = (step, val_loss)

        # Clear old models
        mu.keep_tracked_models(self._model_dir, patience)

        # Update counts
        counter = step - initial_counter

    best_step, best_loss = best
    logger.info('Best model found was at step %d with loss %f'
                % (best_step, best_loss))

    # Clean folder. If keep models is True keep best
    # Otherwise keep no model
    keep_model = best_step if keep_models is True else None
    mu.clean_models(self._model_dir, keep_model)

    return best