Example #1
    def __init__(self, data_path):
        """ See base class """
        super(MotorSerialize, self).__init__(data_path)
        create_dir(data_path)
        if not is_downloaded(data_path):
            logger.info('Downloading Motor dataset ...')
            urllib.request.urlretrieve(DATA_URL, get_data_path(data_path))
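Several of the constructors in these examples rely on small filesystem helpers such as create_dir, is_downloaded and get_data_path whose implementations are not shown. A minimal sketch of what such helpers might look like, assuming the raw dataset is stored as a single file inside data_path (the file name used here is purely hypothetical), is:

import os


def create_dir(path):
    # Create the directory (and any missing parents) if it does not exist yet
    if not os.path.exists(path):
        os.makedirs(path)


def get_data_path(data_path, filename='data.csv'):
    # Hypothetical location of the raw dataset file inside data_path
    return os.path.join(data_path, filename)


def is_downloaded(data_path):
    # Assume the dataset counts as downloaded once its raw file exists on disk
    return os.path.exists(get_data_path(data_path))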
Example #2
    def __init__(self,
                 model_dir,
                 models,
                 outputs,
                 loss_fn,
                 l1_reg=None,
                 l2_reg=None,
                 clip_gradient=None):
        self._model_dir = model_dir
        create_dir(model_dir)
        self._models = models
        self._outputs = outputs
        self._l1_reg = l1_reg
        self._l2_reg = l2_reg
        self._clip_grad = clip_gradient

        # Check model names are unique
        if len(np.unique([m.get_name() for m in models])) != len(models):
            raise ValueError('Models provided should have unique names')

        # Log provided models
        for m in self._models:
            logger.info('Provided model "%s"' % m.get_name())

        if not isinstance(loss_fn, LossFn):
            raise ValueError("Loss must be a subclass of LossFn")
        else:
            self._loss_fn = loss_fn
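The uniqueness check above relies on np.unique over the model names; a plain Python set achieves the same effect without NumPy. A small self-contained sketch of that pattern (the Model class below is only a stand-in for any object exposing get_name()):

class Model(object):
    # Stand-in for a model object exposing get_name()
    def __init__(self, name):
        self._name = name

    def get_name(self):
        return self._name


def check_unique_names(models):
    # Raise if two models share a name, mirroring the np.unique check above
    names = [m.get_name() for m in models]
    if len(set(names)) != len(names):
        raise ValueError('Models provided should have unique names')


check_unique_names([Model('mlp'), Model('cnn')])    # passes silently
# check_unique_names([Model('mlp'), Model('mlp')])  # would raise ValueError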
Example #3
    def serialize(self, output_folder, train_ratio, val_ratio, num_threads,
                  train_shards, val_shards, test_shards):
        """ Serializes the data into a Tensorflow recommended
        Example proto format
        Args:
            output_folder: Output folder for the record files.
            train_ratio: Ratio of instances in the training data.
                If original dataset already split, this is not used.
            val_ratio: Ratio of instances in the validation data.
            num_threads: Threads to use.
            train_shards: Number of files the training set will be split in.
                Must be divisible by the number of threads.
            val_shards: Number of slices the validation set will be split in.
                Must be divisible by the number of threads.
            test_shards: Number of slices the testing set will be split in.
                Must be divisible by the number of threads.
        """

        logger.info("Trying to create dataset into %s" % output_folder)

        if train_ratio > 1.0 or train_ratio < 0.0:
            raise ValueError('Training ratio must be in interval [0, 1]')
        if val_ratio > 1.0 or val_ratio < 0.0:
            raise ValueError('Validation ratio must be in interval [0, 1]')
        if train_ratio + val_ratio >= 1.0:
            raise ValueError(
                'Training and validation ratios must sum to less than 1')
        if os.path.exists(output_folder):
            raise ValueError('Dataset already exists!')

        create_dir(output_folder)

        # Read dataset
        self.settings.initialize()

        # Split according to validation preferences
        logger.info('Splitting into training and validation')
        train, val, test = \
            self.settings.get_validation_indices(train_ratio, val_ratio)

        # Create training files
        self._store_dataset(train, output_folder, train_shards, num_threads,
                            do.DataMode.TRAINING)

        # Create validation files
        self._store_dataset(val, output_folder, val_shards, num_threads,
                            do.DataMode.VALIDATION)

        # Create test files
        self._store_dataset(test, output_folder, test_shards, num_threads,
                            do.DataMode.TEST)

        # Store settings
        self._store_options(output_folder,
                            n_training_instances=len(train),
                            train_ratio=train_ratio,
                            val_ratio=val_ratio)

        # Free resources, if any
        self.settings.finalize()
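get_validation_indices belongs to the settings object and is not shown here. A minimal, self-contained sketch of a ratio-based split of instance indices, in the spirit of what this method expects (function name, seeding and shuffling are assumptions, not the library's actual implementation):

import numpy as np


def split_indices(n_instances, train_ratio, val_ratio, seed=0):
    # Shuffle the instance indices and cut them into train/val/test by ratio
    rng = np.random.RandomState(seed)
    idx = rng.permutation(n_instances)
    n_train = int(n_instances * train_ratio)
    n_val = int(n_instances * val_ratio)
    return idx[:n_train], idx[n_train:n_train + n_val], idx[n_train + n_val:]


train, val, test = split_indices(100, train_ratio=0.7, val_ratio=0.15)
print(len(train), len(val), len(test))  # 70 15 15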
Example #4
    def __init__(self, data_path):
        """ See base class """
        super(AusSerialize, self).__init__(data_path)
        create_dir(data_path)
        # On-demand download if it does not exist
        if not is_downloaded(data_path):
            logger.info('Downloading Australian dataset ...')
            download(DATA_URL, get_data_path(data_path))
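The download helper used here (unlike the raw urllib.request.urlretrieve call in Examples #1 and #6) is not shown, and its signature clearly varies between dataset modules (compare Examples #5 and #7). A plausible sketch of the two-argument form used above, assuming it is just a thin wrapper that fetches a URL to a destination path:

import urllib.request


def download(url, dest_path):
    # Hypothetical helper: fetch url and store the response at dest_path
    urllib.request.urlretrieve(url, dest_path)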
Example #5
    def __init__(self, data_path):
        """ See base class """
        super(Cifar10Serialize, self).__init__(data_path)
        create_dir(data_path)

        if not is_downloaded(data_path):
            logger.info('Downloading CIFAR dataset ...')
            download(data_path)
Example #6
    def __init__(self, data_path):
        """ See base class """
        super(SonarSerialize, self).__init__(data_path)
        create_dir(data_path)
        # On-demand download if it does not exist
        if not is_downloaded(data_path):
            logger.info('Downloading Sonar dataset ...')
            urllib.request.urlretrieve(DATA_URL, get_data_path(data_path))
Example #7
    def __init__(self, data_path):
        """ See base class """
        super(FashionMnistSerialize, self).__init__(data_path)
        create_dir(data_path)
        # On-demand download if it does not exist
        if not is_downloaded(data_path):
            logger.info('Downloading Fashion MNIST dataset ...')
            download(TRAIN_DATA_URL, TRAIN_LABELS_URL, data_path)
            download(TEST_DATA_URL, TEST_LABELS_URL, data_path)
Example #8
    def serialize_folds(self,
                        output_folder,
                        train_ratio,
                        n_folds,
                        num_threads,
                        test_shards,
                        files_per_fold=1):
        """ Serializes the data into a Tensorflow recommended
        Example proto format using N folds. Each fold has its own
        folder with a certain amount of files.
        Args:
            output_folder: Output folder for the record files.
            train_ratio: Ratio of instances in the training set. The rest
                will be included in the test set.
            n_folds: Ratio of instances in the training data.
                If original dataset already split, this is not used.
            num_threads: Number of threads to use.
            test_shards: Number of files to use for testing.
            files_per_fold: Number of files for each training fold.
        """
        logger.info("Trying to create folded dataset into %s" % output_folder)

        if os.path.exists(output_folder):
            raise ValueError('Dataset already exists!')

        create_dir(output_folder)

        # Read dataset
        self.settings.initialize()

        # Split between training and test
        train, _, test = \
            self.settings.get_validation_indices(train_ratio, 0.0)

        # Split training into folds
        idx_per_fold = np.array_split(train, n_folds)

        for fold in range(n_folds):
            self._store_dataset(idx_per_fold[fold], output_folder,
                                files_per_fold, num_threads,
                                'training_fold_%d' % fold)

        # Save test dataset
        self._store_dataset(test, output_folder, test_shards, num_threads,
                            do.DataMode.TEST)

        # Store settings
        fold_size = int(len(train) / n_folds)
        self._store_options(output_folder,
                            train_ratio=train_ratio,
                            fold_size=fold_size,
                            n_training_instances=fold_size * (n_folds - 1),
                            n_folds=n_folds)

        # Free resources, if any
        self.settings.finalize()
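np.array_split is what produces the folds above; unlike np.split it tolerates a training set whose size is not an exact multiple of n_folds. A tiny self-contained illustration:

import numpy as np

train = np.arange(10)             # indices of the training instances
folds = np.array_split(train, 3)  # 3 folds of sizes 4, 3 and 3

for i, fold in enumerate(folds):
    print('training_fold_%d' % i, fold.tolist())
# training_fold_0 [0, 1, 2, 3]
# training_fold_1 [4, 5, 6]
# training_fold_2 [7, 8, 9]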
Example #9
    def __init__(self, data_path):
        """ See base class """
        super(QuantumSerialize, self).__init__(data_path)
        create_dir(data_path)
        if not is_downloaded(data_path):
            raise RuntimeError(
                'This dataset has been extracted from the KDD Cup 2004. ' +
                'In order to proceed, please request a copy at ' +
                'http://osmot.cs.cornell.edu/kddcup/datasets.html. After ' +
                'downloading it, place the extracted content into ' +
                '%s and repeat this operation' % data_path)
Example #10
    def train_and_validate(self,
                           dataset,
                           batch_size,
                           validate_steps,
                           validate_interval,
                           patience=2,
                           steps=None,
                           max_steps=None,
                           track_summaries=1,
                           metrics=[],
                           gpu_frac=1.0,
                           workers=1,
                           mem_dequeing=3,
                           keep_models=True,
                           log_steps=10):
        """ Trains the network using validation and early stop. Training stops
        when the number of successive increases of validation loss reaches the
        patience value.
        Args:
            See train_network for other arguments.
            validate_steps: Number of validation steps.
            validate_interval: Training steps between validation phases.
            patience: Maximum successive number of validation loss increase
                that we want to allow.
            keep_models: Whether to keep the best model or erase it to save
                space. Default is True
        Returns
            See train_network for returns.
        """

        if (steps is not None and validate_interval > steps) or \
                (max_steps is not None and validate_interval > max_steps):
            raise ValueError('Number of steps must be bigger than validation' +
                             ' interval')

        # Create folder for training
        training_path = os.path.join(self._model_dir, 'training')
        create_dir(training_path)

        # Create folder for validation
        validation_path = os.path.join(self._model_dir, 'validation')
        create_dir(validation_path)

        # Training outer loop
        initial_counter = None
        consecutive, counter = 0, 0
        best = (0, np.inf)
        end = False
        while not end and consecutive < patience:

            # Adjust training steps
            if steps is not None and initial_counter is not None \
                    and counter + validate_interval > steps:
                train_steps = steps - counter
                end = True
            else:
                train_steps = validate_interval

            if not end:

                # Train network
                step, _, _ = self._train_network(
                    dataset,
                    logs_path=self._model_dir,
                    batch_size=batch_size,
                    steps=train_steps,
                    max_steps=max_steps,
                    metrics=metrics,
                    track_summaries=track_summaries,
                    track_models=None,
                    gpu_frac=gpu_frac,
                    workers=workers,
                    mem_dequeing=mem_dequeing,
                    log_steps=log_steps)

                # Set initial step when starting training
                if initial_counter is None:
                    initial_counter = step - train_steps

                # Validate using training
                _, _ = self._eval_network(dataset,
                                          logs_path=training_path,
                                          batch_size=batch_size,
                                          data_mode=DataMode.TRAINING,
                                          metrics=metrics,
                                          steps=validate_steps,
                                          gpu_frac=gpu_frac,
                                          workers=workers,
                                          mem_dequeing=mem_dequeing)

                # Validate using validation
                val_loss, _ = self._eval_network(dataset,
                                                 logs_path=validation_path,
                                                 batch_size=batch_size,
                                                 data_mode=DataMode.VALIDATION,
                                                 metrics=metrics,
                                                 steps=validate_steps,
                                                 gpu_frac=gpu_frac,
                                                 workers=workers,
                                                 mem_dequeing=mem_dequeing)

                # Check if max step reached
                if max_steps is not None and step >= max_steps:
                    end = True

                # Check whether validation loss has worsened
                if val_loss > best[1]:
                    logger.warning(
                        'Validation loss at step %d (%f) is worse than '
                        'best (%f) at step %d for %d time/s in a row.' %
                        (step, val_loss, best[1], best[0], consecutive + 1))
                    consecutive += 1
                else:
                    logger.warning(
                        'Validation results improved at step %d. Previous '
                        'best was %f at step %d and new best is %f.' %
                        (step, best[1], best[0], val_loss))
                    consecutive = 0
                    best = (step, val_loss)

                # Clear old models
                mu.keep_tracked_models(self._model_dir, patience)

                # Update counts
                counter = step - initial_counter

        best_step, best_loss = best
        logger.info('Best model found was at step %d with loss %f' %
                    (best_step, best_loss))

        # Clean folder. If keep_models is True keep the best model,
        # otherwise keep none
        keep_model = best_step if keep_models is True else None
        mu.clean_models(self._model_dir, keep_model)

        return best
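The core of the loop above is plain patience-based early stopping: remember the best (step, loss) pair seen so far and stop after patience consecutive validations without improvement. A stripped-down, self-contained sketch of just that logic (the loss values are made-up numbers, not real training output):

import numpy as np


def early_stop(val_losses, patience=2):
    # Return the best (step, loss) pair and the step at which training stops
    best = (0, np.inf)
    consecutive = 0
    for step, loss in enumerate(val_losses):
        if loss > best[1]:
            consecutive += 1
            if consecutive >= patience:
                return best, step
        else:
            consecutive = 0
            best = (step, loss)
    return best, len(val_losses) - 1


# Loss improves, then worsens twice in a row -> stop at the second worsening
best, stopped_at = early_stop([0.9, 0.7, 0.6, 0.65, 0.7], patience=2)
print(best, stopped_at)  # (2, 0.6) 4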