Example #1
def test_instrument_streamer(slicer, tiny_feats):
    df = tiny_feats.to_df()
    t_len = 10
    batch_size = 12
    if not df.empty:
        streamer = streams.InstrumentStreamer(df,
                                              slicer,
                                              t_len=t_len,
                                              batch_size=batch_size)
        __test_streamer(streamer, t_len, batch_size)
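The shared __test_streamer helper is defined elsewhere in the test module. A minimal sketch of what it might look like, assuming each batch is a dict of numpy arrays and that the input key is named 'x_in' (that key name is an assumption, not confirmed by this listing):

import numpy as np

def __test_streamer(streamer, t_len, batch_size, n_batches=3):
    # Sketch only: pull a few batches and sanity-check their shape/contents.
    # The 'x_in' key is an assumed field name for the input array.
    for i, batch in enumerate(streamer):
        x = np.asarray(batch['x_in'])
        assert x.shape[0] == batch_size
        assert np.all(np.isfinite(x))
        if i + 1 >= n_batches:
            break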
Example #2
def test_instrument_streamer_with_zmq(tiny_feats):
    df = tiny_feats.to_df()
    t_len = 10
    batch_size = 12
    if not df.empty:
        streamer = streams.InstrumentStreamer(df,
                                              streams.cqt_slices,
                                              t_len=t_len,
                                              batch_size=batch_size,
                                              use_zmq=True)
        __test_streamer(streamer, t_len, batch_size)
Example #3
def test_overfit_two_samples_cqt(tiny_feats):
    """Prove that our network works by training it with two random files
    from our dataset, intentionally overfitting it.

    Warning: not deterministic, but it could be made so by seeding the RNGs.
    """
    features_df = tiny_feats.to_df()

    # Get list of instruments
    instruments = sorted(features_df["instrument"].unique())
    selected_instruments = instruments[:2]

    # Create a dataframe from our dataframe with only two files in it
    test_df = pandas.concat([
        features_df[features_df["instrument"] ==
                    selected_instruments[0]].sample(),
        features_df[features_df["instrument"] ==
                    selected_instruments[1]].sample()
    ])

    t_len = 8
    batch_size = 8
    n_targets = 2
    # Create a streamer that samples just those two files.
    streamer = streams.InstrumentStreamer(test_df,
                                          streams.cqt_slices,
                                          t_len=t_len,
                                          batch_size=batch_size)

    # Create a new model
    network_def = models.cqt_iX_c1f1_oY(t_len, n_targets)
    model = models.NetworkManager(network_def)

    # Train the model for up to max_batches batches, until it overfits the two files
    max_batches = 10
    i = 0
    for batch in streamer:
        train_loss = model.train(batch)

        i += 1
        print("Batch: ", i, "Loss: ", train_loss)
        if i >= max_batches:
            break

    # Evaluate it. On the original files. Should do well.
    eval_batch = next(streamer)
    eval_probs = model.predict(eval_batch)
    eval_loss, accuracy = model.evaluate(eval_batch)
    print("Predictions:", eval_probs)
    print("Eval Loss:", eval_loss, "Accuracy:", accuracy)
    assert np.all(np.isfinite(eval_probs)) and np.isfinite(eval_loss) and \
        np.isfinite(accuracy)
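As the docstring notes, the test above is not deterministic. One way to make it reproducible (a sketch, not part of the original test) is to seed the global RNGs and pass random_state to the pandas .sample() calls:

import random
import numpy as np

SEED = 1234  # arbitrary value chosen for illustration

def seed_everything(seed=SEED):
    # Seed Python's and numpy's global generators so slicing/sampling repeats.
    random.seed(seed)
    np.random.seed(seed)

# ...and make the dataframe sampling repeatable as well, e.g.:
#   features_df[features_df["instrument"] == selected_instruments[0]].sample(
#       random_state=SEED)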
Example #4
def test_predict_dataframe(slicer_and_model, feats_df):
    # For the purposes of this we don't care too much about what we train with.
    # TODO: random seeds for consistency / reproducibility.
    test_df = feats_df.sample(n=(12 * 12), replace=True)

    # Pick a model
    t_len = 8
    n_classes = 12
    slicer = slicer_and_model[0]
    network_def = slicer_and_model[1](t_len, n_classes)
    model = models.NetworkManager(network_def)

    # Create the streamer.
    streamer = streams.InstrumentStreamer(test_df,
                                          record_slicer=slicer,
                                          t_len=t_len,
                                          batch_size=12)

    # Train for a little bit.
    iter_count = 0
    max_count = 100
    for batch in streamer:
        loss = model.train(batch)
        print("Batch ", iter_count, "loss:", loss)
        iter_count += 1
        if iter_count >= max_count:
            break

    # Run evaluation over the sampled dataframe,
    #  and make sure you get a dataframe back
    eval_df = hcnn.evaluate.predict.predict_many(test_df, model, slicer, t_len)

    # TODO: why is this even necessary?
    eval_df = eval_df.dropna()

    assert isinstance(eval_df, pandas.DataFrame)
    assert len(eval_df) == len(test_df)

    analyzer = hcnn.evaluate.analyze.PredictionAnalyzer(eval_df, test_df)
    print(analyzer.classification_report)
    print(analyzer.pprint())
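PredictionAnalyzer builds its classification report from the predictions and the ground-truth dataframe. A rough standalone equivalent using scikit-learn, where the column names ('instrument' for the label, 'max_likelihood' for the predicted class) and the assumption that eval_df keeps test_df's index are for illustration only:

from sklearn.metrics import classification_report

def print_report(eval_df, test_df):
    # Align predictions with ground truth by index, then report per-class
    # precision/recall/F1. Column names are assumed, not taken from hcnn.
    y_true = test_df.loc[eval_df.index, "instrument"]
    y_pred = eval_df["max_likelihood"]
    print(classification_report(y_true, y_pred))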
Example #5
    def train_model(self):
        """
        Train a model, writing intermediate params
        to disk.

        Trains for max_iterations or max_time, whichever comes first.
        [Specified in the config.]
        """
        if self.skip_training:
            logger.info(utils.colored("--skip_training specified - skipping"))
            return True

        assert hasattr(self, 'train_set') and hasattr(self, 'valid_set')

        logger.info("Starting training for experiment: {}".format(
            self.experiment_name))

        # Save the config we used in the model directory, just in case.
        self.config.save(self._experiment_config_path)

        # Duration parameters
        max_iterations = self.config['training/max_iterations']
        max_time = self.config['training/max_time']  # in seconds

        # Collect various necessary parameters
        t_len = self.config['training/t_len']
        batch_size = self.config['training/batch_size']
        n_targets = self.config['training/n_targets']
        logger.debug(
            "Hyperparams:\nt_len: {}\nbatch_size: {}\n"
            "n_targets: {}\nmax_iterations: {}\nmax_time: {}s or {}h".format(
                t_len, batch_size, n_targets, max_iterations, max_time,
                (max_time / 60. / 60.)))

        slicer = get_slicer_from_feature(self.feature_mode)

        # Set up our streamer
        logger.info("[{}] Setting up streamer".format(self.experiment_name))
        slice_logger = utils.SliceLogger()
        streamer = streams.InstrumentStreamer(
            self.train_set.to_df(),
            slicer,
            slicer_kwargs={'slice_logger': slice_logger},
            t_len=t_len,
            batch_size=batch_size)

        # create our model
        logger.info("[{}] Setting up model: {}".format(self.experiment_name,
                                                       self.model_definition))
        network_def = getattr(models, self.model_definition)(t_len, n_targets)
        model = models.NetworkManager(network_def)

        iter_print_freq = self.config.get('training/iteration_print_frequency',
                                          None)
        iter_write_freq = self.config.get('training/iteration_write_frequency',
                                          None)

        timers = utils.TimerHolder()
        iter_count = 0
        train_stats = pd.DataFrame(
            columns=['timestamp', 'batch_train_dur', 'iteration', 'loss'])
        min_train_loss = np.inf

        timers.start("train")
        logger.info("[{}] Beginning training loop at {}".format(
            self.experiment_name, timers.get("train")))
        try:
            timers.start(("stream", iter_count))
            for batch in streamer:
                timers.end(("stream", iter_count))
                timers.start(("batch_train", iter_count))
                loss = model.train(batch)
                timers.end(("batch_train", iter_count))
                row = dict(
                    timestamp=timers.get_end(("batch_train", iter_count)),
                    batch_train_dur=timers.get(("batch_train", iter_count)),
                    iteration=iter_count,
                    loss=loss)
                train_stats.loc[len(train_stats)] = row

                # Time Logging
                logger.debug("[Iter timing] iter: {} | loss: {} | "
                             "stream: {} | train: {}".format(
                                 iter_count, loss,
                                 timers.get(("stream", iter_count)),
                                 timers.get(("batch_train", iter_count))))
                # Print status
                if iter_print_freq and (iter_count % iter_print_freq == 0):
                    mean_train_loss = \
                        train_stats["loss"][-iter_print_freq:].mean()
                    output_str = ("Iteration: {} | Mean_Train_loss: {}".format(
                        iter_count,
                        utils.conditional_colored(mean_train_loss,
                                                  min_train_loss)))

                    # On some small probability, do a randomly sampled
                    # validation so we can see approximately how we're doing
                    # on the validation set.
                    if np.random.random() < .3:
                        timers.start(("sampled_validation", iter_count))
                        valid_loss = self.sampled_validation_loss(
                            model, slicer, t_len)
                        output_str += " | Sampled_Valid_loss: {:0.4f}".format(
                            valid_loss)
                        timers.end(("sampled_validation", iter_count))
                        output_str += " | Val_time: {:0.2f}s".format(
                            timers.get(("sampled_validation",
                                        iter_count)).total_seconds())

                    logger.info(output_str)
                    min_train_loss = min(mean_train_loss, min_train_loss)
                    # Print the mean times for the last n frames
                    logger.debug(
                        "Mean stream time: {}, Mean train time: {}".format(
                            timers.mean("stream", iter_count - iter_print_freq,
                                        iter_count),
                            timers.mean("batch_train",
                                        iter_count - iter_print_freq,
                                        iter_count)))

                # save model, maybe
                if iter_write_freq and (iter_count % iter_write_freq == 0):
                    save_path = os.path.join(
                        self._params_dir,
                        self.param_format_str.format(iter_count))
                    logger.debug("Writing params to {}".format(save_path))
                    model.save(save_path)

                    slice_log = os.path.join(self._cv_model_dir,
                                             "slice_log.csv")
                    slice_logger.save(slice_log)

                if datetime.datetime.now() > \
                        (timers.get("train") + datetime.timedelta(
                            seconds=max_time)):
                    raise EarlyStoppingException("Max Time reached")

                iter_count += 1
                timers.start(("stream", iter_count))
                # Stopping conditions
                if (iter_count >= max_iterations):
                    raise EarlyStoppingException("Max Iterations Reached")

        except KeyboardInterrupt:
            logger.warn(utils.colored("Training Cancelled", "red"))
            print("User cancelled training at epoch:", iter_count)
        except EarlyStoppingException as e:
            logger.warn(
                utils.colored("Training Stopped for {}".format(e), "red"))
            print("Training halted for: ", e)
        timers.end("train")

        # Print final training loss
        logger.info("Total iterations: {}".format(iter_count))
        logger.info("Trained for {}".format(timers.get("train")))
        logger.info("Final training loss: {}".format(
            train_stats["loss"].iloc[-1]))

        # Make sure to save the final iteration's model.
        save_path = os.path.join(self._params_dir,
                                 self.param_format_str.format(iter_count))
        model.save(save_path)
        logger.info("Completed training for experiment: {}".format(
            self.experiment_name))

        # Save training loss
        logger.info("Writing training stats to {}".format(
            self._training_loss_path))
        train_stats.to_pickle(self._training_loss_path)

        # We need these files for model selection, so make sure they exist
        return os.path.exists(self._training_loss_path)
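train_model depends on a couple of pieces defined elsewhere in the package: the EarlyStoppingException used to break out of the loop, and a config object exposing slash-separated keys. Minimal sketches of both, with placeholder values, purely for illustration:

class EarlyStoppingException(Exception):
    """Raised inside the training loop when max_time or max_iterations is hit."""
    pass

# The slash-separated keys read above suggest a nested training config roughly
# shaped like this (all values are placeholders, not the project's defaults):
training_config = {
    'training': {
        'max_iterations': 10000,
        'max_time': 4 * 60 * 60,            # seconds
        't_len': 8,
        'batch_size': 32,
        'n_targets': 12,
        'iteration_print_frequency': 50,
        'iteration_write_frequency': 500,
    }
}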