Example 1
    def batch_predict(
            self,
            model,
            dataset,
            dataset_name=None
    ):
        batcher = dataset.initialize_batcher(
            self._batch_size,
            should_shuffle=False,
            horovod=self._horovod
        )

        progress_bar = None
        if is_on_master():
            progress_bar = tqdm(
                desc='Prediction' if dataset_name is None
                else 'Prediction {0: <5.5}'.format(dataset_name),
                total=batcher.steps_per_epoch,
                file=sys.stdout,
                disable=is_progressbar_disabled()
            )

        predictions = {}
        while not batcher.last_batch():
            batch = batcher.next_batch()

            inputs = {
                i_feat.feature_name: batch[i_feat.proc_column]
                for i_feat in model.input_features.values()
            }

            preds = model.predict_step(inputs)

            # accumulate predictions from batch for each output feature
            for of_name, of_preds in preds.items():
                if of_name not in predictions:
                    predictions[of_name] = {}
                for pred_name, pred_values in of_preds.items():
                    if pred_name not in EXCLUE_PRED_SET:
                        if pred_name not in predictions[of_name]:
                            predictions[of_name][pred_name] = [pred_values]
                        else:
                            predictions[of_name][pred_name].append(pred_values)

            if is_on_master():
                progress_bar.update(1)

        if is_on_master():
            progress_bar.close()

        # consolidate predictions from each batch to a single tensor
        for of_name, of_predictions in predictions.items():
            for pred_name, pred_value_list in of_predictions.items():
                predictions[of_name][pred_name] = tf.concat(pred_value_list,
                                                            axis=0)

        return predictions
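
A minimal sketch of the accumulate-then-concatenate pattern used above, with plain TensorFlow and made-up tensors (not the Ludwig API):

import tensorflow as tf

# each batch yields a (batch_size, 1) tensor of predictions
batch_outputs = [tf.constant([[0.1], [0.9]]), tf.constant([[0.4], [0.6]])]

accumulated = []
for batch_preds in batch_outputs:
    accumulated.append(batch_preds)      # collect per-batch tensors in a list

merged = tf.concat(accumulated, axis=0)  # single (4, 1) tensor, as in batch_predict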
Example 2
    def postprocess_predictions(self,
                                predictions,
                                metadata,
                                output_directory,
                                skip_save_unprocessed_output=False):
        postprocessed = {}
        name = self.feature_name

        npy_filename = None
        if is_on_master():
            npy_filename = os.path.join(output_directory, '{}_{}.npy')
        else:
            skip_save_unprocessed_output = True

        if PREDICTIONS in predictions and len(predictions[PREDICTIONS]) > 0:
            postprocessed[PREDICTIONS] = predictions[PREDICTIONS].numpy()
            if not skip_save_unprocessed_output:
                np.save(npy_filename.format(name, PREDICTIONS),
                        predictions[PREDICTIONS])
            del predictions[PREDICTIONS]

        if PROBABILITIES in predictions and len(
                predictions[PROBABILITIES]) > 0:
            postprocessed[PROBABILITIES] = predictions[PROBABILITIES].numpy()
            if not skip_save_unprocessed_output:
                np.save(npy_filename.format(name, PROBABILITIES),
                        predictions[PROBABILITIES])
            del predictions[PROBABILITIES]

        return postprocessed
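
The '{}_{}.npy' template above is later filled with the feature name and the prediction kind; a small standalone illustration with made-up names:

import os

npy_filename = os.path.join('results', '{}_{}.npy')
print(npy_filename.format('price', 'predictions'))  # results/price_predictions.npy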
Example 3
def cli(sys_argv):
    parser = argparse.ArgumentParser(
        description='This script generates a synthetic dataset.',
        prog='ludwig synthesize_dataset',
        usage='%(prog)s [options]')
    parser.add_argument('-od',
                        '--output_path',
                        type=str,
                        help='output CSV file path')
    parser.add_argument('-d',
                        '--dataset_size',
                        help='size of the dataset',
                        type=int,
                        default=100)
    parser.add_argument(
        '-f',
        '--features',
        default='[\
          {name: text_1, type: text, vocab_size: 20, max_len: 20}, \
          {name: text_2, type: text, vocab_size: 20, max_len: 20}, \
          {name: category_1, type: category, vocab_size: 10}, \
          {name: category_2, type: category, vocab_size: 15}, \
          {name: numerical_1, type: numerical}, \
          {name: numerical_2, type: numerical}, \
          {name: binary_1, type: binary}, \
          {name: binary_2, type: binary}, \
          {name: set_1, type: set, vocab_size: 20, max_len: 20}, \
          {name: set_2, type: set, vocab_size: 20, max_len: 20}, \
          {name: bag_1, type: bag, vocab_size: 20, max_len: 10}, \
          {name: bag_2, type: bag, vocab_size: 20, max_len: 10}, \
          {name: sequence_1, type: sequence, vocab_size: 20, max_len: 20}, \
          {name: sequence_2, type: sequence, vocab_size: 20, max_len: 20}, \
          {name: timeseries_1, type: timeseries, max_len: 20}, \
          {name: timeseries_2, type: timeseries, max_len: 20}, \
          {name: date_1, type: date}, \
          {name: date_2, type: date}, \
          {name: h3_1, type: h3}, \
          {name: h3_2, type: h3}, \
          {name: vector_1, type: vector}, \
          {name: vector_2, type: vector}, \
        ]',
        type=yaml.safe_load,
        help='list of features to generate in YAML format. '
        'Provide a list containing one dictionary for each feature; '
        'each dictionary must include a name and a type, '
        'and can include generation parameters depending on the type')
    args = parser.parse_args(sys_argv)

    # No log level parameter; this is a placeholder in case we add one at a later date
    # args.logging_level = logging_level_registry[args.logging_level]
    # logging.getLogger('ludwig').setLevel(
    #     args.logging_level
    # )
    # global logger
    # logger = logging.getLogger('ludwig.data.dataset_synthesizer')

    if is_on_master():
        print_ludwig('Synthesize Dataset', LUDWIG_VERSION)

    cli_synthesize_dataset(**vars(args))
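
A possible way to exercise this entry point, using only the flags defined above (file names and sizes are made up):

cli(['--dataset_size', '500', '--output_path', 'synthetic.csv'])

# or, through the command-line wiring suggested by prog= above:
# ludwig synthesize_dataset --dataset_size 500 --output_path synthetic.csv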
Example 4
    def load_weights(self, model_dir):
        if is_on_master():
            weights_save_path = os.path.join(model_dir,
                                             MODEL_WEIGHTS_FILE_NAME)
            self.model.load_weights(weights_save_path)

        if self._horovod:
            # Model weights are only saved on master, so broadcast
            # to all other ranks
            self._horovod.broadcast_variables(self.model.variables,
                                              root_rank=0)
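
The same load-on-master-then-broadcast idea, sketched with raw Horovod for TensorFlow (assuming a Keras-style model object, not the wrapper above):

# import horovod.tensorflow as hvd
#
# hvd.init()
# if hvd.rank() == 0:
#     model.load_weights(weights_save_path)               # only rank 0 reads the checkpoint
# hvd.broadcast_variables(model.variables, root_rank=0)   # sync weights to all other ranks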
Example 5
    def postprocess_predictions(
            self,
            result,
            metadata,
            output_directory,
            skip_save_unprocessed_output=False,
    ):
        postprocessed = {}
        name = self.feature_name

        npy_filename = None
        if is_on_master():
            npy_filename = os.path.join(output_directory, '{}_{}.npy')
        else:
            skip_save_unprocessed_output = True

        if PREDICTIONS in result and len(result[PREDICTIONS]) > 0:
            preds = result[PREDICTIONS]
            if 'idx2str' in metadata:
                postprocessed[PREDICTIONS] = [
                    [metadata['idx2str'][i] for i, pred in enumerate(pred_set)
                     if pred] for pred_set in preds
                ]
            else:
                postprocessed[PREDICTIONS] = preds

            if not skip_save_unprocessed_output:
                np.save(npy_filename.format(name, PREDICTIONS), preds)

            del result[PREDICTIONS]

        if PROBABILITIES in result and len(result[PROBABILITIES]) > 0:
            probs = result[PROBABILITIES].numpy()
            # keep, per example, only the probabilities at or above the threshold
            prob = [
                [p for p in prob_set if p >= self.threshold]
                for prob_set in probs
            ]
            postprocessed[PROBABILITIES] = probs
            postprocessed[PROBABILITY] = prob

            if not skip_save_unprocessed_output:
                np.save(npy_filename.format(name, PROBABILITIES), probs)
                np.save(npy_filename.format(name, PROBABILITY), prob)

            del result[PROBABILITIES]

        return postprocessed
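
The per-example threshold filtering above can be checked in isolation; a minimal sketch with a made-up threshold:

probs = [[0.9, 0.3, 0.7], [0.2, 0.8, 0.1]]
threshold = 0.5
prob = [[p for p in prob_set if p >= threshold] for prob_set in probs]
# prob == [[0.9, 0.7], [0.8]]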
Example 6
    def postprocess_predictions(
            self,
            predictions,
            metadata,
            output_directory,
            skip_save_unprocessed_output=False
    ):
        postprocessed = {}
        name = self.feature_name

        npy_filename = None
        if is_on_master():
            npy_filename = os.path.join(output_directory, '{}_{}.npy')
        else:
            skip_save_unprocessed_output = True

        if PREDICTIONS in predictions and len(predictions[PREDICTIONS]) > 0:
            # if needed, convert predictions back to the original value space
            numeric_transformer = get_from_registry(
                metadata['preprocessing'].get('normalization', None),
                numeric_transformation_registry
            )(**metadata)
            postprocessed[PREDICTIONS] = \
                numeric_transformer.inverse_transform(
                    predictions[PREDICTIONS].numpy()
                )

            if not skip_save_unprocessed_output:
                np.save(
                    npy_filename.format(name, PREDICTIONS),
                    predictions[PREDICTIONS]
                )
            del predictions[PREDICTIONS]

        if PROBABILITIES in predictions and len(
                predictions[PROBABILITIES]) > 0:
            postprocessed[PROBABILITIES] = predictions[PROBABILITIES].numpy()
            if not skip_save_unprocessed_output:
                np.save(
                    npy_filename.format(name, PROBABILITIES),
                    predictions[PROBABILITIES]
                )
            del predictions[PROBABILITIES]

        return postprocessed
Example 7
    def postprocess_predictions(
        self,
        result,
        metadata,
        output_directory,
        skip_save_unprocessed_output=False,
    ):
        postprocessed = {}
        name = self.feature_name

        npy_filename = None
        if is_on_master():
            npy_filename = os.path.join(output_directory, '{}_{}.npy')
        else:
            skip_save_unprocessed_output = True

        if PREDICTIONS in result and len(result[PREDICTIONS]) > 0:
            preds = result[PREDICTIONS].numpy()
            if 'bool2str' in metadata:
                preds = [metadata['bool2str'][pred] for pred in preds]
            postprocessed[PREDICTIONS] = preds

            if not skip_save_unprocessed_output:
                np.save(npy_filename.format(name, PREDICTIONS),
                        postprocessed[PREDICTIONS])
            del result[PREDICTIONS]

        if PROBABILITIES in result and len(result[PROBABILITIES]) > 0:
            probs = result[PROBABILITIES].numpy()
            # stack [P(False), P(True)] into a (batch, 2) array
            postprocessed[PROBABILITIES] = np.stack([1 - probs, probs], axis=1)
            postprocessed[PROBABILITY] = np.amax(postprocessed[PROBABILITIES],
                                                 axis=1)
            if not skip_save_unprocessed_output:
                np.save(npy_filename.format(name, PROBABILITIES),
                        postprocessed[PROBABILITIES])
            del result[PROBABILITIES]

        return postprocessed
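
The binary case builds two-class probabilities by stacking [P(False), P(True)] and takes the larger of the two as the confidence; a small NumPy check:

import numpy as np

p = np.array([0.2, 0.7])              # P(True) per example
probs = np.stack([1 - p, p], axis=1)  # [[0.8, 0.2], [0.3, 0.7]]
prob = np.amax(probs, axis=1)         # [0.8, 0.7]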
Example 8
    def postprocess_predictions(
            self,
            result,
            metadata,
            output_directory,
            skip_save_unprocessed_output=False,
    ):
        postprocessed = {}
        name = self.feature_name

        npy_filename = None
        if is_on_master():
            npy_filename = os.path.join(output_directory, '{}_{}.npy')
        else:
            skip_save_unprocessed_output = True

        if PREDICTIONS in result and len(result[PREDICTIONS]) > 0:
            preds = result[PREDICTIONS].numpy()
            lengths = result[LENGTHS].numpy()
            if 'idx2str' in metadata:
                postprocessed[PREDICTIONS] = [
                    [metadata['idx2str'][token]
                     if token < len(metadata['idx2str']) else UNKNOWN_SYMBOL
                     for token in [pred[i] for i in range(length)]]
                    for pred, length in
                    [(preds[j], lengths[j]) for j in range(len(preds))]
                ]
            else:
                postprocessed[PREDICTIONS] = preds

            if not skip_save_unprocessed_output:
                np.save(npy_filename.format(name, PREDICTIONS), preds)

            del result[PREDICTIONS]

        if LAST_PREDICTIONS in result and len(result[LAST_PREDICTIONS]) > 0:
            last_preds = result[LAST_PREDICTIONS].numpy()
            if 'idx2str' in metadata:
                postprocessed[LAST_PREDICTIONS] = [
                    metadata['idx2str'][last_pred]
                    if last_pred < len(metadata['idx2str']) else UNKNOWN_SYMBOL
                    for last_pred in last_preds
                ]
            else:
                postprocessed[LAST_PREDICTIONS] = last_preds

            if not skip_save_unprocessed_output:
                np.save(npy_filename.format(name, LAST_PREDICTIONS),
                        last_preds)

            del result[LAST_PREDICTIONS]

        if PROBABILITIES in result and len(result[PROBABILITIES]) > 0:
            probs = result[PROBABILITIES].numpy()
            if probs is not None:

                # probs should be shape [b, s, nc]
                if len(probs.shape) == 3:
                    # get probability of token in that sequence position
                    seq_probs = np.amax(probs, axis=-1)

                    # sum log probability for tokens up to sequence length
                    # create mask only tokens for sequence length
                    mask = np.arange(seq_probs.shape[-1]) \
                           < np.array(result[LENGTHS]).reshape(-1, 1)
                    log_prob = np.sum(np.log(seq_probs) * mask, axis=-1)

                    # commenting probabilities out because usually it is huge:
                    # dataset x length x classes
                    # todo: add a mechanism for letting the user decide to save it
                    postprocessed[PROBABILITIES] = seq_probs
                    postprocessed[PROBABILITY] = log_prob
                else:
                    raise ValueError(
                        'Sequence probability array should be 3-dimensional, '
                        'instead it is {:d}-dimensional'.format(
                            len(probs.shape))
                    )

                if not skip_save_unprocessed_output:
                    np.save(npy_filename.format(name, PROBABILITIES), seq_probs)
                    np.save(npy_filename.format(name, PROBABILITY), log_prob)

            del result[PROBABILITIES]

        if LENGTHS in result:
            del result[LENGTHS]

        return postprocessed
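
The masked log-probability computation above can be reproduced in isolation; a minimal NumPy sketch with made-up shapes:

import numpy as np

seq_probs = np.array([[0.9, 0.8, 0.5],   # per-position max class probability
                      [0.7, 0.6, 0.4]])
lengths = np.array([2, 3])               # valid sequence lengths

mask = np.arange(seq_probs.shape[-1]) < lengths.reshape(-1, 1)
log_prob = np.sum(np.log(seq_probs) * mask, axis=-1)
# only the first `length` positions of each row contribute to the sum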
Example 9
def cli(sys_argv):
    parser = argparse.ArgumentParser(
        description="This script searches for optimal Hyperparameters",
        prog="ludwig hyperopt",
        usage="%(prog)s [options]",
    )

    # -------------------
    # Hyperopt parameters
    # -------------------
    parser.add_argument(
        "-sshs",
        "--skip_save_hyperopt_statistics",
        help="skips saving hyperopt statistics file",
        action="store_true",
        default=False,
    )

    # ----------------------------
    # Experiment naming parameters
    # ----------------------------
    parser.add_argument(
        "--output_directory",
        type=str,
        default="results",
        help="directory that contains the results",
    )
    parser.add_argument("--experiment_name",
                        type=str,
                        default="hyperopt",
                        help="experiment name")
    parser.add_argument("--model_name",
                        type=str,
                        default="run",
                        help="name for the model")

    # ---------------
    # Data parameters
    # ---------------
    parser.add_argument(
        '--dataset',
        help='input data file path. '
        'If it has a split column, it will be used for splitting '
        '(0: train, 1: validation, 2: test), '
        'otherwise the dataset will be randomly split')
    parser.add_argument('--training_set', help='input train data file path')
    parser.add_argument('--validation_set',
                        help='input validation data file path')
    parser.add_argument('--test_set', help='input test data file path')

    parser.add_argument(
        '--training_set_metadata',
        help='input metadata JSON file path. An intermediate preprocessed file '
        'containing the mappings of the input file created '
        'the first time a file is used, in the same directory '
        'with the same name and a .json extension')

    parser.add_argument('--data_format',
                        help='format of the input data',
                        default='auto',
                        choices=[
                            'auto', 'csv', 'excel', 'feather', 'fwf', 'hdf5',
                            'html', 'json', 'jsonl', 'parquet', 'pickle',
                            'sas', 'spss', 'stata', 'tsv'
                        ])

    parser.add_argument(
        "-sspi",
        "--skip_save_processed_input",
        help="skips saving intermediate HDF5 and JSON files",
        action="store_true",
        default=False,
    )

    # ----------------
    # Model parameters
    # ----------------
    config = parser.add_mutually_exclusive_group(required=True)
    config.add_argument("-c", "--config", type=yaml.safe_load, help="config")
    config.add_argument(
        "-cf",
        "--config_file",
        help="YAML file describing the model. Ignores --model_hyperparameters",
    )

    parser.add_argument(
        "-mlp",
        "--model_load_path",
        help="path of a pretrained model to load as initialization",
    )
    parser.add_argument(
        "-mrp",
        "--model_resume_path",
        help="path of the model directory to resume training of",
    )
    parser.add_argument(
        "-sstd",
        "--skip_save_training_description",
        action="store_true",
        default=False,
        help="disables saving the description JSON file",
    )
    parser.add_argument(
        "-ssts",
        "--skip_save_training_statistics",
        action="store_true",
        default=False,
        help="disables saving training statistics JSON file",
    )
    parser.add_argument(
        "-ssm",
        "--skip_save_model",
        action="store_true",
        default=False,
        help="disables saving weights each time the model improves. "
        "By default Ludwig saves  weights after each epoch "
        "the validation metric imrpvoes, but  if the model is really big "
        "that can be time consuming. If you do not want to keep "
        "the weights and just find out what performance a model can get "
        "with a set of hyperparameters, use this parameter to skip it",
    )
    parser.add_argument(
        "-ssp",
        "--skip_save_progress",
        action="store_true",
        default=False,
        help="disables saving weights after each epoch. By default ludwig saves "
        "weights after each epoch for enabling resuming of training, but "
        "if the model is really big that can be time consuming and will "
        "save twice as much space, use this parameter to skip it",
    )
    parser.add_argument(
        "-ssl",
        "--skip_save_log",
        action="store_true",
        default=False,
        help="disables saving TensorBoard logs. By default Ludwig saves "
        "logs for the TensorBoard, but if it is not needed turning it off "
        "can slightly increase the overall speed",
    )

    # ------------------
    # Runtime parameters
    # ------------------
    parser.add_argument(
        "-rs",
        "--random_seed",
        type=int,
        default=42,
        help="a random seed that is going to be used anywhere there is a call "
        "to a random number generator: data splitting, parameter "
        "initialization and training set shuffling",
    )
    parser.add_argument("-g",
                        "--gpus",
                        nargs="+",
                        type=int,
                        default=None,
                        help="list of gpus to use")
    parser.add_argument('-gml',
                        '--gpu_memory_limit',
                        type=int,
                        default=None,
                        help='maximum memory in MB to allocate per GPU device')
    parser.add_argument(
        "-uh",
        "--use_horovod",
        action="store_true",
        default=False,
        help="uses horovod for distributed training",
    )
    parser.add_argument(
        "-dbg",
        "--debug",
        action="store_true",
        default=False,
        help="enables debugging mode",
    )
    parser.add_argument(
        "-l",
        "--logging_level",
        default="info",
        help="the level of logging to use",
        choices=["critical", "error", "warning", "info", "debug", "notset"],
    )

    args = parser.parse_args(sys_argv)

    args.logging_level = logging_level_registry[args.logging_level]
    logging.getLogger('ludwig').setLevel(args.logging_level)
    global logger
    logger = logging.getLogger('ludwig.hyperopt')

    set_on_master(args.use_horovod)

    if is_on_master():
        print_ludwig("Hyperopt", LUDWIG_VERSION)

    hyperopt_cli(**vars(args))
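
A possible way to call this parser directly, using only flags defined above (file names are made up):

cli(['--config_file', 'hyperopt_config.yaml', '--dataset', 'data.csv'])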
Example 10
def cli(sys_argv):
    parser = argparse.ArgumentParser(
        description='This script trains and tests a model',
        prog='ludwig experiment',
        usage='%(prog)s [options]')

    # ----------------------------
    # Experiment naming parameters
    # ----------------------------
    parser.add_argument('--output_directory',
                        type=str,
                        default='results',
                        help='directory that contains the results')
    parser.add_argument('--experiment_name',
                        type=str,
                        default='experiment',
                        help='experiment name')
    parser.add_argument('--model_name',
                        type=str,
                        default='run',
                        help='name for the model')

    # ---------------
    # Data parameters
    # ---------------
    parser.add_argument(
        '--dataset',
        help='input data file path. '
        'If it has a split column, it will be used for splitting '
        '(0: train, 1: validation, 2: test), '
        'otherwise the dataset will be randomly split')
    parser.add_argument('--training_set', help='input train data file path')
    parser.add_argument('--validation_set',
                        help='input validation data file path')
    parser.add_argument('--test_set', help='input test data file path')

    parser.add_argument(
        '--training_set_metadata',
        help='input metadata JSON file path. An intermediate preprocess file '
        'containing the mappings of the input file created '
        'the first time a file is used, in the same directory '
        'with the same name and a .json extension')

    parser.add_argument('--data_format',
                        help='format of the input data',
                        default='auto',
                        choices=['auto', 'csv', 'hdf5'])

    parser.add_argument('-sspi',
                        '--skip_save_processed_input',
                        help='skips saving intermediate HDF5 and JSON files',
                        action='store_true',
                        default=False)
    parser.add_argument('-ssuo',
                        '--skip_save_unprocessed_output',
                        help='skips saving intermediate NPY output files',
                        action='store_true',
                        default=False)

    # -----------------
    # K-fold parameters
    # -----------------
    parser.add_argument(
        '-kf',
        '--k_fold',
        type=int,
        default=None,
        help='number of folds for a k-fold cross validation run ')
    parser.add_argument(
        '-skfsi',
        '--skip_save_k_fold_split_indices',
        action='store_true',
        default=False,
        help='disables saving indices generated to split training data set '
        'for the k-fold cross validation run, but if it is not needed '
        'turning it off can slightly increase the overall speed')

    # ----------------
    # Model parameters
    # ----------------
    model_definition = parser.add_mutually_exclusive_group(required=True)
    model_definition.add_argument('-md',
                                  '--model_definition',
                                  type=yaml.safe_load,
                                  help='model definition')
    model_definition.add_argument(
        '-mdf',
        '--model_definition_file',
        help='YAML file describing the model. Ignores --model_hyperparameters')

    parser.add_argument(
        '-mlp',
        '--model_load_path',
        help='path of a pretrained model to load as initialization')
    parser.add_argument(
        '-mrp',
        '--model_resume_path',
        help='path of the model directory to resume training of')
    parser.add_argument('-sstd',
                        '--skip_save_training_description',
                        action='store_true',
                        default=False,
                        help='disables saving the description JSON file')
    parser.add_argument('-ssts',
                        '--skip_save_training_statistics',
                        action='store_true',
                        default=False,
                        help='disables saving training statistics JSON file')
    parser.add_argument('-sstp',
                        '--skip_save_predictions',
                        help='skips saving test predictions CSV files',
                        action='store_true',
                        default=False)
    parser.add_argument('-sstes',
                        '--skip_save_eval_stats',
                        help='skips saving eval statistics JSON file',
                        action='store_true',
                        default=False)
    parser.add_argument(
        '-ssm',
        '--skip_save_model',
        action='store_true',
        default=False,
        help='disables saving model weights and hyperparameters each time '
        'the model improves. '
        'By default Ludwig saves model weights after each epoch '
        'the validation metric improves, but if the model is really big '
        'that can be time consuming. If you do not want to keep '
        'the weights and just find out what performance a model can get '
        'with a set of hyperparameters, use this parameter to skip it, '
        'but the model will not be loadable later on')
    parser.add_argument(
        '-ssp',
        '--skip_save_progress',
        action='store_true',
        default=False,
        help='disables saving progress each epoch. By default Ludwig saves '
        'weights and stats after each epoch to enable resuming '
        'of training, but if the model is really big that can be '
        'time consuming and will use twice as much space; use '
        'this parameter to skip it, but training cannot be resumed '
        'later on')
    parser.add_argument(
        '-ssl',
        '--skip_save_log',
        action='store_true',
        default=False,
        help='disables saving TensorBoard logs. By default Ludwig saves '
        'logs for the TensorBoard, but if it is not needed turning it off '
        'can slightly increase the overall speed')

    # ------------------
    # Runtime parameters
    # ------------------
    parser.add_argument(
        '-rs',
        '--random_seed',
        type=int,
        default=42,
        help='a random seed that is going to be used anywhere there is a call '
        'to a random number generator: data splitting, parameter '
        'initialization and training set shuffling')
    parser.add_argument('-g',
                        '--gpus',
                        nargs='+',
                        type=int,
                        default=None,
                        help='list of GPUs to use')
    parser.add_argument('-gml',
                        '--gpu_memory_limit',
                        type=int,
                        default=None,
                        help='maximum memory in MB to allocate per GPU device')
    parser.add_argument(
        '-dpt',
        '--disable_parallel_threads',
        action='store_false',
        dest='allow_parallel_threads',
        help='disable TensorFlow from using multithreading for reproducibility'
    )
    parser.add_argument('-uh',
                        '--use_horovod',
                        action='store_true',
                        default=None,
                        help='uses horovod for distributed training')
    parser.add_argument('-dbg',
                        '--debug',
                        action='store_true',
                        default=False,
                        help='enables debugging mode')
    parser.add_argument(
        '-l',
        '--logging_level',
        default='info',
        help='the level of logging to use',
        choices=['critical', 'error', 'warning', 'info', 'debug', 'notset'])

    args = parser.parse_args(sys_argv)

    args.logging_level = logging_level_registry[args.logging_level]
    logging.getLogger('ludwig').setLevel(args.logging_level)
    global logger
    logger = logging.getLogger('ludwig.experiment')

    set_on_master(args.use_horovod)

    if is_on_master():
        print_ludwig('Experiment', LUDWIG_VERSION)

    if args.k_fold is None:
        experiment_cli(**vars(args))
    else:
        kfold_cross_validate_cli(**vars(args))
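
When --k_fold is given, the same parser routes to the k-fold entry point instead of a single experiment; for example (file names made up):

# single experiment
cli(['--dataset', 'data.csv', '--model_definition_file', 'model.yaml'])

# 5-fold cross validation
cli(['--dataset', 'data.csv', '--model_definition_file', 'model.yaml', '-kf', '5'])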
Example 11
    def postprocess_predictions(
            self,
            result,
            metadata,
            output_directory,
            skip_save_unprocessed_output=False,
    ):
        # todo: refactor to reuse SequenceOutputFeature.postprocess_predictions
        postprocessed = {}
        name = self.feature_name
        level_idx2str = '{}_{}'.format(self.level, 'idx2str')

        npy_filename = None
        if is_on_master():
            npy_filename = os.path.join(output_directory, '{}_{}.npy')
        else:
            skip_save_unprocessed_output = True

        if PREDICTIONS in result and len(result[PREDICTIONS]) > 0:
            preds = result[PREDICTIONS].numpy()
            if level_idx2str in metadata:
                postprocessed[PREDICTIONS] = [
                    [metadata[level_idx2str][token]
                     if token < len(
                        metadata[level_idx2str]) else UNKNOWN_SYMBOL
                     for token in pred]
                    for pred in preds
                ]
            else:
                postprocessed[PREDICTIONS] = preds

            if not skip_save_unprocessed_output:
                np.save(npy_filename.format(name, PREDICTIONS), preds)

            del result[PREDICTIONS]

        if LAST_PREDICTIONS in result and len(result[LAST_PREDICTIONS]) > 0:
            last_preds = result[LAST_PREDICTIONS].numpy()
            if level_idx2str in metadata:
                postprocessed[LAST_PREDICTIONS] = [
                    metadata[level_idx2str][last_pred]
                    if last_pred < len(
                        metadata[level_idx2str]) else UNKNOWN_SYMBOL
                    for last_pred in last_preds
                ]
            else:
                postprocessed[LAST_PREDICTIONS] = last_preds

            if not skip_save_unprocessed_output:
                np.save(npy_filename.format(name, LAST_PREDICTIONS),
                        last_preds)

            del result[LAST_PREDICTIONS]

        if PROBABILITIES in result and len(result[PROBABILITIES]) > 0:
            probs = result[PROBABILITIES]
            if probs is not None:

                if len(probs) > 0 and isinstance(probs[0], list):
                    prob = []
                    for i in range(len(probs)):
                        for j in range(len(probs[i])):
                            probs[i][j] = np.max(probs[i][j])
                        prob.append(np.prod(probs[i]))
                else:
                    probs = np.amax(probs, axis=-1)
                    prob = np.prod(probs, axis=-1)

                # commenting probabilities out because usually it is huge:
                # dataset x length x classes
                # todo: add a mechanism for letting the user decide to save it
                # postprocessed[PROBABILITIES] = probs
                postprocessed[PROBABILITY] = prob

                if not skip_save_unprocessed_output:
                    # commenting probabilities out, see comment above
                    # np.save(npy_filename.format(name, PROBABILITIES), probs)
                    np.save(npy_filename.format(name, PROBABILITY), prob)

            del result[PROBABILITIES]

        if LENGTHS in result:
            del result[LENGTHS]

        return postprocessed
Example 12
def cli(sys_argv):
    parser = argparse.ArgumentParser(
        description='This script loads a pretrained model '
        'and evaluates its performance by comparing '
        'its predictions with ground truth.',
        prog='ludwig evaluate',
        usage='%(prog)s [options]')

    # ---------------
    # Data parameters
    # ---------------
    parser.add_argument('--dataset',
                        help='input data file path',
                        required=True)
    parser.add_argument('--data_format',
                        help='format of the input data',
                        default='auto',
                        choices=[
                            'auto', 'csv', 'excel', 'feather', 'fwf', 'hdf5',
                            'html', 'json', 'jsonl', 'parquet', 'pickle',
                            'sas', 'spss', 'stata', 'tsv'
                        ])
    parser.add_argument('-s',
                        '--split',
                        default=FULL,
                        choices=[TRAINING, VALIDATION, TEST, FULL],
                        help='the split to test the model on')

    # ----------------
    # Model parameters
    # ----------------
    parser.add_argument('-m',
                        '--model_path',
                        help='model to load',
                        required=True)

    # -------------------------
    # Output results parameters
    # -------------------------
    parser.add_argument('-od',
                        '--output_directory',
                        type=str,
                        default='results',
                        help='directory that contains the results')
    parser.add_argument('-ssuo',
                        '--skip_save_unprocessed_output',
                        help='skips saving intermediate NPY output files',
                        action='store_true',
                        default=False)
    parser.add_argument('-sses',
                        '--skip_save_eval_stats',
                        help='skips saving intermediate JSON eval statistics',
                        action='store_true',
                        default=False)
    parser.add_argument('-scp',
                        '--skip_collect_predictions',
                        help='skips collecting predictions',
                        action='store_true',
                        default=False)
    parser.add_argument('-scos',
                        '--skip_collect_overall_stats',
                        help='skips collecting overall stats',
                        action='store_true',
                        default=False)

    # ------------------
    # Generic parameters
    # ------------------
    parser.add_argument('-bs',
                        '--batch_size',
                        type=int,
                        default=128,
                        help='size of batches')

    # ------------------
    # Runtime parameters
    # ------------------
    parser.add_argument('-g',
                        '--gpus',
                        type=int,
                        default=0,
                        help='list of gpus to use')
    parser.add_argument('-gml',
                        '--gpu_memory_limit',
                        type=int,
                        default=None,
                        help='maximum memory in MB to allocate per GPU device')
    parser.add_argument(
        '-dpt',
        '--disable_parallel_threads',
        action='store_false',
        dest='allow_parallel_threads',
        help='disable TensorFlow from using multithreading for reproducibility'
    )
    parser.add_argument('-uh',
                        '--use_horovod',
                        action='store_true',
                        default=None,
                        help='uses horovod for distributed training')
    parser.add_argument('-dbg',
                        '--debug',
                        action='store_true',
                        default=False,
                        help='enables debugging mode')
    parser.add_argument(
        '-l',
        '--logging_level',
        default='info',
        help='the level of logging to use',
        choices=['critical', 'error', 'warning', 'info', 'debug', 'notset'])

    args = parser.parse_args(sys_argv)
    args.evaluate_performance = True

    args.logging_level = logging_level_registry[args.logging_level]
    logging.getLogger('ludwig').setLevel(args.logging_level)
    global logger
    logger = logging.getLogger('ludwig.test_performance')

    set_on_master(args.use_horovod)

    if is_on_master():
        print_ludwig('Test', LUDWIG_VERSION)
        logger.info('Dataset path: {}'.format(args.dataset))
        logger.info('Model path: {}'.format(args.model_path))
        logger.info('')

    evaluate_cli(**vars(args))
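
--dataset and --model_path are the only required arguments here; a minimal call might look like (paths made up):

cli(['--dataset', 'data.csv', '--model_path', 'results/experiment_run/model'])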
Example 13
    def train(
            self,
            dataset=None,
            training_set=None,
            validation_set=None,
            test_set=None,
            training_set_metadata=None,
            data_format=None,
            experiment_name='api_experiment',
            model_name='run',
            model_resume_path=None,
            skip_save_training_description=False,
            skip_save_training_statistics=False,
            skip_save_model=False,
            skip_save_progress=False,
            skip_save_log=False,
            skip_save_processed_input=False,
            output_directory='results',
            random_seed=default_random_seed,
            debug=False,
            **kwargs
    ):
        """This function is used to perform a full training of the model on the
           specified dataset.

        # Inputs

        :param dataset: (string, dict, DataFrame) source containing the entire dataset.
               If it has a split column, it will be used for splitting (0: train,
               1: validation, 2: test), otherwise the dataset will be randomly split.
        :param training_set: (string, dict, DataFrame) source containing training data.
        :param validation_set: (string, dict, DataFrame) source containing validation data.
        :param test_set: (string, dict, DataFrame) source containing test data.
        :param training_set_metadata: (string, dict) metadata JSON file or loaded metadata.
               Intermediate preprocess structure containing the mappings of the input
               CSV created the first time a CSV file is used in the same
               directory with the same name and a '.json' extension.
        :param data_format: (string) format to interpret data sources. Will be inferred
               automatically if not specified.
        :param experiment_name: (string) a name for the experiment, used for the save
               directory
        :param model_name: (string) a name for the model, used for the save
               directory
        :param model_resume_path: (string) path of the model directory to
               resume training of
        :param skip_save_training_description: (bool, default: `False`) disables
               saving the description JSON file.
        :param skip_save_training_statistics: (bool, default: `False`) disables
               saving training statistics JSON file.
        :param skip_save_model: (bool, default: `False`) disables
               saving model weights and hyperparameters each time the model
               improves. By default Ludwig saves model weights after each epoch
               the validation metric improves, but if the model is really big
               that can be time consuming. If you do not want to keep
               the weights and just find out what performance a model can get
               with a set of hyperparameters, use this parameter to skip it,
               but the model will not be loadable later on.
        :param skip_save_progress: (bool, default: `False`) disables saving
               progress each epoch. By default Ludwig saves weights and stats
               after each epoch for enabling resuming of training, but if
               the model is really big that can be time consuming and will use
               twice as much space; use this parameter to skip it, but training
               cannot be resumed later on.
        :param skip_save_log: (bool, default: `False`) disables saving TensorBoard
               logs. By default Ludwig saves logs for the TensorBoard, but if it
               is not needed turning it off can slightly increase the
               overall speed.
        :param skip_save_processed_input: (bool, default: `False`) skips saving
               intermediate HDF5 and JSON files
        :param output_directory: (string, default: `'results'`) directory that
               contains the results
        :param random_seed: (int, default: `42`) a random seed that is going to be
               used anywhere there is a call to a random number generator: data
               splitting, parameter initialization and training set shuffling
        :param debug: (bool, default: `False`) enables debugging mode

        There are three ways to provide data: by dataframes using the `_df`
        parameters, by CSV using the `_csv` parameters and by HDF5 and JSON,
        using `_hdf5` and `_json` parameters.
        The DataFrame approach uses data previously obtained and put in a
        dataframe, the CSV approach loads data from a CSV file, while HDF5 and
        JSON load previously preprocessed HDF5 and JSON files (they are saved in
        the same directory of the CSV they are obtained from).
        For all three approaches either a full dataset can be provided (which
        will be split randomly according to the split probabilities defined in
        the model definition, by default 70% training, 10% validation and 20%
        test) or, if it contains a split column, it will be split according to
        that column (interpreting 0 as training, 1 as validation and 2 as test).
        Alternatively separate dataframes / CSV / HDF5 files can be provided
        for each split.

        During training the model and statistics will be saved in a directory
        `[output_dir]/[experiment_name]_[model_name]_n` where all variables are
        resolved to user specified ones and `n` is an increasing number
        starting from 0 used to differentiate different runs.


        # Return

        :return: ((dict, tuple, str)) tuple containing:
            - A dictionary of training statistics for each output feature,
              containing loss and metric values for each epoch.
            - The preprocessed data (training set, validation set, test set
              and training set metadata).
            - The output directory of the run.
        """
        # setup directories and file names
        if model_resume_path is not None:
            if os.path.exists(model_resume_path):
                output_directory = model_resume_path
            else:
                if is_on_master():
                    logger.info(
                        'Model resume path does not exist, '
                        'starting training from scratch'
                    )
                model_resume_path = None

        if model_resume_path is None:
            if is_on_master():
                output_directory = get_output_directory(
                    output_directory,
                    experiment_name,
                    model_name
                )
            else:
                output_directory = None

        # if we are skipping all saving,
        # there is no need to create a directory that will remain empty
        should_create_output_directory = not (
                skip_save_training_description and
                skip_save_training_statistics and
                skip_save_model and
                skip_save_progress and
                skip_save_log and
                skip_save_processed_input
        )

        description_fn = training_stats_fn = model_dir = None
        if is_on_master():
            if should_create_output_directory:
                if not os.path.exists(output_directory):
                    os.makedirs(output_directory, exist_ok=True)
            description_fn, training_stats_fn, model_dir = get_file_names(
                output_directory)

        # save description
        if is_on_master():
            description = get_experiment_description(
                self.model_definition,
                dataset=dataset,
                training_set=training_set,
                validation_set=validation_set,
                test_set=test_set,
                training_set_metadata=training_set_metadata,
                data_format=data_format,
                random_seed=random_seed
            )
            if not skip_save_training_description:
                save_json(description_fn, description)
            # print description
            logger.info('Experiment name: {}'.format(experiment_name))
            logger.info('Model name: {}'.format(model_name))
            logger.info('Output directory: {}'.format(output_directory))
            logger.info('\n')
            for key, value in description.items():
                logger.info('{}: {}'.format(key, pformat(value, indent=4)))
            logger.info('\n')

        # preprocess
        preprocessed_data = preprocess_for_training(
            self.model_definition,
            dataset=dataset,
            training_set=training_set,
            validation_set=validation_set,
            test_set=test_set,
            training_set_metadata=training_set_metadata,
            data_format=data_format,
            skip_save_processed_input=skip_save_processed_input,
            preprocessing_params=self.model_definition[PREPROCESSING],
            random_seed=random_seed
        )

        (training_set,
         validation_set,
         test_set,
         training_set_metadata) = preprocessed_data
        self.training_set_metadata = training_set_metadata

        if is_on_master():
            logger.info('Training set: {0}'.format(training_set.size))
            if validation_set is not None:
                logger.info('Validation set: {0}'.format(validation_set.size))
            if test_set is not None:
                logger.info('Test set: {0}'.format(test_set.size))

        if is_on_master():
            if not skip_save_model:
                # save train set metadata
                os.makedirs(model_dir, exist_ok=True)
                save_json(
                    os.path.join(
                        model_dir,
                        TRAIN_SET_METADATA_FILE_NAME
                    ),
                    training_set_metadata
                )

        contrib_command("train_init", experiment_directory=output_directory,
                        experiment_name=experiment_name, model_name=model_name,
                        output_directory=output_directory,
                        resume=model_resume_path is not None)

        # Build model if not provided
        # if it was provided it means it was already loaded
        if not self.model:
            if is_on_master():
                print_boxed('MODEL', print_fun=logger.debug)
            # update model definition with metadata properties
            update_model_definition_with_metadata(
                self.model_definition,
                training_set_metadata
            )
            self.model = LudwigModel.create_model(self.model_definition,
                                                  random_seed=random_seed)

        # init trainer
        trainer = Trainer(
            **self.model_definition[TRAINING],
            resume=model_resume_path is not None,
            skip_save_model=skip_save_model,
            skip_save_progress=skip_save_progress,
            skip_save_log=skip_save_log,
            random_seed=random_seed,
            horovod=self._horovod,
            debug=debug
        )

        contrib_command("train_model", self.model, self.model_definition,
                        self.model_definition_fp)

        # train model
        if is_on_master():
            print_boxed('TRAINING')
            if not skip_save_model:
                self.save_model_definition(model_dir)

        train_stats = trainer.train(
            self.model,
            training_set,
            validation_set=validation_set,
            test_set=test_set,
            save_path=model_dir,
        )

        train_trainset_stats, train_valiset_stats, train_testset_stats = train_stats
        train_stats = {
            TRAINING: train_trainset_stats,
            VALIDATION: train_valiset_stats,
            TEST: train_testset_stats
        }

        # save training statistics
        if is_on_master():
            if not skip_save_training_statistics:
                save_json(training_stats_fn, train_stats)

        # grab the results of the model with highest validation test performance
        validation_field = trainer.validation_field
        validation_metric = trainer.validation_metric
        validation_field_result = train_valiset_stats[validation_field]

        best_function = get_best_function(validation_metric)
        # results of the model with highest validation test performance
        if is_on_master() and validation_set is not None:
            epoch_best_vali_metric, best_vali_metric = best_function(
                enumerate(validation_field_result[validation_metric]),
                key=lambda pair: pair[1]
            )
            logger.info(
                'Best validation model epoch: {0}'.format(
                    epoch_best_vali_metric + 1)
            )
            logger.info(
                'Best validation model {0} on validation set {1}: {2}'.format(
                    validation_metric, validation_field, best_vali_metric
                ))
            if test_set is not None:
                best_vali_metric_epoch_test_metric = train_testset_stats[
                    validation_field][validation_metric][
                    epoch_best_vali_metric]

                logger.info(
                    'Best validation model {0} on test set {1}: {2}'.format(
                        validation_metric,
                        validation_field,
                        best_vali_metric_epoch_test_metric
                    )
                )
            logger.info(
                '\nFinished: {0}_{1}'.format(experiment_name, model_name))
            logger.info('Saved to: {0}'.format(output_directory))

        contrib_command("train_save", output_directory)

        self.training_set_metadata = training_set_metadata

        if not skip_save_model:
            # Load the best weights from saved checkpoint
            self.load_weights(model_dir)

        return train_stats, preprocessed_data, output_directory
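
A hedged usage sketch for this method, assuming a LudwigModel-like object exposing it (the constructor is not shown in the snippet above):

# model = LudwigModel(model_definition)   # hypothetical construction
# train_stats, preprocessed_data, output_directory = model.train(
#     dataset='train.csv',
#     experiment_name='api_experiment',
#     model_name='run',
#     skip_save_processed_input=True,
# )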
Example 14
    def postprocess_predictions(
        self,
        predictions,
        metadata,
        output_directory,
        skip_save_unprocessed_output=False,
    ):
        postprocessed = {}
        name = self.feature_name

        npy_filename = None
        if is_on_master():
            npy_filename = os.path.join(output_directory, '{}_{}.npy')
        else:
            skip_save_unprocessed_output = True

        if PREDICTIONS in predictions and len(predictions[PREDICTIONS]) > 0:
            preds = predictions[PREDICTIONS]
            if 'idx2str' in metadata:
                postprocessed[PREDICTIONS] = [
                    metadata['idx2str'][pred] for pred in preds
                ]

            else:
                postprocessed[PREDICTIONS] = preds

            if not skip_save_unprocessed_output:
                np.save(npy_filename.format(name, PREDICTIONS), preds)

            del predictions[PREDICTIONS]

        if PROBABILITIES in predictions and len(
                predictions[PROBABILITIES]) > 0:
            probs = predictions[PROBABILITIES].numpy()
            prob = np.amax(probs, axis=1)
            postprocessed[PROBABILITIES] = probs
            postprocessed[PROBABILITY] = prob

            if not skip_save_unprocessed_output:
                np.save(npy_filename.format(name, PROBABILITIES), probs)
                np.save(npy_filename.format(name, PROBABILITY), prob)

            del predictions[PROBABILITIES]

        if ('predictions_top_k' in predictions
                and len(predictions['predictions_top_k']) > 0):

            preds_top_k = predictions['predictions_top_k']
            if 'idx2str' in metadata:
                postprocessed['predictions_top_k'] = [[
                    metadata['idx2str'][pred] for pred in pred_top_k
                ] for pred_top_k in preds_top_k]
            else:
                postprocessed['predictions_top_k'] = preds_top_k

            if not skip_save_unprocessed_output:
                np.save(npy_filename.format(name, 'predictions_top_k'),
                        preds_top_k)

            del predictions['predictions_top_k']

        return postprocessed
Example 15
def hyperopt(
    config: Union[str, dict],
    dataset: Union[str, dict, pd.DataFrame] = None,
    training_set: Union[str, dict, pd.DataFrame] = None,
    validation_set: Union[str, dict, pd.DataFrame] = None,
    test_set: Union[str, dict, pd.DataFrame] = None,
    training_set_metadata: Union[str, dict] = None,
    data_format: str = None,
    experiment_name: str = 'hyperopt',
    model_name: str = 'run',
    skip_save_training_description: bool = False,
    skip_save_training_statistics: bool = False,
    skip_save_model: bool = False,
    skip_save_progress: bool = False,
    skip_save_log: bool = False,
    skip_save_processed_input: bool = False,
    skip_save_unprocessed_output: bool = False,
    skip_save_predictions: bool = False,
    skip_save_eval_stats: bool = False,
    skip_save_hyperopt_statistics: bool = False,
    output_directory: str = 'results',
    gpus: Union[str, int, List[int]] = None,
    gpu_memory_limit: int = None,
    allow_parallel_threads: bool = True,
    use_horovod: bool = None,
    random_seed: int = default_random_seed,
    debug: bool = False,
    **kwargs,
) -> List[dict]:
    """This method performs an hyperparameter optimization.

    # Inputs

    :param config: (Union[str, dict]) config which defines
        the different parameters of the model, features, preprocessing and
        training.  If `str`, filepath to yaml configuration file.
    :param dataset: (Union[str, dict, pandas.DataFrame], default: `None`)
        source containing the entire dataset to be used in the experiment.
        If it has a split column, it will be used for splitting (0 for train,
        1 for validation, 2 for test), otherwise the dataset will be
        randomly split.
    :param training_set: (Union[str, dict, pandas.DataFrame], default: `None`)
        source containing training data.
    :param validation_set: (Union[str, dict, pandas.DataFrame], default: `None`)
        source containing validation data.
    :param test_set: (Union[str, dict, pandas.DataFrame], default: `None`)
        source containing test data.
    :param training_set_metadata: (Union[str, dict], default: `None`)
        metadata JSON file or loaded metadata.  Intermediate preprocess
        structure containing the mappings of the input
        dataset created the first time an input file is used in the same
        directory with the same name and a '.meta.json' extension.
    :param data_format: (str, default: `None`) format to interpret data
        sources. Will be inferred automatically if not specified.  Valid
        formats are `'auto'`, `'csv'`, `'df'`, `'dict'`, `'excel'`, `'feather'`,
        `'fwf'`, `'hdf5'` (cache file produced during previous training),
        `'html'` (file containing a single HTML `<table>`), `'json'`, `'jsonl'`,
        `'parquet'`, `'pickle'` (pickled Pandas DataFrame), `'sas'`, `'spss'`,
        `'stata'`, `'tsv'`.
    :param experiment_name: (str, default: `'hyperopt'`) name for
        the experiment.
    :param model_name: (str, default: `'run'`) name of the model that is
        being used.
    :param skip_save_training_description: (bool, default: `False`) disables
        saving the description JSON file.
    :param skip_save_training_statistics: (bool, default: `False`) disables
        saving training statistics JSON file.
    :param skip_save_model: (bool, default: `False`) disables
        saving model weights and hyperparameters each time the model
        improves. By default Ludwig saves model weights after each epoch
        the validation metric improves, but if the model is really big
        that can be time consuming. If you do not want to keep the weights
        and just want to find out what performance a model can get with a
        set of hyperparameters, use this parameter to skip it; the model
        will not be loadable later on, and the returned model will have
        the weights obtained at the end of training instead of the weights
        of the epoch with the best validation performance.
    :param skip_save_progress: (bool, default: `False`) disables saving
        progress each epoch. By default Ludwig saves weights and stats
        after each epoch to enable resuming of training, but if
        the model is really big that can be time consuming and will use
        twice as much space; use this parameter to skip it, but training
        cannot be resumed later on.
    :param skip_save_log: (bool, default: `False`) disables saving
        TensorBoard logs. By default Ludwig saves logs for TensorBoard,
        but if they are not needed turning them off can slightly increase
        the overall speed.
    :param skip_save_processed_input: (bool, default: `False`) if an input
        dataset is provided it is preprocessed and cached by saving HDF5
        and JSON files to avoid running the preprocessing again. If this
        parameter is `True`, the HDF5 and JSON files are not saved.
    :param skip_save_unprocessed_output: (bool, default: `False`) by default
        predictions and their probabilities are saved in both raw
        unprocessed numpy files containing tensors and as postprocessed
        CSV files (one for each output feature). If this parameter is True,
        only the CSV ones are saved and the numpy ones are skipped.
    :param skip_save_predictions: (bool, default: `False`) skips saving test
        predictions CSV files.
    :param skip_save_eval_stats: (bool, default: `False`) skips saving test
        statistics JSON file.
    :param skip_save_hyperopt_statistics: (bool, default: `False`) skips saving
        hyperopt stats file.
    :param output_directory: (str, default: `'results'`) the directory that
        will contain the training statistics, TensorBoard logs, the saved
        model and the training progress files.
    :param gpus: (list, default: `None`) list of GPUs that are available
        for training.
    :param gpu_memory_limit: (int, default: `None`) maximum memory in MB to
        allocate per GPU device.
    :param allow_parallel_threads: (bool, default: `True`) allow TensorFlow
        to use multithreading parallelism to improve performance at
        the cost of determinism.
    :param use_horovod: (bool, default: `None`) flag for using horovod.
    :param random_seed: (int, default: 42) random seed used for weights
        initialization, splits and any other random function.
    :param debug: (bool, default: `False`) if `True` turns on `tfdbg` with
        `inf_or_nan` checks.

    # Return

    :return: (List[dict]) The results for the hyperparameter optimization
    """
    # check if config is a path or a dict
    if isinstance(config, str):  # assume path
        with open(config, 'r') as def_file:
            config_dict = yaml.safe_load(def_file)
    else:
        config_dict = config

    # merge config with defaults
    config = merge_with_defaults(config_dict)

    if HYPEROPT not in config:
        raise ValueError("Hyperopt Section not present in config")

    hyperopt_config = config["hyperopt"]
    update_hyperopt_params_with_defaults(hyperopt_config)

    # print hyperopt config
    logger.info(pformat(hyperopt_config, indent=4))
    logger.info('\n')

    sampler = hyperopt_config["sampler"]
    executor = hyperopt_config["executor"]
    parameters = hyperopt_config["parameters"]
    split = hyperopt_config["split"]
    output_feature = hyperopt_config["output_feature"]
    metric = hyperopt_config["metric"]
    goal = hyperopt_config["goal"]

    ######################
    # check validity of output_feature / metric/ split combination
    ######################
    if split == TRAINING:
        if not training_set and (
                config['preprocessing']['split_probabilities'][0] <= 0):
            raise ValueError(
                'The data for the specified split for hyperopt "{}" '
                'was not provided, '
                'or the split amount specified in the preprocessing section '
                'of the config is not greater than 0'.format(split))
    elif split == VALIDATION:
        if not validation_set and (
                config['preprocessing']['split_probabilities'][1] <= 0):
            raise ValueError(
                'The data for the specified split for hyperopt "{}" '
                'was not provided, '
                'or the split amount specified in the preprocessing section '
                'of the config is not greater than 0'.format(split))
    elif split == TEST:
        if not test_set and (config['preprocessing']['split_probabilities'][2]
                             <= 0):
            raise ValueError(
                'The data for the specified split for hyperopt "{}" '
                'was not provided, '
                'or the split amount specified in the preprocessing section '
                'of the config is not greater than 0'.format(split))
    else:
        raise ValueError('unrecognized hyperopt split "{}". '
                         'Please provide one of: {}'.format(
                             split, {TRAINING, VALIDATION, TEST}))
    if output_feature == COMBINED:
        if metric != LOSS:
            raise ValueError(
                'The only valid metric for "combined" output feature is "loss"'
            )
    else:
        output_feature_names = set(of['name']
                                   for of in config['output_features'])
        if output_feature not in output_feature_names:
            raise ValueError('The output feature specified for hyperopt "{}" '
                             'cannot be found in the config. '
                             'Available ones are: {} and "combined"'.format(
                                 output_feature, output_feature_names))

        output_feature_type = None
        for of in config['output_features']:
            if of['name'] == output_feature:
                output_feature_type = of[TYPE]
        feature_class = get_from_registry(output_feature_type,
                                          output_type_registry)
        if metric not in feature_class.metric_functions:
            # todo v0.4: allow users to also specify metrics from the overall
            #  and per-class metrics from the training stats and, in general,
            #  postprocessed metrics
            raise ValueError(
                'The specified metric for hyperopt "{}" is not a valid metric '
                'for the specified output feature "{}" of type "{}". '
                'Available metrics are: {}'.format(
                    metric, output_feature, output_feature_type,
                    feature_class.metric_functions.keys()))

    hyperopt_sampler = get_build_hyperopt_sampler(sampler[TYPE])(goal,
                                                                 parameters,
                                                                 **sampler)
    hyperopt_executor = get_build_hyperopt_executor(executor[TYPE])(
        hyperopt_sampler, output_feature, metric, split, **executor)

    hyperopt_results = hyperopt_executor.execute(
        config,
        dataset=dataset,
        training_set=training_set,
        validation_set=validation_set,
        test_set=test_set,
        training_set_metadata=training_set_metadata,
        data_format=data_format,
        experiment_name=experiment_name,
        model_name=model_name,
        # model_load_path=None,
        # model_resume_path=None,
        skip_save_training_description=skip_save_training_description,
        skip_save_training_statistics=skip_save_training_statistics,
        skip_save_model=skip_save_model,
        skip_save_progress=skip_save_progress,
        skip_save_log=skip_save_log,
        skip_save_processed_input=skip_save_processed_input,
        skip_save_unprocessed_output=skip_save_unprocessed_output,
        skip_save_predictions=skip_save_predictions,
        skip_save_eval_stats=skip_save_eval_stats,
        output_directory=output_directory,
        gpus=gpus,
        gpu_memory_limit=gpu_memory_limit,
        allow_parallel_threads=allow_parallel_threads,
        use_horovod=use_horovod,
        random_seed=random_seed,
        debug=debug,
        **kwargs)

    if is_on_master():
        print_hyperopt_results(hyperopt_results)

        if not skip_save_hyperopt_statistics:
            if not os.path.exists(output_directory):
                os.makedirs(output_directory)

            hyperopt_stats = {
                'hyperopt_config': hyperopt_config,
                'hyperopt_results': hyperopt_results
            }

            save_hyperopt_stats(hyperopt_stats, output_directory)
            logger.info('Hyperopt stats saved to: {}'.format(output_directory))

    logger.info('Finished hyperopt')

    return hyperopt_results
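A hedged usage sketch of the `hyperopt` API defined above. The dataset path, feature names, sampler/executor types, and the exact search-space schema are assumptions for illustration and may not match a given Ludwig version:

# Illustrative only: 'train.csv', the feature names and the parameter spec
# are assumptions, not taken from the example above.
config = {
    'input_features': [{'name': 'text', 'type': 'text'}],
    'output_features': [{'name': 'class', 'type': 'category'}],
    'hyperopt': {
        'goal': 'minimize',
        'output_feature': 'class',
        'metric': 'loss',
        'split': 'validation',
        'parameters': {
            'training.learning_rate': {
                'type': 'float', 'low': 0.0001, 'high': 0.1, 'space': 'log'
            }
        },
        'sampler': {'type': 'random', 'num_samples': 10},  # assumed sampler type
        'executor': {'type': 'serial'},  # assumed executor type
    },
}

hyperopt_results = hyperopt(
    config,
    dataset='train.csv',
    output_directory='results',
)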
Example No. 16
def cli(sys_argv):
    parser = argparse.ArgumentParser(
        description='This script preprocesses a dataset.',
        prog='ludwig preprocess',
        usage='%(prog)s [options]')

    # ---------------
    # Data parameters
    # ---------------
    parser.add_argument(
        '--dataset',
        help='input data file path. '
        'If it has a split column, it will be used for splitting '
        '(0: train, 1: validation, 2: test), '
        'otherwise the dataset will be randomly split')
    parser.add_argument('--training_set', help='input train data file path')
    parser.add_argument('--validation_set',
                        help='input validation data file path')
    parser.add_argument('--test_set', help='input test data file path')

    parser.add_argument(
        '--training_set_metadata',
        help='input metadata JSON file path. An intermediate preprocessed file '
        'containing the mappings of the input file created '
        'the first time a file is used, in the same directory '
        'with the same name and a .json extension')

    parser.add_argument('--data_format',
                        help='format of the input data',
                        default='auto',
                        choices=[
                            'auto', 'csv', 'excel', 'feather', 'fwf', 'hdf5',
                            'html', 'json', 'jsonl', 'parquet', 'pickle',
                            'sas', 'spss', 'stata', 'tsv'
                        ])

    # ----------------
    # Model parameters
    # ----------------
    preprocessing_def = parser.add_mutually_exclusive_group(required=True)
    preprocessing_def.add_argument(
        '-pd',
        '--preprocessing_config',
        type=yaml.safe_load,
        help='preprocessing config. '
        'Uses the same format as the config, '
        'but ignores encoder specific parameters, '
        'decoder specific parameters, combiner and training parameters')
    preprocessing_def.add_argument(
        '-pcf',
        '--preprocessing_config_file',
        help='YAML file describing the preprocessing. '
        'Ignores --preprocessing_config. '
        'Uses the same format as the config, '
        'but ignores encoder specific parameters, '
        'decoder specific parameters, combiner and training parameters')

    # ------------------
    # Runtime parameters
    # ------------------
    parser.add_argument(
        '-rs',
        '--random_seed',
        type=int,
        default=42,
        help='a random seed that is going to be used anywhere there is a call '
        'to a random number generator: data splitting, parameter '
        'initialization and training set shuffling')
    parser.add_argument('-dbg',
                        '--debug',
                        action='store_true',
                        default=False,
                        help='enables debugging mode')
    parser.add_argument(
        '-l',
        '--logging_level',
        default='info',
        help='the level of logging to use',
        choices=['critical', 'error', 'warning', 'info', 'debug', 'notset'])

    args = parser.parse_args(sys_argv)

    args.logging_level = logging_level_registry[args.logging_level]
    logging.getLogger('ludwig').setLevel(args.logging_level)
    global logger
    logger = logging.getLogger('ludwig.preprocess')

    if is_on_master():
        print_ludwig('Preprocess', LUDWIG_VERSION)

    preprocess_cli(**vars(args))
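A possible way to drive this entry point programmatically, passing the argument list straight to `cli`; the file paths are placeholders:

# Equivalent to running `ludwig preprocess ...` on the command line;
# 'data.csv' and 'preprocessing.yaml' are placeholder paths.
cli([
    '--dataset', 'data.csv',
    '--preprocessing_config_file', 'preprocessing.yaml',
    '--data_format', 'csv',
    '--logging_level', 'info',
])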
Example No. 17
    def batch_evaluation(self,
                         model,
                         dataset,
                         collect_predictions=False,
                         dataset_name=None):
        batcher = initialize_batcher(dataset,
                                     self._batch_size,
                                     should_shuffle=False,
                                     horovod=self._horovod)

        progress_bar = None
        if is_on_master():
            progress_bar = tqdm(desc='Evaluation' if dataset_name is None else
                                'Evaluation {0: <5.5}'.format(dataset_name),
                                total=batcher.steps_per_epoch,
                                file=sys.stdout,
                                disable=is_progressbar_disabled())

        predictions = {}
        while not batcher.last_batch():
            batch = batcher.next_batch()

            inputs = {
                i_feat.feature_name: batch[i_feat.feature_name]
                for i_feat in model.input_features.values()
            }
            targets = {
                o_feat.feature_name: batch[o_feat.feature_name]
                for o_feat in model.output_features.values()
            }

            preds = model.evaluation_step(inputs, targets)

            # accumulate predictions from batch for each output feature
            if collect_predictions:
                for of_name, of_preds in preds.items():
                    if of_name not in predictions:
                        predictions[of_name] = {}
                    for pred_name, pred_values in of_preds.items():
                        if pred_name not in EXCLUE_PRED_SET and pred_values is not None:
                            if pred_name not in predictions[of_name]:
                                predictions[of_name][pred_name] = [pred_values]
                            else:
                                predictions[of_name][pred_name].append(
                                    pred_values)

            if is_on_master():
                progress_bar.update(1)

        if is_on_master():
            progress_bar.close()

        # consolidate predictions from each batch to a single tensor
        if collect_predictions:
            for of_name, of_predictions in predictions.items():
                for pred_name, pred_value_list in of_predictions.items():
                    predictions[of_name][pred_name] = tf.concat(
                        pred_value_list, axis=0)

        metrics = model.get_metrics()
        metrics = self.merge_workers_metrics(metrics)
        model.reset_metrics()

        return metrics, predictions
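The accumulate-then-concatenate pattern used in `batch_evaluation` above, shown in isolation on synthetic per-batch tensors:

import tensorflow as tf

# Synthetic per-batch predictions for a single output feature.
batch_outputs = [tf.constant([0, 1]), tf.constant([1, 1]), tf.constant([0])]

collected = []
for batch_preds in batch_outputs:
    collected.append(batch_preds)            # accumulate one tensor per batch

all_preds = tf.concat(collected, axis=0)     # consolidate into a single tensor
print(all_preds.numpy())                     # [0 1 1 1 0]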
Example No. 18
def hyperopt(
    model_definition,
    dataset=None,
    training_set=None,
    validation_set=None,
    test_set=None,
    training_set_metadata=None,
    data_format=None,
    experiment_name="hyperopt",
    model_name="run",
    # model_load_path=None,
    # model_resume_path=None,
    skip_save_training_description=True,
    skip_save_training_statistics=True,
    skip_save_model=False,  # False because we want to use the model with the best validation metric
    skip_save_progress=True,
    skip_save_log=True,
    skip_save_processed_input=True,
    skip_save_unprocessed_output=True,
    skip_save_predictions=True,
    skip_save_eval_stats=True,
    skip_save_hyperopt_statistics=False,
    output_directory="results",
    gpus=None,
    gpu_memory_limit=None,
    allow_parallel_threads=True,
    use_horovod=None,
    random_seed=default_random_seed,
    debug=False,
    **kwargs,
) -> dict:
    """This method performs a hyperparameter optimization.

    :param model_definition: Model definition which defines the different
           parameters of the model, features, preprocessing and training.
    :type model_definition: Dictionary
    :param dataset: Source containing the entire dataset.
           If it has a split column, it will be used for splitting (0: train,
           1: validation, 2: test), otherwise the dataset will be randomly split.
    :type dataset: Str, Dictionary, DataFrame
    :param training_set: Source containing training data.
    :type training_set: Str, Dictionary, DataFrame
    :param validation_set: Source containing validation data.
    :type validation_set: Str, Dictionary, DataFrame
    :param test_set: Source containing test data.
    :type test_set: Str, Dictionary, DataFrame
    :param training_set_metadata: Metadata JSON file or loaded metadata.
           Intermediate preprocess structure containing the mappings of the input
           CSV created the first time a CSV file is used in the same
           directory with the same name and a '.json' extension.
    :type training_set_metadata: Str, Dictionary
    :param data_format: Format to interpret data sources. Will be inferred
           automatically if not specified.
    :type data_format: Str
    :param experiment_name: The name for the experiment.
    :type experiment_name: Str
    :param model_name: Name of the model that is being used.
    :type model_name: Str
    :param skip_save_training_description: Disables saving
           the description JSON file.
    :type skip_save_training_description: Boolean
    :param skip_save_training_statistics: Disables saving
           training statistics JSON file.
    :type skip_save_training_statistics: Boolean
    :param skip_save_model: Disables
           saving model weights and hyperparameters each time the model
           improves. By default Ludwig saves model weights after each epoch
           the validation metric improves, but if the model is really big
           that can be time consuming. If you do not want to keep the
           weights and just want to find out what performance a model can
           get with a set of hyperparameters, use this parameter to skip
           it, but the model will not be loadable later on.
    :type skip_save_model: Boolean
    :param skip_save_progress: Disables saving
           progress each epoch. By default Ludwig saves weights and stats
           after each epoch to enable resuming of training, but if
           the model is really big that can be time consuming and will use
           twice as much space; use this parameter to skip it, but training
           cannot be resumed later on.
    :type skip_save_progress: Boolean
    :param skip_save_log: Disables saving TensorBoard
           logs. By default Ludwig saves logs for TensorBoard, but if they
           are not needed turning them off can slightly increase the
           overall speed.
    :type skip_save_log: Boolean
    :param skip_save_processed_input: If a CSV dataset is provided it is
           preprocessed and then saved as HDF5 and JSON files to avoid
           running the preprocessing again. If this parameter is True,
           the HDF5 and JSON files are not saved.
    :type skip_save_processed_input: Boolean
    :param skip_save_unprocessed_output: By default predictions and
           their probabilities are saved in both raw unprocessed numpy files
           containing tensors and as postprocessed CSV files
           (one for each output feature). If this parameter is True,
           only the CSV ones are saved and the numpy ones are skipped.
    :type skip_save_unprocessed_output: Boolean
    :param skip_save_predictions: skips saving test predictions CSV files
    :type skip_save_predictions: Boolean
    :param skip_save_eval_stats: skips saving test statistics JSON file
    :type skip_save_eval_stats: Boolean
    :param skip_save_hyperopt_statistics: skips saving hyperopt stats file
    :type skip_save_hyperopt_statistics: Boolean
    :param output_directory: The directory that will contain the training
           statistics, the saved model and the training progress files.
    :type output_directory: filepath (str)
    :param gpus: List of GPUs that are available for training.
    :type gpus: List
    :param gpu_memory_limit: maximum memory in MB to allocate per GPU device.
    :type gpu_memory_limit: Integer
    :param allow_parallel_threads: allow TensorFlow to use multithreading parallelism
           to improve performance at the cost of determinism.
    :type allow_parallel_threads: Boolean
    :param use_horovod: Flag for using horovod
    :type use_horovod: Boolean
    :param random_seed: Random seed used for weights initialization,
           splits and any other random function.
    :type random_seed: Integer
    :param debug: If true turns on tfdbg with inf_or_nan checks.
    :type debug: Boolean

    :return: (dict) The results of the hyperparameter optimization
    """
    # check if model definition is a path or a dict
    if isinstance(model_definition, str):  # assume path
        with open(model_definition, 'r') as def_file:
            model_definition_dict = yaml.safe_load(def_file)
    else:
        model_definition_dict = model_definition

    # merge model definition with defaults
    model_definition = merge_with_defaults(model_definition_dict)

    if HYPEROPT not in model_definition:
        raise ValueError("Hyperopt Section not present in Model Definition")

    hyperopt_config = model_definition["hyperopt"]
    update_hyperopt_params_with_defaults(hyperopt_config)

    # print hyperopt config
    logger.info(pformat(hyperopt_config, indent=4))
    logger.info('\n')

    sampler = hyperopt_config["sampler"]
    executor = hyperopt_config["executor"]
    parameters = hyperopt_config["parameters"]
    split = hyperopt_config["split"]
    output_feature = hyperopt_config["output_feature"]
    metric = hyperopt_config["metric"]
    goal = hyperopt_config["goal"]

    ######################
    # check validity of output_feature / metric/ split combination
    ######################
    if split == TRAINING:
        if not training_set and (
                model_definition['preprocessing']['split_probabilities'][0] <=
                0):
            raise ValueError(
                'The data for the specified split for hyperopt "{}" '
                'was not provided, '
                'or the split amount specified in the preprocessing section '
                'of the model definition is not greater than 0'.format(split))
    elif split == VALIDATION:
        if not validation_set and (
                model_definition['preprocessing']['split_probabilities'][1] <=
                0):
            raise ValueError(
                'The data for the specified split for hyperopt "{}" '
                'was not provided, '
                'or the split amount specified in the preprocessing section '
                'of the model definition is not greater than 0'.format(split))
    elif split == TEST:
        if not test_set and (
                model_definition['preprocessing']['split_probabilities'][2] <=
                0):
            raise ValueError(
                'The data for the specified split for hyperopt "{}" '
                'was not provided, '
                'or the split amount specified in the preprocessing section '
                'of the model definition is not greater than 0'.format(split))
    else:
        raise ValueError('unrecognized hyperopt split "{}". '
                         'Please provide one of: {}'.format(
                             split, {TRAINING, VALIDATION, TEST}))
    if output_feature == COMBINED:
        if metric != LOSS:
            raise ValueError(
                'The only valid metric for "combined" output feature is "loss"'
            )
    else:
        output_feature_names = set(
            of['name'] for of in model_definition['output_features'])
        if output_feature not in output_feature_names:
            raise ValueError('The output feature specified for hyperopt "{}" '
                             'cannot be found in the model definition. '
                             'Available ones are: {} and "combined"'.format(
                                 output_feature, output_feature_names))

        output_feature_type = None
        for of in model_definition['output_features']:
            if of['name'] == output_feature:
                output_feature_type = of[TYPE]
        feature_class = get_from_registry(output_feature_type,
                                          output_type_registry)
        if metric not in feature_class.metric_functions:
            # todo v0.4: allow users to also specify metrics from the overall
            #  and per-class metrics from the training stats and, in general,
            #  postprocessed metrics
            raise ValueError(
                'The specified metric for hyperopt "{}" is not a valid metric '
                'for the specified output feature "{}" of type "{}". '
                'Available metrics are: {}'.format(
                    metric, output_feature, output_feature_type,
                    feature_class.metric_functions.keys()))

    hyperopt_sampler = get_build_hyperopt_sampler(sampler[TYPE])(goal,
                                                                 parameters,
                                                                 **sampler)
    hyperopt_executor = get_build_hyperopt_executor(executor[TYPE])(
        hyperopt_sampler, output_feature, metric, split, **executor)

    hyperopt_results = hyperopt_executor.execute(
        model_definition,
        dataset=dataset,
        training_set=training_set,
        validation_set=validation_set,
        test_set=test_set,
        training_set_metadata=training_set_metadata,
        data_format=data_format,
        experiment_name=experiment_name,
        model_name=model_name,
        # model_load_path=None,
        # model_resume_path=None,
        skip_save_training_description=skip_save_training_description,
        skip_save_training_statistics=skip_save_training_statistics,
        skip_save_model=skip_save_model,
        skip_save_progress=skip_save_progress,
        skip_save_log=skip_save_log,
        skip_save_processed_input=skip_save_processed_input,
        skip_save_unprocessed_output=skip_save_unprocessed_output,
        skip_save_predictions=skip_save_predictions,
        skip_save_eval_stats=skip_save_eval_stats,
        output_directory=output_directory,
        gpus=gpus,
        gpu_memory_limit=gpu_memory_limit,
        allow_parallel_threads=allow_parallel_threads,
        use_horovod=use_horovod,
        random_seed=random_seed,
        debug=debug,
        **kwargs)

    if is_on_master():
        print_hyperopt_results(hyperopt_results)

        if not skip_save_hyperopt_statistics:
            if not os.path.exists(output_directory):
                os.makedirs(output_directory)

            hyperopt_stats = {
                'hyperopt_config': hyperopt_config,
                'hyperopt_results': hyperopt_results
            }

            save_hyperopt_stats(hyperopt_stats, output_directory)
            logger.info('Hyperopt stats saved to: {}'.format(output_directory))

    logger.info('Finished hyperopt')

    return hyperopt_results
Example No. 19
    def evaluate(
            self,
            dataset=None,
            data_format=None,
            batch_size=128,
            skip_save_unprocessed_output=True,
            skip_save_predictions=True,
            skip_save_eval_stats=True,
            collect_predictions=False,
            collect_overall_stats=False,
            output_directory='results',
            return_type=pd.DataFrame,
            debug=False,
            **kwargs
    ):
        self._check_initialization()

        logger.debug('Preprocessing')

        # preprocessing
        dataset, training_set_metadata = preprocess_for_prediction(
            self.model_definition,
            dataset=dataset,
            data_format=data_format,
            training_set_metadata=self.training_set_metadata,
            include_outputs=True,
        )

        logger.debug('Predicting')
        predictor = Predictor(
            batch_size=batch_size, horovod=self._horovod, debug=debug
        )
        stats, predictions = predictor.batch_evaluation(
            self.model,
            dataset,
            collect_predictions=collect_predictions or collect_overall_stats,
        )

        # calculate the overall metrics
        if collect_overall_stats:
            overall_stats = calculate_overall_stats(
                self.model.output_features,
                predictions,
                dataset,
                training_set_metadata
            )
            stats = {
                of_name: {**stats[of_name], **overall_stats[of_name]}
                # account for presence of 'combined' key
                if of_name in overall_stats else {**stats[of_name]}
                for of_name in stats
            }

        if is_on_master():
            # if we are skipping all saving,
            # there is no need to create a directory that will remain empty
            should_create_exp_dir = not (
                    skip_save_unprocessed_output and
                    skip_save_predictions and
                    skip_save_eval_stats
            )
            if should_create_exp_dir:
                os.makedirs(output_directory, exist_ok=True)

        if collect_predictions:
            logger.debug('Postprocessing')
            postproc_predictions = postprocess(
                predictions,
                self.model.output_features,
                self.training_set_metadata,
                output_directory=output_directory,
                skip_save_unprocessed_output=skip_save_unprocessed_output
                                             or not is_on_master(),
            )
        else:
            postproc_predictions = predictions  # = {}

        if is_on_master():
            if postproc_predictions is not None and not skip_save_predictions:
                save_prediction_outputs(postproc_predictions,
                                        output_directory)

            print_evaluation_stats(stats)
            if not skip_save_eval_stats:
                save_evaluation_stats(stats, output_directory)

            if not skip_save_predictions or not skip_save_eval_stats:
                logger.info('Saved to: {0}'.format(output_directory))

        if collect_predictions:
            postproc_predictions = convert_predictions(
                postproc_predictions,
                self.model.output_features,
                self.training_set_metadata,
                return_type=return_type)

        return stats, postproc_predictions, output_directory
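A hedged sketch of calling the `evaluate` method above, assuming `model` is an already-trained model object that exposes it; the dataset path is a placeholder:

# 'test.csv' is a placeholder path; `model` is assumed to be a trained model
# instance exposing the evaluate() method shown above.
stats, predictions, out_dir = model.evaluate(
    dataset='test.csv',
    data_format='csv',
    collect_predictions=True,
    collect_overall_stats=True,
    skip_save_eval_stats=False,
    output_directory='results',
)
print(stats)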
Example No. 20
    def predict(
            self,
            dataset=None,
            data_format=None,
            batch_size=128,
            skip_save_unprocessed_output=True,
            skip_save_predictions=True,
            output_directory='results',
            return_type=pd.DataFrame,
            debug=False,
            **kwargs
    ):
        self._check_initialization()

        logger.debug('Preprocessing')
        # Use [:] on the next line to copy the list rather than assign a
        # reference; otherwise adding output features would also modify the
        # input feature list, which we definitely do not want to do.
        features_to_load = self.model_definition['input_features'][:]

        # preprocessing
        dataset, training_set_metadata = preprocess_for_prediction(
            self.model_definition,
            dataset=dataset,
            data_format=data_format,
            training_set_metadata=self.training_set_metadata,
            include_outputs=False,
        )

        logger.debug('Predicting')
        predictor = Predictor(
            batch_size=batch_size, horovod=self._horovod, debug=debug
        )
        predictions = predictor.batch_predict(
            self.model,
            dataset,
        )

        if is_on_master():
            # if we are skipping all saving,
            # there is no need to create a directory that will remain empty
            should_create_exp_dir = not (
                    skip_save_unprocessed_output and skip_save_predictions
            )
            if should_create_exp_dir:
                os.makedirs(output_directory, exist_ok=True)

        logger.debug('Postprocessing')
        postproc_predictions = convert_predictions(
            postprocess(
                predictions,
                self.model.output_features,
                self.training_set_metadata,
                output_directory=output_directory,
                skip_save_unprocessed_output=skip_save_unprocessed_output
                                             or not is_on_master(),
            ),
            self.model.output_features,
            self.training_set_metadata,
            return_type=return_type
        )

        if is_on_master():
            if not skip_save_predictions:
                save_prediction_outputs(postproc_predictions,
                                        output_directory)

                logger.info('Saved to: {0}'.format(output_directory))

        return postproc_predictions, output_directory
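Similarly, a hedged sketch of calling `predict` on a trained model object; the dataset path is a placeholder, and the default `return_type` yields a pandas DataFrame:

# 'new_data.csv' is a placeholder path; `model` is assumed to be a trained
# model instance exposing the predict() method shown above.
predictions_df, out_dir = model.predict(
    dataset='new_data.csv',
    data_format='csv',
    batch_size=128,
    skip_save_predictions=False,
    output_directory='results',
)
print(predictions_df.head())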
Example No. 21
    def postprocess_predictions(
        self,
        result,
        metadata,
        output_directory,
        skip_save_unprocessed_output=False,
    ):
        postprocessed = {}
        name = self.feature_name

        npy_filename = None
        if is_on_master():
            npy_filename = os.path.join(output_directory, '{}_{}.npy')
        else:
            skip_save_unprocessed_output = True

        if PREDICTIONS in result and len(result[PREDICTIONS]) > 0:
            preds = result[PREDICTIONS]
            lengths = result[LENGTHS]
            if 'idx2str' in metadata:
                postprocessed[PREDICTIONS] = [[
                    metadata['idx2str'][token]
                    if token < len(metadata['idx2str']) else UNKNOWN_SYMBOL
                    for token in [pred[i] for i in range(length)]
                ] for pred, length in [(preds[j], lengths[j])
                                       for j in range(len(preds))]]
            else:
                postprocessed[PREDICTIONS] = preds

            if not skip_save_unprocessed_output:
                np.save(npy_filename.format(name, PREDICTIONS), preds)

            del result[PREDICTIONS]

        if LAST_PREDICTIONS in result and len(result[LAST_PREDICTIONS]) > 0:
            last_preds = result[LAST_PREDICTIONS]
            if 'idx2str' in metadata:
                postprocessed[LAST_PREDICTIONS] = [
                    metadata['idx2str'][last_pred]
                    if last_pred < len(metadata['idx2str']) else UNKNOWN_SYMBOL
                    for last_pred in last_preds
                ]
            else:
                postprocessed[LAST_PREDICTIONS] = last_preds

            if not skip_save_unprocessed_output:
                np.save(npy_filename.format(name, LAST_PREDICTIONS),
                        last_preds)

            del result[LAST_PREDICTIONS]

        if PROBABILITIES in result and len(result[PROBABILITIES]) > 0:
            probs = result[PROBABILITIES].numpy()
            if probs is not None:

                if len(probs) > 0 and isinstance(probs[0], list):
                    prob = []
                    for i in range(len(probs)):
                        # todo: should adapt for the case of beam > 1
                        for j in range(len(probs[i])):
                            probs[i][j] = np.max(probs[i][j])
                        prob.append(np.prod(probs[i]))
                elif isinstance(probs, np.ndarray):
                    if len(probs.shape) == 3:  # prob of each class of each token
                        probs = np.amax(probs, axis=-1)
                    prob = np.prod(probs, axis=-1)

                # commenting probabilities out because usually it is huge:
                # dataset x length x classes
                # todo: add a mechanism for letting the user decide to save it
                # postprocessed[PROBABILITIES] = probs
                postprocessed[PROBABILITY] = prob

                if not skip_save_unprocessed_output:
                    # commenting probabilities out, see comment above
                    # np.save(npy_filename.format(name, PROBABILITIES), probs)
                    np.save(npy_filename.format(name, PROBABILITY), prob)

            del result[PROBABILITIES]

        if LENGTHS in result:
            del result[LENGTHS]

        return postprocessed
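The probability aggregation above (max over the class axis per token, then product over the sequence) in a standalone sketch with a synthetic array:

import numpy as np

# Synthetic probabilities: 2 sequences, 3 tokens, 4 classes each.
probs = np.random.rand(2, 3, 4)
probs = probs / probs.sum(axis=-1, keepdims=True)

token_probs = np.amax(probs, axis=-1)      # best class probability per token
seq_prob = np.prod(token_probs, axis=-1)   # sequence probability as the product
print(seq_prob.shape)                      # (2,)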
Example No. 22
    def evaluate(self,
                 dataset=None,
                 data_format=None,
                 batch_size=128,
                 skip_save_unprocessed_output=True,
                 skip_save_predictions=True,
                 skip_save_eval_stats=True,
                 collect_predictions=False,
                 collect_overall_stats=False,
                 output_directory='results',
                 return_type=pd.DataFrame,
                 debug=False,
                 **kwargs):
        self._check_initialization()

        logger.debug('Preprocessing')
        # Build a new list instead of assigning a reference; otherwise adding
        # output features would also modify the input feature list, which we
        # definitely do not want to do.
        features_to_load = self.model_definition['input_features'] + \
                           self.model_definition['output_features']

        # preprocessing
        # todo refactoring: maybe replace the self.model_definition parameter
        #  here with features_to_load
        dataset, training_set_metadata = preprocess_for_prediction(
            self.model_definition,
            dataset=dataset,
            data_format=data_format,
            training_set_metadata=self.training_set_metadata,
            include_outputs=True,
        )

        logger.debug('Predicting')
        predictor = Predictor(batch_size=batch_size,
                              horovod=self._horovod,
                              debug=debug)
        stats, predictions = predictor.batch_evaluation(
            self.model,
            dataset,
            collect_predictions=collect_predictions or collect_overall_stats,
        )

        # calculate the overall metrics
        if collect_overall_stats:
            overall_stats = calculate_overall_stats(self.model.output_features,
                                                    predictions, dataset,
                                                    training_set_metadata)
            stats = {
                of_name: {
                    **stats[of_name],
                    **overall_stats[of_name]
                }
                # account for presence of 'combined' key
                if of_name in overall_stats else {
                    **stats[of_name]
                }
                for of_name in stats
            }

        if is_on_master():
            # if we are skipping all saving,
            # there is no need to create a directory that will remain empty
            should_create_exp_dir = not (skip_save_unprocessed_output
                                         and skip_save_predictions
                                         and skip_save_eval_stats)
            if should_create_exp_dir:
                os.makedirs(output_directory, exist_ok=True)

        if collect_predictions:
            logger.debug('Postprocessing')
            postproc_predictions = postprocess(
                predictions,
                self.model.output_features,
                self.training_set_metadata,
                output_directory=output_directory,
                skip_save_unprocessed_output=skip_save_unprocessed_output
                or not is_on_master(),
            )
        else:
            postproc_predictions = predictions  # = {}

        if is_on_master():
            if postproc_predictions is not None and not skip_save_predictions:
                save_prediction_outputs(postproc_predictions, output_directory)

            print_evaluation_stats(stats)
            if not skip_save_eval_stats:
                save_evaluation_stats(stats, output_directory)

            if not skip_save_predictions or not skip_save_eval_stats:
                logger.info('Saved to: {0}'.format(output_directory))

        if collect_predictions:
            postproc_predictions = convert_predictions(
                postproc_predictions,
                self.model.output_features,
                self.training_set_metadata,
                return_type=return_type)

        return stats, postproc_predictions, output_directory
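The dictionary merge applied to `stats` and `overall_stats` above, replayed on toy inputs; the 'combined' key only exists in `stats`, which is why the comprehension guards on membership in `overall_stats`:

# Toy values; the structure mirrors the merge in evaluate() above.
stats = {'class': {'loss': 0.4}, 'combined': {'loss': 0.4}}
overall_stats = {'class': {'accuracy': 0.9}}

merged = {
    of_name: {**stats[of_name], **overall_stats[of_name]}
    if of_name in overall_stats else {**stats[of_name]}
    for of_name in stats
}
print(merged)  # {'class': {'loss': 0.4, 'accuracy': 0.9}, 'combined': {'loss': 0.4}}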
Example No. 23
def hyperopt(
    model_definition,
    dataset=None,
    training_set=None,
    validation_set=None,
    test_set=None,
    training_set_metadata=None,
    data_format=None,
    experiment_name="hyperopt",
    model_name="run",
    # model_load_path=None,
    # model_resume_path=None,
    skip_save_training_description=True,
    skip_save_training_statistics=True,
    skip_save_model=False,  # False because we want to use the model with the best validation metric
    skip_save_progress=True,
    skip_save_log=True,
    skip_save_processed_input=True,
    skip_save_unprocessed_output=True,
    skip_save_predictions=True,
    skip_save_eval_stats=True,
    skip_save_hyperopt_statistics=False,
    output_directory="results",
    gpus=None,
    gpu_memory_limit=None,
    allow_parallel_threads=True,
    use_horovod=None,
    random_seed=default_random_seed,
    debug=False,
    **kwargs,
) -> dict:
    """This method performs a hyperparameter optimization.

    :param model_definition:
    :param dataset:
    :param training_set:
    :param validation_set:
    :param test_set:
    :param training_set_metadata:
    :param data_format:
    :param experiment_name:
    :param model_name:
    :param skip_save_training_description:
    :param skip_save_training_statistics:
    :param skip_save_model:
    :param skip_save_progress:
    :param skip_save_log:
    :param skip_save_processed_input:
    :param skip_save_unprocessed_output:
    :param skip_save_predictions:
    :param skip_save_eval_stats:
    :param skip_save_hyperopt_statistics:
    :param output_directory:
    :param gpus:
    :param gpu_memory_limit:
    :param allow_parallel_threads:
    :param use_horovod:
    :param random_seed:
    :param debug:
    :param kwargs:
    :return: (dict) The results of the hyperparameter optimization
    """
    # todo refactoring: complete docstrings
    # check if model definition is a path or a dict
    if isinstance(model_definition, str):  # assume path
        with open(model_definition, 'r') as def_file:
            model_definition_dict = yaml.safe_load(def_file)
    else:
        model_definition_dict = model_definition

    # merge model definition with defaults
    model_definition = merge_with_defaults(model_definition_dict)

    if HYPEROPT not in model_definition:
        raise ValueError("Hyperopt Section not present in Model Definition")

    hyperopt_config = model_definition["hyperopt"]
    update_hyperopt_params_with_defaults(hyperopt_config)

    # print hyperopt config
    logger.info(pformat(hyperopt_config, indent=4))
    logger.info('\n')

    sampler = hyperopt_config["sampler"]
    executor = hyperopt_config["executor"]
    parameters = hyperopt_config["parameters"]
    split = hyperopt_config["split"]
    output_feature = hyperopt_config["output_feature"]
    metric = hyperopt_config["metric"]
    goal = hyperopt_config["goal"]

    ######################
    # check validity of output_feature / metric/ split combination
    ######################
    if split == TRAINING:
        if not training_set and (
                model_definition['preprocessing']['split_probabilities'][0] <=
                0):
            raise ValueError(
                'The data for the specified split for hyperopt "{}" '
                'was not provided, '
                'or the split amount specified in the preprocessing section '
                'of the model definition is not greater than 0'.format(split))
    elif split == VALIDATION:
        if not validation_set and (
                model_definition['preprocessing']['split_probabilities'][1] <=
                0):
            raise ValueError(
                'The data for the specified split for hyperopt "{}" '
                'was not provided, '
                'or the split amount specified in the preprocessing section '
                'of the model definition is not greater than 0'.format(split))
    elif split == TEST:
        if not test_set and (
                model_definition['preprocessing']['split_probabilities'][2] <=
                0):
            raise ValueError(
                'The data for the specified split for hyperopt "{}" '
                'was not provided, '
                'or the split amount specified in the preprocessing section '
                'of the model definition is not greater than 0'.format(split))
    else:
        raise ValueError('unrecognized hyperopt split "{}". '
                         'Please provide one of: {}'.format(
                             split, {TRAINING, VALIDATION, TEST}))
    if output_feature == COMBINED:
        if metric != LOSS:
            raise ValueError(
                'The only valid metric for "combined" output feature is "loss"'
            )
    else:
        output_feature_names = set(
            of['name'] for of in model_definition['output_features'])
        if output_feature not in output_feature_names:
            raise ValueError('The output feature specified for hyperopt "{}" '
                             'cannot be found in the model definition. '
                             'Available ones are: {} and "combined"'.format(
                                 output_feature, output_feature_names))

        output_feature_type = None
        for of in model_definition['output_features']:
            if of['name'] == output_feature:
                output_feature_type = of[TYPE]
        feature_class = get_from_registry(output_feature_type,
                                          output_type_registry)
        if metric not in feature_class.metric_functions:
            # todo v0.4: allow users to also specify metrics from the overall
            #  and per-class metrics from the training stats and, in general,
            #  postprocessed metrics
            raise ValueError(
                'The specified metric for hyperopt "{}" is not a valid metric '
                'for the specified output feature "{}" of type "{}". '
                'Available metrics are: {}'.format(
                    metric, output_feature, output_feature_type,
                    feature_class.metric_functions.keys()))

    hyperopt_sampler = get_build_hyperopt_sampler(sampler[TYPE])(goal,
                                                                 parameters,
                                                                 **sampler)
    hyperopt_executor = get_build_hyperopt_executor(executor[TYPE])(
        hyperopt_sampler, output_feature, metric, split, **executor)

    hyperopt_results = hyperopt_executor.execute(
        model_definition,
        dataset=dataset,
        training_set=training_set,
        validation_set=validation_set,
        test_set=test_set,
        training_set_metadata=training_set_metadata,
        data_format=data_format,
        experiment_name=experiment_name,
        model_name=model_name,
        # model_load_path=None,
        # model_resume_path=None,
        skip_save_training_description=skip_save_training_description,
        skip_save_training_statistics=skip_save_training_statistics,
        skip_save_model=skip_save_model,
        skip_save_progress=skip_save_progress,
        skip_save_log=skip_save_log,
        skip_save_processed_input=skip_save_processed_input,
        skip_save_unprocessed_output=skip_save_unprocessed_output,
        skip_save_predictions=skip_save_predictions,
        skip_save_eval_stats=skip_save_eval_stats,
        output_directory=output_directory,
        gpus=gpus,
        gpu_memory_limit=gpu_memory_limit,
        allow_parallel_threads=allow_parallel_threads,
        use_horovod=use_horovod,
        random_seed=random_seed,
        debug=debug,
        **kwargs)

    if is_on_master():
        print_hyperopt_results(hyperopt_results)

        if not skip_save_hyperopt_statistics:
            if not os.path.exists(output_directory):
                os.makedirs(output_directory)

            hyperopt_stats = {
                'hyperopt_config': hyperopt_config,
                'hyperopt_results': hyperopt_results
            }

            save_hyperopt_stats(hyperopt_stats, output_directory)
            logger.info('Hyperopt stats saved to: {}'.format(output_directory))

    logger.info('Finished hyperopt')

    return hyperopt_results
Example No. 24
def cli(sys_argv):
    parser = argparse.ArgumentParser(
        description='This script loads a pretrained model '
        'and uses it to predict',
        prog='ludwig predict',
        usage='%(prog)s [options]')

    # ---------------
    # Data parameters
    # ---------------
    parser.add_argument('--dataset',
                        help='input data file path',
                        required=True)
    parser.add_argument('--data_format',
                        help='format of the input data',
                        default='auto',
                        choices=['auto', 'csv', 'hdf5'])

    # ----------------
    # Model parameters
    # ----------------
    parser.add_argument('-m',
                        '--model_path',
                        help='model to load',
                        required=True)

    # -------------------------
    # Output results parameters
    # -------------------------
    parser.add_argument('-od',
                        '--output_directory',
                        type=str,
                        default='results',
                        help='directory that contains the results')
    parser.add_argument('-ssuo',
                        '--skip_save_unprocessed_output',
                        help='skips saving intermediate NPY output files',
                        action='store_true',
                        default=False)
    parser.add_argument('-sstp',
                        '--skip_save_predictions',
                        help='skips saving predictions CSV files',
                        action='store_true',
                        default=False)

    # ------------------
    # Generic parameters
    # ------------------
    parser.add_argument('-bs',
                        '--batch_size',
                        type=int,
                        default=128,
                        help='size of batches')

    # ------------------
    # Runtime parameters
    # ------------------
    parser.add_argument('-g',
                        '--gpus',
                        type=int,
                        default=0,
                        help='list of GPUs to use')
    parser.add_argument('-gml',
                        '--gpu_memory_limit',
                        type=int,
                        default=None,
                        help='maximum memory in MB to allocate per GPU device')
    parser.add_argument(
        '-dpt',
        '--disable_parallel_threads',
        action='store_false',
        dest='allow_parallel_threads',
        help='disable TensorFlow from using multithreading for reproducibility'
    )
    parser.add_argument('-uh',
                        '--use_horovod',
                        action='store_true',
                        default=None,
                        help='uses horovod for distributed training')
    parser.add_argument('-dbg',
                        '--debug',
                        action='store_true',
                        default=False,
                        help='enables debugging mode')
    parser.add_argument(
        '-l',
        '--logging_level',
        default='info',
        help='the level of logging to use',
        choices=['critical', 'error', 'warning', 'info', 'debug', 'notset'])

    args = parser.parse_args(sys_argv)

    args.logging_level = logging_level_registry[args.logging_level]
    logging.getLogger('ludwig').setLevel(args.logging_level)
    global logger
    logger = logging.getLogger('ludwig.predict')

    set_on_master(args.use_horovod)

    if is_on_master():
        print_ludwig('Predict', LUDWIG_VERSION)
        logger.info('Dataset path: {}'.format(args.dataset))
        logger.info('Model path: {}'.format(args.model_path))
        logger.info('')

    predict_cli(**vars(args))
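And a possible programmatic invocation of this predict entry point; both paths are placeholders:

# Equivalent to running `ludwig predict ...` on the command line;
# 'data.csv' and the model path are placeholders.
cli([
    '--dataset', 'data.csv',
    '--model_path', 'results/experiment_run/model',
    '--output_directory', 'results',
    '--batch_size', '128',
])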