Example #1
def kfold_cross_validate(num_folds,
                         model_definition=None,
                         model_definition_file=None,
                         data_csv=None,
                         output_directory='results',
                         random_seed=default_random_seed,
                         **kwargs):
    # check for num_folds
    if num_folds is None:
        raise ValueError('num_folds parameter must be specified')

    # check for model_definition and model_definition_file
    if model_definition is None and model_definition_file is None:
        raise ValueError(
            'Either model_definition or model_definition_file has to be '
            'not None to initialize a LudwigModel')
    if model_definition is not None and model_definition_file is not None:
        raise ValueError('Only one of model_definition and '
                         'model_definition_file can be provided')

    logger.info('starting {:d}-fold cross validation'.format(num_folds))

    # extract out model definition for use
    if model_definition_file is not None:
        with open(model_definition_file, 'r') as def_file:
            model_definition = \
                merge_with_defaults(yaml.safe_load(def_file))

    # create output_directory if not available
    if not os.path.isdir(output_directory):
        os.mkdir(output_directory)

    # read in data to split for the folds
    data_df = pd.read_csv(data_csv)

    # place each fold in a separate directory
    data_dir = os.path.dirname(data_csv)

    kfold_cv_stats = {}
    kfold_split_indices = {}

    for train_indices, test_indices, fold_num in \
            generate_kfold_splits(data_df, num_folds, random_seed):
        with tempfile.TemporaryDirectory(dir=data_dir) as temp_dir_name:
            curr_train_df = data_df.iloc[train_indices]
            curr_test_df = data_df.iloc[test_indices]

            kfold_split_indices['fold_' + str(fold_num)] = {
                'training_indices': train_indices,
                'test_indices': test_indices
            }

            # train and validate model on this fold
            logger.info("training on fold {:d}".format(fold_num))
            (
                _,  # model
                preprocessed_data,  # preprocessed_data
                experiment_dir_name,  # experiment_dir_name
                train_stats,
                model_definition,
                test_results) = experiment(model_definition,
                                           data_train_df=curr_train_df,
                                           data_test_df=curr_test_df,
                                           experiment_name='cross_validation',
                                           model_name='fold_' + str(fold_num),
                                           output_directory=os.path.join(
                                               temp_dir_name, 'results'))

            # todo: this works for obtaining the postprocessed predictions
            #  and replacing the raw ones, but some refactoring is needed
            #  to avoid having to do it
            postprocessed_output = postprocess(
                test_results,
                model_definition['output_features'],
                metadata=preprocessed_data[3],
                experiment_dir_name=experiment_dir_name,
                skip_save_unprocessed_output=True)
            # todo: if we want to save the CSV of predictions, uncomment this block
            # if is_on_master():
            #     print_test_results(test_results)
            #     if not skip_save_test_predictions:
            #         save_prediction_outputs(
            #             postprocessed_output,
            #             experiment_dir_name
            #         )
            #     if not skip_save_test_statistics:
            #         save_test_statistics(test_results, experiment_dir_name)

            # augment the training statistics with scoring metric from
            # the hold out fold
            train_stats['fold_test_results'] = test_results

            # collect training statistics for this fold
            kfold_cv_stats['fold_' + str(fold_num)] = train_stats

    # consolidate raw fold metrics across all folds
    raw_kfold_stats = {}
    for fold_name in kfold_cv_stats:
        curr_fold_test_results = kfold_cv_stats[fold_name]['fold_test_results']
        for of_name in curr_fold_test_results:
            if of_name not in raw_kfold_stats:
                raw_kfold_stats[of_name] = {}
            fold_test_results_of = curr_fold_test_results[of_name]

            for metric in fold_test_results_of:
                if metric not in {
                        'predictions', 'probabilities', 'confusion_matrix',
                        'overall_stats', 'per_class_stats', 'roc_curve',
                        'precision_recall_curve'
                }:
                    if metric not in raw_kfold_stats[of_name]:
                        raw_kfold_stats[of_name][metric] = []
                    raw_kfold_stats[of_name][metric].append(
                        fold_test_results_of[metric])

    # calculate overall kfold statistics
    overall_kfold_stats = {}
    for of_name in raw_kfold_stats:
        overall_kfold_stats[of_name] = {}
        for metric in raw_kfold_stats[of_name]:
            mean = np.mean(raw_kfold_stats[of_name][metric])
            std = np.std(raw_kfold_stats[of_name][metric])
            overall_kfold_stats[of_name][metric + '_mean'] = mean
            overall_kfold_stats[of_name][metric + '_std'] = std

    kfold_cv_stats['overall'] = overall_kfold_stats

    logger.info('completed {:d}-fold cross validation'.format(num_folds))

    return kfold_cv_stats, kfold_split_indices
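
A minimal usage sketch of the variant above, for context. The import path and
file names here are illustrative assumptions, not part of the example; only one
of model_definition and model_definition_file would be passed.

# Hypothetical usage of the variant above; the import path and file names
# are assumptions made for illustration.
from ludwig.experiment import kfold_cross_validate  # assumed module path

cv_stats, split_indices = kfold_cross_validate(
    num_folds=5,
    model_definition_file='model_definition.yaml',  # or model_definition={...}
    data_csv='dataset.csv',
    output_directory='results')

# 'overall' holds the mean and std of each metric across the five folds
print(cv_stats['overall'])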
Example #2
def kfold_cross_validate(
        num_folds,
        model_definition,
        data_csv=None,
        skip_save_training_description=False,
        skip_save_training_statistics=False,
        skip_save_model=False,
        skip_save_progress=False,
        skip_save_log=False,
        skip_save_processed_input=False,
        skip_save_predictions=False,
        skip_save_eval_stats=False,
        skip_collect_predictions=False,
        skip_collect_overall_stats=False,
        output_directory='results',
        random_seed=default_random_seed,
        gpus=None,
        gpu_memory_limit=None,
        allow_parallel_threads=True,
        use_horovod=None,
        logging_level=logging.INFO,
        debug=False,
        **kwargs
):
    """Performs k-fold cross validation and returns result data structures.

    # Inputs

    :param num_folds: (int) number of folds to create for the cross-validation
    :param model_definition: (dict) a dictionary containing
           information needed to build a model. Refer to the
           [User Guide](http://ludwig.ai/user_guide/#model-definition)
           for details.
    :param data_csv: (string, default: None) path to the CSV file containing
           the dataset to split into folds.
    :param output_directory: (string, default: 'results')
    :param random_seed: (int) Random seed used for the k-fold splits.

    # Return

    :return: (tuple(dict, dict)) a tuple of two dictionaries:
             `kfold_cv_stats` contains the metrics from the cross-validation
             run and `kfold_split_indices` contains the indices used to split
             the training data into the training fold and test fold.
    """
    set_on_master(use_horovod)

    # check for num_folds
    if num_folds is None:
        raise ValueError(
            'num_folds parameter must be specified'
        )

    logger.info('starting {:d}-fold cross validation'.format(num_folds))

    # create output_directory if not available
    if not os.path.isdir(output_directory):
        os.mkdir(output_directory)

    # read in data to split for the folds
    data_df = pd.read_csv(data_csv)

    # place each fold in a separate directory
    data_dir = os.path.dirname(data_csv)

    kfold_cv_stats = {}
    kfold_split_indices = {}

    for train_indices, test_indices, fold_num in \
            generate_kfold_splits(data_df, num_folds, random_seed):
        with tempfile.TemporaryDirectory(dir=data_dir) as temp_dir_name:
            curr_train_df = data_df.iloc[train_indices]
            curr_test_df = data_df.iloc[test_indices]

            kfold_split_indices['fold_' + str(fold_num)] = {
                'training_indices': train_indices,
                'test_indices': test_indices
            }

            # train and validate model on this fold
            logger.info("training on fold {:d}".format(fold_num))

            model = LudwigModel(
                model_definition=model_definition,
                logging_level=logging_level,
                use_horovod=use_horovod,
                gpus=gpus,
                gpu_memory_limit=gpu_memory_limit,
                allow_parallel_threads=allow_parallel_threads,
            )
            (
                test_results,
                train_stats,
                preprocessed_data,
                output_directory
            ) = model.experiment(
                training_set=curr_train_df,
                test_set=curr_test_df,
                experiment_name='cross_validation',
                model_name='fold_' + str(fold_num),
                skip_save_training_description=skip_save_training_description,
                skip_save_training_statistics=skip_save_training_statistics,
                skip_save_model=skip_save_model,
                skip_save_progress=skip_save_progress,
                skip_save_log=skip_save_log,
                skip_save_processed_input=skip_save_processed_input,
                skip_save_predictions=skip_save_predictions,
                skip_save_eval_stats=skip_save_eval_stats,
                skip_collect_predictions=skip_collect_predictions,
                skip_collect_overall_stats=skip_collect_overall_stats,
                output_directory=os.path.join(temp_dir_name, 'results'),
                random_seed=random_seed,
                debug=debug,
            )

            # augment the training statistics with scoring metric from
            # the hold out fold
            train_stats['fold_test_results'] = test_results

            # collect training statistics for this fold
            kfold_cv_stats['fold_' + str(fold_num)] = train_stats

    # consolidate raw fold metrics across all folds
    raw_kfold_stats = {}
    for fold_name in kfold_cv_stats:
        curr_fold_test_results = kfold_cv_stats[fold_name]['fold_test_results']
        for of_name in curr_fold_test_results:
            if of_name not in raw_kfold_stats:
                raw_kfold_stats[of_name] = {}
            fold_test_results_of = curr_fold_test_results[of_name]

            for metric in fold_test_results_of:
                if metric not in {
                    'predictions',
                    'probabilities',
                    'confusion_matrix',
                    'overall_stats',
                    'per_class_stats',
                    'roc_curve',
                    'precision_recall_curve'
                }:
                    if metric not in raw_kfold_stats[of_name]:
                        raw_kfold_stats[of_name][metric] = []
                    raw_kfold_stats[of_name][metric].append(
                        fold_test_results_of[metric]
                    )

    # calculate overall kfold statistics
    overall_kfold_stats = {}
    for of_name in raw_kfold_stats:
        overall_kfold_stats[of_name] = {}
        for metric in raw_kfold_stats[of_name]:
            mean = np.mean(raw_kfold_stats[of_name][metric])
            std = np.std(raw_kfold_stats[of_name][metric])
            overall_kfold_stats[of_name][metric + '_mean'] = mean
            overall_kfold_stats[of_name][metric + '_std'] = std

    kfold_cv_stats['overall'] = overall_kfold_stats

    logger.info('completed {:d}-fold cross validation'.format(num_folds))

    return kfold_cv_stats, kfold_split_indices
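
Every variant above iterates over generate_kfold_splits, which is not shown in
these examples. A minimal sketch of how such a generator could be implemented
with scikit-learn's KFold follows; the 1-based fold numbering and shuffled
splits are assumptions, not the actual implementation.

# Sketch of a generate_kfold_splits-style helper (assumed implementation).
from sklearn.model_selection import KFold


def generate_kfold_splits(data_df, num_folds, random_seed):
    kfold = KFold(n_splits=num_folds, shuffle=True, random_state=random_seed)
    # yield (train_indices, test_indices, fold_num) with 1-based fold numbers
    for fold_num, (train_indices, test_indices) in enumerate(
            kfold.split(data_df), start=1):
        yield train_indices, test_indices, fold_num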
Example #3
def kfold_cross_validate(num_folds,
                         model_definition=None,
                         model_definition_file=None,
                         data_csv=None,
                         output_directory='results',
                         random_seed=default_random_seed,
                         **kwargs):
    # check for num_folds
    if num_folds is None:
        raise ValueError('num_folds parameter must be specified')

    # check for model_definition and model_definition_file
    if model_definition is None and model_definition_file is None:
        raise ValueError(
            'Either model_definition or model_definition_file has to be '
            'not None to initialize a LudwigModel')
    if model_definition is not None and model_definition_file is not None:
        raise ValueError('Only one of model_definition and '
                         'model_definition_file can be provided')

    logger.info('starting {:d}-fold cross validation'.format(num_folds))

    # extract out model definition for use
    if model_definition_file is not None:
        with open(model_definition_file, 'r') as def_file:
            model_definition = \
                merge_with_defaults(yaml.safe_load(def_file))

    # create output_directory if not available
    if not os.path.isdir(output_directory):
        os.mkdir(output_directory)

    # read in data to split for the folds
    data_df = pd.read_csv(data_csv)

    # place each fold in a separate directory
    data_dir = os.path.dirname(data_csv)

    kfold_cv_stats = {}
    kfold_split_indices = {}

    for train_indices, test_indices, fold_num in \
            generate_kfold_splits(data_df, num_folds, random_seed):
        with tempfile.TemporaryDirectory(dir=data_dir) as temp_dir_name:
            curr_train_df = data_df.iloc[train_indices]
            curr_test_df = data_df.iloc[test_indices]

            kfold_split_indices['fold_' + str(fold_num)] = {
                'training_indices': train_indices,
                'test_indices': test_indices
            }

            # train and validate model on this fold
            logger.info("training on fold {:d}".format(fold_num))
            (
                _,  # model
                _,  # preprocessed_data
                _,  # experiment_dir_name
                train_stats,
                model_definition,
                test_results) = experiment(model_definition,
                                           data_train_df=curr_train_df,
                                           data_test_df=curr_test_df,
                                           experiment_name='cross_validation',
                                           model_name='fold_' + str(fold_num),
                                           output_directory=os.path.join(
                                               temp_dir_name, 'results'))

            # augment the training statistics with scoring metric from
            # the hold out fold
            train_stats['fold_metric'] = {}
            for metric_category in test_results:
                train_stats['fold_metric'][metric_category] = {}
                for metric in test_results[metric_category]:
                    train_stats['fold_metric'][metric_category][metric] = \
                        test_results[metric_category][metric]

            # collect training statistics for this fold
            kfold_cv_stats['fold_' + str(fold_num)] = train_stats

    # consolidate raw fold metrics across all folds
    raw_kfold_stats = {}
    for fold_name in kfold_cv_stats:
        for category in kfold_cv_stats[fold_name]['fold_metric']:
            if category not in raw_kfold_stats:
                raw_kfold_stats[category] = {}
            category_stats = \
                kfold_cv_stats[fold_name]['fold_metric'][category]
            for metric in category_stats:
                if metric not in {
                        'predictions', 'probabilities', 'confusion_matrix',
                        'overall_stats', 'per_class_stats', 'roc_curve',
                        'precision_recall_curve'
                }:
                    if metric not in raw_kfold_stats[category]:
                        raw_kfold_stats[category][metric] = []
                    raw_kfold_stats[category][metric] \
                        .append(category_stats[metric])

    # calculate overall kfold statistics
    overall_kfold_stats = {}
    for category in raw_kfold_stats:
        overall_kfold_stats[category] = {}
        for metric in raw_kfold_stats[category]:
            mean = np.mean(raw_kfold_stats[category][metric])
            std = np.std(raw_kfold_stats[category][metric])
            overall_kfold_stats[category][metric + '_mean'] = mean
            overall_kfold_stats[category][metric + '_std'] = std

    kfold_cv_stats['overall'] = overall_kfold_stats

    logger.info('completed {:d}-fold cross validation'.format(num_folds))

    return kfold_cv_stats, kfold_split_indices
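
Because kfold_split_indices records the row indices used in each fold, a fold
can be reconstructed after the run. A short sketch, assuming the variant above
is importable; the import path and file names are illustrative.

# Reconstructing the first fold from the returned split indices.
# The import path and file names are illustrative assumptions.
import pandas as pd
from ludwig.experiment import kfold_cross_validate  # assumed module path

cv_stats, kfold_split_indices = kfold_cross_validate(
    num_folds=5,
    model_definition_file='model_definition.yaml',
    data_csv='dataset.csv')

data_df = pd.read_csv('dataset.csv')
fold_1 = kfold_split_indices['fold_1']
train_fold_df = data_df.iloc[fold_1['training_indices']]
test_fold_df = data_df.iloc[fold_1['test_indices']]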
Example #4
def kfold_cross_validate(k_fold,
                         model_definition=None,
                         model_definition_file=None,
                         data_csv=None,
                         output_directory='results',
                         random_seed=default_random_seed,
                         skip_save_k_fold_split_indices=False,
                         **kwargs):
    """Performs k-fold cross validation.

    # Inputs
    :param k_fold: (int) number of folds to create for the cross-validation
    :param model_definition: (dict, default: None) a dictionary containing
           information needed to build a model. Refer to the
           [User Guide](http://ludwig.ai/user_guide/#model-definition)
           for details.
    :param model_definition_file: (string, optional, default: `None`) path to
           a YAML file containing the model definition. If available it will be
           used instead of the model_definition dict.
    :param data_csv: (string, default: None) path to the CSV file containing
           the dataset to split into folds.
    :param output_directory: (string, default: 'results') directory where the
           k-fold training statistics and split indices JSON files are saved.
    :param random_seed: (int) Random seed used for the k-fold splits.
    :param skip_save_k_fold_split_indices: (boolean, default: False) Disables
            saving k-fold split indices

    :return: None
    """

    # check for model_definition and model_definition_file
    if model_definition is None and model_definition_file is None:
        raise ValueError(
            'Either model_definition or model_definition_file has to be '
            'not None to initialize a LudwigModel')
    if model_definition is not None and model_definition_file is not None:
        raise ValueError('Only one of model_definition and '
                         'model_definition_file can be provided')

    # check for k_fold
    if k_fold is None:
        raise ValueError('k_fold parameter must be specified')

    logger.info('starting {:d}-fold cross validation'.format(k_fold))

    # create output_directory if not available
    if not os.path.isdir(output_directory):
        os.mkdir(output_directory)

    # read in data to split for the folds
    data_df = pd.read_csv(data_csv)

    # place each fold in a separate directory
    data_dir = os.path.dirname(data_csv)
    kfold_training_stats = {}
    kfold_split_indices = {}
    for train_indices, test_indices, fold_num in \
            generate_kfold_splits(data_df, k_fold, random_seed):
        with tempfile.TemporaryDirectory(dir=data_dir) as temp_dir_name:
            curr_train_df = data_df.iloc[train_indices]
            curr_test_df = data_df.iloc[test_indices]

            if not skip_save_k_fold_split_indices:
                kfold_split_indices['fold_' + str(fold_num)] = {
                    'training_indices': train_indices,
                    'test_indices': test_indices
                }

            # train and validate model on this fold
            if model_definition_file is not None:
                with open(model_definition_file, 'r') as def_file:
                    model_definition = \
                        merge_with_defaults(yaml.safe_load(def_file))
            logger.info("training on fold {:d}".format(fold_num))
            (model, preprocessed_data, _, train_stats,
             model_definition) = full_train(model_definition,
                                            data_train_df=curr_train_df,
                                            data_test_df=curr_test_df,
                                            experiment_name='cross_validation',
                                            model_name='fold_' + str(fold_num),
                                            output_directory=os.path.join(
                                                temp_dir_name, 'results'))

            # score on hold out fold
            eval_batch_size = model_definition['training']['eval_batch_size']
            batch_size = model_definition['training']['batch_size']
            preds = model.predict(
                preprocessed_data[2],
                eval_batch_size if eval_batch_size != 0 else batch_size)

            # augment the training statistics with scoring metric from
            # the hold out fold
            train_stats['fold_metric'] = {}
            for metric_category in preds:
                train_stats['fold_metric'][metric_category] = {}
                for metric in preds[metric_category]:
                    train_stats['fold_metric'][metric_category][metric] = \
                        preds[metric_category][metric]

            # collect training statistics for this fold
            kfold_training_stats['fold_' + str(fold_num)] = train_stats

    # consolidate raw fold metrics across all folds
    raw_kfold_stats = {}
    for fold_name in kfold_training_stats:
        for category in kfold_training_stats[fold_name]['fold_metric']:
            if category not in raw_kfold_stats:
                raw_kfold_stats[category] = {}
            category_stats = \
                kfold_training_stats[fold_name]['fold_metric'][category]
            for metric in category_stats:
                if metric not in {'predictions', 'probabilities'}:
                    if metric not in raw_kfold_stats[category]:
                        raw_kfold_stats[category][metric] = []
                    raw_kfold_stats[category][metric] \
                        .append(category_stats[metric])

    # calculate overall kfold statistics
    overall_kfold_stats = {}
    for category in raw_kfold_stats:
        overall_kfold_stats[category] = {}
        for metric in raw_kfold_stats[category]:
            mean = np.mean(raw_kfold_stats[category][metric])
            std = np.std(raw_kfold_stats[category][metric])
            overall_kfold_stats[category][metric + '_mean'] = mean
            overall_kfold_stats[category][metric + '_std'] = std

    kfold_training_stats['overall'] = overall_kfold_stats

    # save k-fold cv statistics
    save_json(os.path.join(output_directory, 'kfold_training_statistics.json'),
              kfold_training_stats)

    # save k-fold split indices
    if not skip_save_k_fold_split_indices:
        save_json(os.path.join(output_directory, 'kfold_split_indices.json'),
                  kfold_split_indices)

    logger.info('completed {:d}-fold cross validation'.format(k_fold))
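
Unlike the other variants, this version returns None and persists its results
with save_json instead. A short sketch of reading those artifacts back, with
paths following the save_json calls above and assuming plain JSON output.

# Loading the saved k-fold artifacts; 'results' matches the default
# output_directory used above.
import json
import os

output_directory = 'results'

with open(os.path.join(output_directory,
                       'kfold_training_statistics.json')) as stats_file:
    kfold_training_stats = json.load(stats_file)

with open(os.path.join(output_directory,
                       'kfold_split_indices.json')) as indices_file:
    kfold_split_indices = json.load(indices_file)

# per-output-feature mean/std of each scoring metric across folds
print(kfold_training_stats['overall'])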