Example #1
    def _get_all_general_section_names(self):

        reporter = Reporter()
        default_general_sections_value = DEFAULTS.get('general_sections', '')
        default_special_sections_value = DEFAULTS.get('special_sections', '')
        default_custom_sections_value = DEFAULTS.get('custom_sections', '')

        # if we are told to use subgroups then just make up a dummy subgroup
        # value so that the subgroup-based sections will be included in the
        # section list. This value is not actually used in the configuration file.
        subgroups_value = ['GROUP'] if self.use_subgroups else DEFAULTS.get('subgroups', '')
        return reporter.determine_chosen_sections(default_general_sections_value,
                                                  default_special_sections_value,
                                                  default_custom_sections_value,
                                                  subgroups_value,
                                                  context=self.context)
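
For reference, determine_chosen_sections() can also be called directly on a Reporter instance, as the test examples further down this page do; the argument values here are purely illustrative:

reporter = Reporter()
chosen = reporter.determine_chosen_sections(['all'], [], [], ['prompt'])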
Example #2
def test_run_experiment_lr_with_notebook_rerun():
    # basic experiment with LinearRegression model and notebook;
    # the notebook is re-run after the experiment, once `RSM_REPORT_DIR`
    # is deleted, to ensure that the `.environ.json` file can still be located

    source = 'lr-with-notebook-rerun'
    experiment_id = 'lr'
    config_file = join(rsmtool_test_dir, 'data', 'experiments', source,
                       '{}.json'.format(experiment_id))
    do_run_experiment(source, experiment_id, config_file)

    report_ipynb = join('test_outputs', source, 'report',
                        '{}_report.ipynb'.format(experiment_id))
    report_html = join('test_outputs', source, 'report',
                       '{}_report.html'.format(experiment_id))

    del os.environ['RSM_REPORT_DIR']

    Reporter.convert_ipynb_to_html(report_ipynb, report_html)
    check_report(report_html)
Example #3
def test_run_experiment_lr_with_notebook_rerun():
    # basic experiment with LinearRegression model and notebook;
    # the notebook is re-run after the experiment, once `RSM_REPORT_DIR`
    # is deleted, to ensure that the `.environ.json` file can still be located

    source = 'lr-with-notebook-rerun'
    experiment_id = 'lr'
    config_file = join(rsmtool_test_dir,
                       'data',
                       'experiments',
                       source,
                       '{}.json'.format(experiment_id))
    do_run_experiment(source, experiment_id, config_file)

    report_ipynb = join('test_outputs', source, 'report', '{}_report.ipynb'.format(experiment_id))
    report_html = join('test_outputs', source, 'report', '{}_report.html'.format(experiment_id))

    del os.environ['RSM_REPORT_DIR']

    Reporter.convert_ipynb_to_html(report_ipynb, report_html)
    check_report(report_html)
Example #4
def test_run_experiment_lr_with_notebook_rerun_fail():
    # basic experiment with LinearRegression model and notebook;
    # the notebook is re-run after the experiment, once `RSM_REPORT_DIR`
    # and `.environ.json` are deleted, so the notebook execution will fail

    source = 'lr-with-notebook-rerun-fail'
    experiment_id = 'lr'
    config_file = join(rsmtool_test_dir, 'data', 'experiments', source,
                       '{}.json'.format(experiment_id))
    do_run_experiment(source, experiment_id, config_file)

    report_env = join('test_outputs', source, 'report', '.environ.json')
    report_ipynb = join('test_outputs', source, 'report',
                        '{}_report.ipynb'.format(experiment_id))
    report_html = join('test_outputs', source, 'report',
                       '{}_report.html'.format(experiment_id))

    del os.environ['RSM_REPORT_DIR']
    os.remove(report_env)

    Reporter.convert_ipynb_to_html(report_ipynb, report_html)
Example #5
def test_run_experiment_lr_with_notebook_rerun_fail():
    # basic experiment with LinearRegression model and notebook;
    # the notebook is re-run after the experiment, once `RSM_REPORT_DIR`
    # and `.environ.json` are deleted, so the notebook execution will fail

    source = 'lr-with-notebook-rerun-fail'
    experiment_id = 'lr'
    config_file = join(rsmtool_test_dir,
                       'data',
                       'experiments',
                       source,
                       '{}.json'.format(experiment_id))
    do_run_experiment(source, experiment_id, config_file)

    report_env = join('test_outputs', source, 'report', '.environ.json')
    report_ipynb = join('test_outputs', source, 'report', '{}_report.ipynb'.format(experiment_id))
    report_html = join('test_outputs', source, 'report', '{}_report.html'.format(experiment_id))

    del os.environ['RSM_REPORT_DIR']
    os.remove(report_env)

    Reporter.convert_ipynb_to_html(report_ipynb, report_html)
Example #6
def run_comparison(config_file_or_obj, output_dir):
    """
    Run an ``rsmcompare`` experiment using the given configuration
    file and generate the report in the given directory.

    Parameters
    ----------
    config_file_or_obj : str or Configuration
        Path to the experiment configuration file.
        Users can also pass a `Configuration` object that is in memory.
    output_dir : str
        Path to the experiment output directory.

    Raises
    ------
    ValueError
        If any of the required fields are missing or ill-specified.
    """

    logger = logging.getLogger(__name__)

    # Allow users to pass a Configuration object to the
    # `config_file_or_obj` argument, rather than reading from file
    if not isinstance(config_file_or_obj, Configuration):

        # Instantiate configuration parser object
        parser = ConfigurationParser.get_configparser(config_file_or_obj)
        configuration = parser.read_normalize_validate_and_process_config(
            config_file_or_obj, context='rsmcompare')

        # get the directory where the configuration file lives
        configpath = dirname(config_file_or_obj)

    else:
        configuration = config_file_or_obj
        if configuration.filepath is not None:
            configpath = dirname(configuration.filepath)
        else:
            configpath = os.getcwd()

    logger.info('Saving configuration file.')
    configuration.save(output_dir)

    # get the information about the "old" experiment
    experiment_id_old = configuration['experiment_id_old']
    experiment_dir_old = DataReader.locate_files(
        configuration['experiment_dir_old'], configpath)
    if not experiment_dir_old:
        raise FileNotFoundError("The directory {} "
                                "does not exist.".format(
                                    configuration['experiment_dir_old']))
    else:
        csvdir_old = normpath(join(experiment_dir_old, 'output'))
        figdir_old = normpath(join(experiment_dir_old, 'figure'))
        if not exists(csvdir_old) or not exists(figdir_old):
            raise FileNotFoundError("The directory {} does not contain "
                                    "the output of an rsmtool "
                                    "experiment.".format(experiment_dir_old))

    check_experiment_id(experiment_dir_old, experiment_id_old)

    # get the information about the "new" experiment
    experiment_id_new = configuration['experiment_id_new']
    experiment_dir_new = DataReader.locate_files(
        configuration['experiment_dir_new'], configpath)
    if not experiment_dir_new:
        raise FileNotFoundError("The directory {} "
                                "does not exist.".format(
                                    configuration['experiment_dir_new']))
    else:
        csvdir_new = normpath(join(experiment_dir_new, 'output'))
        figdir_new = normpath(join(experiment_dir_new, 'figure'))
        if not exists(csvdir_new) or not exists(figdir_new):
            raise FileNotFoundError("The directory {} does not contain "
                                    "the output of an rsmtool "
                                    "experiment.".format(experiment_dir_new))

    check_experiment_id(experiment_dir_new, experiment_id_new)

    # are there specific general report sections we want to include?
    general_report_sections = configuration['general_sections']

    # what about the special or custom sections?
    special_report_sections = configuration['special_sections']

    custom_report_section_paths = configuration['custom_sections']

    # if custom report sections exist, locate sections; otherwise, create empty list
    if custom_report_section_paths:
        logger.info('Locating custom report sections')
        custom_report_sections = Reporter.locate_custom_sections(
            custom_report_section_paths, configpath)
    else:
        custom_report_sections = []

    # get the section order
    section_order = configuration['section_order']

    # get the subgroups if any
    subgroups = configuration.get('subgroups')

    # Initialize reporter
    reporter = Reporter()

    chosen_notebook_files = reporter.get_ordered_notebook_files(
        general_report_sections,
        special_report_sections,
        custom_report_sections,
        section_order,
        subgroups,
        model_type=None,
        context='rsmcompare')

    # add chosen notebook files to configuration
    configuration['chosen_notebook_files'] = chosen_notebook_files

    # now generate the comparison report
    logger.info('Starting report generation.')
    reporter.create_comparison_report(configuration, csvdir_old, figdir_old,
                                      csvdir_new, figdir_new, output_dir)
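
A minimal usage sketch of run_comparison() per the docstring above; the configuration filename and output directory here are hypothetical placeholders, and the JSON file would need to define experiment_id_old, experiment_dir_old, experiment_id_new, and experiment_dir_new:

run_comparison('comparison_config.json', 'comparison_output')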
Example #7
def run_experiment(config_file_or_obj, output_dir):
    """
    Run an RSMTool experiment using the given configuration
    file and generate all outputs in the given directory.

    Parameters
    ----------
    config_file_or_obj : str or Configuration
        Path to the experiment configuration file.
        Users can also pass a `Configuration` object that is in memory.
    output_dir : str
        Path to the experiment output directory.

    Raises
    ------
    ValueError
        If any of the required fields are missing or ill-specified.
    """

    logger = logging.getLogger(__name__)

    # create the 'output' and the 'figure' sub-directories
    # where all the experiment output such as the CSV files
    # and the box plots will be saved

    # Get absolute paths to output directories
    csvdir = abspath(join(output_dir, 'output'))
    figdir = abspath(join(output_dir, 'figure'))
    reportdir = abspath(join(output_dir, 'report'))
    featuredir = abspath(join(output_dir, 'feature'))

    # Make directories, if necessary
    makedirs(csvdir, exist_ok=True)
    makedirs(figdir, exist_ok=True)
    makedirs(reportdir, exist_ok=True)

    # Allow users to pass a Configuration object to the
    # `config_file_or_obj` argument, rather than reading from file
    if not isinstance(config_file_or_obj, Configuration):

        # Instantiate configuration parser object
        parser = ConfigurationParser.get_configparser(config_file_or_obj)
        configuration = parser.read_normalize_validate_and_process_config(
            config_file_or_obj)

        # get the directory where the configuration file lives
        configpath = dirname(config_file_or_obj)

    else:

        configuration = config_file_or_obj
        if configuration.filepath is not None:
            configpath = dirname(configuration.filepath)
        else:
            configpath = getcwd()

    logger.info('Saving configuration file.')
    configuration.save(output_dir)

    # Get output format
    file_format = configuration.get('file_format', 'csv')

    # Get DataWriter object
    writer = DataWriter(configuration['experiment_id'])

    # Get the paths and names for the DataReader

    (file_names, file_paths_org) = configuration.get_names_and_paths(
        ['train_file', 'test_file', 'features', 'feature_subset_file'],
        ['train', 'test', 'feature_specs', 'feature_subset_specs'])

    file_paths = DataReader.locate_files(file_paths_org, configpath)

    # if there are any missing files after trying to locate
    # all expected files, raise an error
    if None in file_paths:
        missing_file_paths = [
            file_paths_org[idx] for idx, path in enumerate(file_paths)
            if path is None
        ]
        raise FileNotFoundError('The following files were not found: '
                                '{}'.format(repr(missing_file_paths)))

    # Use the default converter for both train and test
    converters = {
        'train': configuration.get_default_converter(),
        'test': configuration.get_default_converter()
    }

    logger.info('Reading in all data from files.')

    # Initialize the reader
    reader = DataReader(file_paths, file_names, converters)
    data_container = reader.read()

    logger.info('Preprocessing all features.')

    # Initialize the processor
    processor = FeaturePreprocessor()

    (processed_config,
     processed_container) = processor.process_data(configuration,
                                                   data_container)

    # Rename certain frames with more descriptive names
    # for writing out experiment files
    rename_dict = {
        'train_excluded': 'train_excluded_responses',
        'test_excluded': 'test_excluded_responses',
        'train_length': 'train_response_lengths',
        'train_flagged': 'train_responses_with_excluded_flags',
        'test_flagged': 'test_responses_with_excluded_flags'
    }

    logger.info('Saving training and test set data to disk.')

    # Write out files
    writer.write_experiment_output(
        csvdir,
        processed_container, [
            'train_features', 'test_features', 'train_metadata',
            'test_metadata', 'train_other_columns', 'test_other_columns',
            'train_preprocessed_features', 'test_preprocessed_features',
            'train_excluded', 'test_excluded', 'train_length',
            'test_human_scores', 'train_flagged', 'test_flagged'
        ],
        rename_dict,
        file_format=file_format)

    # Initialize the analyzer
    analyzer = Analyzer()

    (analyzed_config,
     analyzed_container) = analyzer.run_data_composition_analyses_for_rsmtool(
         processed_container, processed_config)

    # Write out files
    writer.write_experiment_output(csvdir,
                                   analyzed_container,
                                   file_format=file_format)

    logger.info('Training {} model.'.format(processed_config['model_name']))

    # Initialize modeler
    modeler = Modeler()

    modeler.train(processed_config, processed_container, csvdir, figdir,
                  file_format)

    # Identify the features used by the model
    selected_features = modeler.get_feature_names()

    # Add selected features to processed configuration
    processed_config['selected_features'] = selected_features

    # Write out files
    writer.write_feature_csv(featuredir,
                             processed_container,
                             selected_features,
                             file_format=file_format)

    features_data_container = processed_container.copy()

    # Get selected feature info, and write out to file
    df_feature_info = features_data_container.feature_info.copy()
    df_selected_feature_info = df_feature_info[df_feature_info['feature'].isin(
        selected_features)]
    selected_feature_dataset_dict = {
        'name': 'selected_feature_info',
        'frame': df_selected_feature_info
    }

    features_data_container.add_dataset(selected_feature_dataset_dict,
                                        update=True)

    writer.write_experiment_output(
        csvdir,
        features_data_container,
        dataframe_names=['selected_feature_info'],
        new_names_dict={'selected_feature_info': 'feature'},
        file_format=file_format)

    logger.info('Running analyses on training set.')

    (train_analyzed_config,
     train_analyzed_container) = analyzer.run_training_analyses(
         processed_container, processed_config)

    # Write out files
    writer.write_experiment_output(csvdir,
                                   train_analyzed_container,
                                   reset_index=True,
                                   file_format=file_format)

    # Use only selected features for predictions
    columns_for_prediction = ['spkitemid', 'sc1'] + selected_features
    train_for_prediction = processed_container.train_preprocessed_features[
        columns_for_prediction]
    test_for_prediction = processed_container.test_preprocessed_features[
        columns_for_prediction]

    logged_str = 'Generating training and test set predictions'
    logged_str += ' (expected scores).' if configuration[
        'predict_expected_scores'] else '.'
    logger.info(logged_str)
    (pred_config, pred_data_container) = modeler.predict_train_and_test(
        train_for_prediction, test_for_prediction, processed_config)

    # Write out files
    writer.write_experiment_output(
        csvdir,
        pred_data_container,
        new_names_dict={'pred_test': 'pred_processed'},
        file_format=file_format)

    original_coef_file = join(
        csvdir, '{}_coefficients.{}'.format(pred_config['experiment_id'],
                                            file_format))

    # If coefficients file exists, then generate
    # scaled coefficients and save to file
    if exists(original_coef_file):
        logger.info('Scaling the coefficients and saving them to disk')
        try:

            # Scale coefficients, and return DataContainer w/ scaled coefficients
            scaled_data_container = modeler.scale_coefficients(pred_config)

            # Write out files to disk
            writer.write_experiment_output(csvdir,
                                           scaled_data_container,
                                           file_format=file_format)

        except AttributeError:
            raise ValueError(
                "It appears you are trying to save two different "
                "experiments to the same directory using the same "
                "ID. Please clear the content of the directory and "
                "rerun both experiments using different "
                "experiment IDs.")

    # Add processed data_container frames to pred_data_container
    new_pred_data_container = pred_data_container + processed_container

    logger.info('Running prediction analyses.')
    (pred_analysis_config,
     pred_analysis_data_container) = analyzer.run_prediction_analyses(
         new_pred_data_container, pred_config)

    # Write out files
    writer.write_experiment_output(csvdir,
                                   pred_analysis_data_container,
                                   reset_index=True,
                                   file_format=file_format)
    # Initialize reporter
    reporter = Reporter()

    # generate the report
    logger.info('Starting report generation.')
    reporter.create_report(processed_config, csvdir, figdir)
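
A minimal usage sketch of run_experiment(); the paths are hypothetical, and per the docstring an in-memory Configuration object may be passed instead of a file path:

run_experiment('experiment_config.json', 'experiment_output')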
Example #8
def run_summary(config_file_or_obj, output_dir):
    """
    Run an rsmsummarize experiment using the given configuration
    file and generate all outputs in the given directory.

    Parameters
    ----------
    config_file_or_obj : str or configuration_parser.Configuration
        Path to the experiment configuration file.
        Users can also pass a `Configuration` object that is in memory.
    output_dir : str
        Path to the experiment output directory.

    Raises
    ------
    ValueError
        If any of the required fields are missing or ill-specified.
    """
    logger = logging.getLogger(__name__)

    # create the 'output' and the 'figure' sub-directories
    # where all the experiment output such as the CSV files
    # and the box plots will be saved
    csvdir = abspath(join(output_dir, 'output'))
    figdir = abspath(join(output_dir, 'figure'))
    reportdir = abspath(join(output_dir, 'report'))

    os.makedirs(csvdir, exist_ok=True)
    os.makedirs(figdir, exist_ok=True)
    os.makedirs(reportdir, exist_ok=True)

    # Allow users to pass a Configuration object to the
    # `config_file_or_obj` argument, rather than reading from file
    if not isinstance(config_file_or_obj, Configuration):

        # Instantiate configuration parser object
        parser = ConfigurationParser.get_configparser(config_file_or_obj)
        configuration = parser.read_normalize_validate_and_process_config(config_file_or_obj,
                                                                          context='rsmsummarize')

        # get the directory where the configuration file lives
        configpath = dirname(config_file_or_obj)

    else:

        configuration = config_file_or_obj
        if configuration.filepath is not None:
            configpath = dirname(configuration.filepath)
        else:
            configpath = os.getcwd()

    logger.info('Saving configuration file.')
    configuration.save(output_dir)

    # get the list of the experiment dirs
    experiment_dirs = configuration['experiment_dirs']

    # Get experiment names if any
    experiment_names = configuration.get('experiment_names')
    experiment_names = experiment_names if experiment_names else [None] * len(experiment_dirs)
    dirs_with_names = zip(experiment_dirs, experiment_names)

    # check the experiment dirs and assemble the list of csvdir and jsons
    all_experiments = []
    for (experiment_dir, experiment_name) in dirs_with_names:
        experiments = check_experiment_dir(experiment_dir,
                                           experiment_name,
                                           configpath)
        all_experiments.extend(experiments)

    # get the subgroups if any
    # Note: at the moment no comparisons are reported for subgroups.
    # This option is included in the code to make it easier to add
    # subgroup comparisons in future versions
    subgroups = configuration.get('subgroups')

    general_report_sections = configuration['general_sections']

    # get any special sections that the user might have specified
    special_report_sections = configuration['special_sections']

    # get any custom sections and locate them to make sure
    # that they exist, otherwise raise an exception
    custom_report_section_paths = configuration['custom_sections']
    if custom_report_section_paths:
        logger.info('Locating custom report sections')
        custom_report_sections = Reporter.locate_custom_sections(custom_report_section_paths,
                                                                 configpath)
    else:
        custom_report_sections = []

    section_order = configuration['section_order']

    # Initialize reporter
    reporter = Reporter()

    # check all sections values and order and get the
    # ordered list of notebook files
    chosen_notebook_files = reporter.get_ordered_notebook_files(general_report_sections,
                                                                special_report_sections,
                                                                custom_report_sections,
                                                                section_order,
                                                                subgroups,
                                                                model_type=None,
                                                                context='rsmsummarize')

    # add chosen notebook files to configuration
    configuration['chosen_notebook_files'] = chosen_notebook_files

    # now generate the comparison report
    logger.info('Starting report generation')
    reporter.create_summary_report(configuration,
                                   all_experiments,
                                   csvdir)
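
A minimal usage sketch of run_summary(); the filename is hypothetical and the configuration would need to list the experiment_dirs to be summarized:

run_summary('summary_config.json', 'summary_output')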
Example #9
def setUp(self):
    self.reporter = Reporter()
Example #10
class TestReporter:
    def setUp(self):
        self.reporter = Reporter()

    def check_section_lists(self, context):
        general_sections = master_section_dict['general'][context]
        special_sections = master_section_dict['special'][context]
        overlap = set(general_sections) & set(special_sections)
        # check that there are general sections
        ok_(len(general_sections) > 0)
        # check that there is no overlap between the general and special
        # section lists
        eq_(len(overlap), 0)

    def test_check_section_lists_rsmtool(self):
        # sanity checks to make sure nothing went wrong when generating
        # master section list
        for context in ['rsmtool', 'rsmeval', 'rsmcompare']:
            yield self.check_section_lists, context

    @raises(ValueError)
    def test_check_section_order_not_enough_sections(self):
        general_sections = ['evaluation', 'sysinfo']
        special_sections = ['placeholder_special_section']
        custom_sections = ['custom.ipynb']
        subgroups = ['prompt', 'gender']
        section_order = general_sections
        self.reporter.get_ordered_notebook_files(
            general_sections,
            special_sections=special_sections,
            custom_sections=custom_sections,
            section_order=section_order,
            subgroups=subgroups)

    @raises(ValueError)
    def test_check_section_order_extra_sections(self):
        general_sections = ['evaluation', 'sysinfo']
        special_sections = ['placeholder_special_section']
        custom_sections = ['custom.ipynb']
        subgroups = []
        section_order = general_sections + special_sections + custom_sections + [
            'extra_section'
        ]
        self.reporter.get_ordered_notebook_files(
            general_sections,
            special_sections=special_sections,
            custom_sections=custom_sections,
            section_order=section_order,
            subgroups=subgroups)

    @raises(ValueError)
    def test_check_section_order_wrong_sections(self):
        general_sections = ['evaluation', 'sysinfo']
        special_sections = ['placeholder_special_section']
        custom_sections = ['custom.ipynb']
        subgroups = []
        section_order = ['extra_section1', 'extra_section2']
        self.reporter.get_ordered_notebook_files(
            general_sections,
            special_sections=special_sections,
            custom_sections=custom_sections,
            section_order=section_order,
            subgroups=subgroups)

    def test_check_section_order(self):
        general_sections = ['evaluation', 'sysinfo']
        special_sections = ['placeholder_special_section']
        custom_sections = ['foobar']
        section_order = (['foobar'] + special_sections + general_sections)
        self.reporter.check_section_order(
            general_sections + special_sections + custom_sections,
            section_order)

    def test_check_general_section_names_rsmtool(self):
        specified_list = ['data_description', 'preprocessed_features']
        self.reporter.check_section_names(specified_list, 'general')

    @raises(ValueError)
    def test_check_general_section_names_wrong_names_1(self):
        specified_list = ['data_description', 'feature_stats']
        self.reporter.check_section_names(specified_list, 'general')

    def test_check_general_section_names_rsmeval_1(self):
        specified_list = ['data_description', 'evaluation']
        self.reporter.check_section_names(specified_list,
                                          'general',
                                          context='rsmeval')

    @raises(ValueError)
    def test_check_general_section_names_rsmeval_2(self):
        specified_list = ['data_description', 'preprocessed_features']
        self.reporter.check_section_names(specified_list,
                                          'general',
                                          context='rsmeval')

    def test_check_general_section_names_rsmcompare(self):
        specified_list = ['feature_descriptives', 'evaluation']
        self.reporter.check_section_names(specified_list,
                                          'general',
                                          context='rsmcompare')

    @raises(ValueError)
    def test_check_general_section_names_wrong_names_2(self):
        specified_list = ['data_description', 'evaluation']
        self.reporter.check_section_names(specified_list,
                                          'general',
                                          context='rsmcompare')

    def test_determine_chosen_sections_default_general(self):
        general_sections = ['all']
        special_sections = []
        custom_sections = []
        subgroups = ['prompt']
        chosen_sections = self.reporter.determine_chosen_sections(
            general_sections, special_sections, custom_sections, subgroups)
        eq_(chosen_sections, general_section_list_rsmtool)

    def test_determine_chosen_sections_default_general_no_subgroups(self):
        general_sections = ['all']
        special_sections = []
        custom_sections = []
        subgroups = []
        no_subgroup_list = [
            s for s in general_section_list_rsmtool
            if not s.endswith('by_group')
        ]
        chosen_sections = self.reporter.determine_chosen_sections(
            general_sections, special_sections, custom_sections, subgroups)
        eq_(chosen_sections, no_subgroup_list)

    @raises(ValueError)
    def test_determine_chosen_sections_invalid_general(self):
        general_sections = ['data_description', 'foobar']
        special_sections = []
        custom_sections = []
        subgroups = []
        chosen_sections = self.reporter.determine_chosen_sections(
            general_sections, special_sections, custom_sections, subgroups)
        eq_(chosen_sections, general_section_list_rsmtool)

    @raises(ValueError)
    def test_determine_chosen_sections_no_subgroups(self):
        general_sections = ['data_description', 'data_description_by_group']
        special_sections = []
        custom_sections = []
        subgroups = []
        chosen_sections = self.reporter.determine_chosen_sections(
            general_sections, special_sections, custom_sections, subgroups)
        eq_(chosen_sections, general_section_list_rsmtool)

    def test_determine_chosen_sections_custom_general(self):
        general_sections = ['data_description', 'evaluation']
        special_sections = []
        custom_sections = []
        subgroups = []
        chosen_sections = self.reporter.determine_chosen_sections(
            general_sections, special_sections, custom_sections, subgroups)
        eq_(sorted(chosen_sections), sorted(general_sections))

    def test_determine_chosen_sections_default_general_with_special(self):
        general_sections = ['all']
        special_sections = ['placeholder_special_section']
        custom_sections = []
        subgroups = ['prompt']
        chosen_sections = self.reporter.determine_chosen_sections(
            general_sections, special_sections, custom_sections, subgroups)
        eq_(sorted(chosen_sections),
            sorted(general_section_list_rsmtool + special_sections))

    @raises(ValueError)
    def test_determine_chosen_sections_invalid_special(self):
        general_sections = ['all']
        special_sections = ['placeholder_special_section', 'foobar']
        custom_sections = []
        subgroups = ['prompt']
        self.reporter.determine_chosen_sections(general_sections,
                                                special_sections,
                                                custom_sections, subgroups)

    def test_determine_chosen_sections_custom_general_with_special(self):
        general_sections = ['data_description', 'evaluation']
        special_sections = ['placeholder_special_section']
        custom_sections = []
        subgroups = []
        chosen_sections = self.reporter.determine_chosen_sections(
            general_sections, special_sections, custom_sections, subgroups)
        eq_(sorted(chosen_sections),
            sorted(general_sections + special_sections))

    def test_determine_chosen_sections_default_general_with_subgroups(self):
        general_sections = ['all']
        special_sections = []
        custom_sections = []
        subgroups = ['prompt', 'gender']
        chosen_sections = self.reporter.determine_chosen_sections(
            general_sections, special_sections, custom_sections, subgroups)
        eq_(sorted(chosen_sections), sorted(general_section_list_rsmtool))

    def test_determine_chosen_sections_custom_general_with_special_subgroups_and_custom(
            self):
        general_sections = ['evaluation', 'sysinfo', 'evaluation_by_group']
        special_sections = ['placeholder_special_section']
        custom_sections = ['foobar.ipynb']
        subgroups = ['prompt', 'gender']
        chosen_sections = self.reporter.determine_chosen_sections(
            general_sections, special_sections, custom_sections, subgroups)
        eq_(sorted(chosen_sections),
            sorted(general_sections + special_sections + ['foobar']))

    def test_determine_chosen_sections_eval_default_general(self):
        general_sections = ['all']
        special_sections = []
        custom_sections = []
        subgroups = ['prompt']
        chosen_sections = self.reporter.determine_chosen_sections(
            general_sections,
            special_sections,
            custom_sections,
            subgroups,
            context='rsmeval')
        eq_(sorted(chosen_sections), sorted(general_section_list_rsmeval))

    def test_determine_chosen_sections_eval_custom_general(self):
        general_sections = ['data_description', 'consistency']
        special_sections = []
        custom_sections = []
        subgroups = []
        chosen_sections = self.reporter.determine_chosen_sections(
            general_sections,
            special_sections,
            custom_sections,
            subgroups,
            context='rsmeval')
        eq_(sorted(chosen_sections), sorted(general_sections))

    def test_determine_chosen_sections_eval_default_general_with_no_subgroups(
            self):
        general_sections = ['all']
        special_sections = []
        custom_sections = []
        subgroups = []
        no_subgroup_list = [
            s for s in general_section_list_rsmeval
            if not s.endswith('by_group')
        ]
        chosen_sections = self.reporter.determine_chosen_sections(
            general_sections,
            special_sections,
            custom_sections,
            subgroups,
            context='rsmeval')
        eq_(sorted(chosen_sections), sorted(no_subgroup_list))

    def test_determine_chosen_sections_eval_custom_general_with_special_and_subgroups(
            self):
        general_sections = [
            'data_description', 'consistency', 'data_description_by_group'
        ]
        special_sections = ['placeholder_special_section']
        custom_sections = []
        subgroups = ['prompt', 'gender']
        chosen_sections = self.reporter.determine_chosen_sections(
            general_sections,
            special_sections,
            custom_sections,
            subgroups,
            context='rsmeval')
        eq_(sorted(chosen_sections),
            sorted(general_sections + special_sections))

    def test_determine_chosen_sections_compare_default_general(self):
        general_sections = ['all']
        special_sections = []
        custom_sections = []
        subgroups = ['prompt']
        chosen_sections = self.reporter.determine_chosen_sections(
            general_sections,
            special_sections,
            custom_sections,
            subgroups,
            context='rsmcompare')
        eq_(sorted(chosen_sections), sorted(general_section_list_rsmcompare))

    def test_determine_chosen_sections_rsmcompare_custom_general(self):
        general_sections = ['feature_descriptives', 'evaluation']
        special_sections = []
        custom_sections = []
        subgroups = []
        chosen_sections = self.reporter.determine_chosen_sections(
            general_sections,
            special_sections,
            custom_sections,
            subgroups,
            context='rsmcompare')
        eq_(sorted(chosen_sections), sorted(general_sections))

    def test_determine_chosen_sections_rsmcompare_default_general_with_no_subgroups(
            self):

        general_sections = ['all']
        special_sections = []
        custom_sections = []
        subgroups = []
        no_subgroup_list = [
            s for s in general_section_list_rsmcompare
            if not s.endswith('by_group')
        ]
        chosen_sections = self.reporter.determine_chosen_sections(
            general_sections,
            special_sections,
            custom_sections,
            subgroups,
            context='rsmcompare')
        eq_(sorted(chosen_sections), sorted(no_subgroup_list))

    def test_determine_chosen_sections_rsmcompare_custom_general_with_special_and_subgroups(
            self):
        general_sections = ['feature_descriptives', 'evaluation']
        special_sections = ['placeholder_special_section']
        custom_sections = []
        subgroups = ['prompt', 'gender']
        chosen_sections = self.reporter.determine_chosen_sections(
            general_sections,
            special_sections,
            custom_sections,
            subgroups,
            context='rsmcompare')
        eq_(sorted(chosen_sections),
            sorted(general_sections + special_sections))

    def test_determine_chosen_sections_rsmsummarize_default_general(self):
        general_sections = ['all']
        special_sections = []
        custom_sections = []
        subgroups = ['prompt']
        chosen_sections = self.reporter.determine_chosen_sections(
            general_sections,
            special_sections,
            custom_sections,
            subgroups,
            context='rsmsummarize')
        eq_(sorted(chosen_sections), sorted(general_section_list_rsmsummarize))

    def test_determine_chosen_sections_rsmsummarize_custom_general(self):
        general_sections = ['evaluation']
        special_sections = []
        custom_sections = []
        subgroups = []
        chosen_sections = self.reporter.determine_chosen_sections(
            general_sections,
            special_sections,
            custom_sections,
            subgroups,
            context='rsmsummarize')
        eq_(sorted(chosen_sections), sorted(general_sections))

    def test_determine_chosen_sections_compare_custom_general_with_special_subgroups_and_custom(
            self):
        general_sections = ['feature_descriptives', 'evaluation']
        special_sections = ['placeholder_special_section']
        custom_sections = ['foobar.ipynb']
        subgroups = ['prompt', 'gender']
        chosen_sections = self.reporter.determine_chosen_sections(
            general_sections,
            special_sections,
            custom_sections,
            subgroups,
            context='rsmcompare')
        eq_(sorted(chosen_sections),
            sorted(general_sections + special_sections + ['foobar']))

    def test_get_ordered_notebook_files_default_rsmtool(self):
        general_sections = ['all']
        notebook_files = self.reporter.get_ordered_notebook_files(
            general_sections, model_type='skll', context='rsmtool')
        no_subgroup_list = [
            s for s in general_section_list_rsmtool
            if not s.endswith('by_group')
        ]
        section_list = ['header'] + no_subgroup_list + ['footer']

        # replace model section with skll_model.
        updated_section_list = [
            'skll_' + sname if sname == 'model' else sname
            for sname in section_list
        ]
        general_section_plus_extension = [
            s + '.ipynb' for s in updated_section_list
        ]
        expected_notebook_files = [
            join(notebook_path, s) for s in general_section_plus_extension
        ]
        eq_(notebook_files, expected_notebook_files)

    def test_get_ordered_notebook_files_custom_rsmtool(self):

        # custom and general sections, custom order and subgroups
        general_sections = [
            'data_description', 'pca', 'data_description_by_group'
        ]
        custom_sections = ['/test_path/custom.ipynb']
        special_sections = ['placeholder_special_section']
        subgroups = ['prompt']
        section_order = [
            'custom', 'data_description', 'pca', 'data_description_by_group',
            'placeholder_special_section'
        ]
        special_notebook_path = notebook_path_dict['special']['rsmtool']
        notebook_files = self.reporter.get_ordered_notebook_files(
            general_sections,
            custom_sections=custom_sections,
            special_sections=special_sections,
            section_order=section_order,
            subgroups=subgroups,
            model_type='skll',
            context='rsmtool')

        expected_notebook_files = (
            [join(notebook_path, 'header.ipynb')] +
            ['/test_path/custom.ipynb'] +
            [join(notebook_path, s) + '.ipynb'
             for s in ['data_description', 'pca', 'data_description_by_group']] +
            [join(special_notebook_path, 'placeholder_special_section.ipynb')] +
            [join(notebook_path, 'footer.ipynb')])
        eq_(notebook_files, expected_notebook_files)

    def test_get_ordered_notebook_files_default_rsmeval(self):
        general_sections = ['all']
        notebook_files = self.reporter.get_ordered_notebook_files(
            general_sections, context='rsmeval')
        no_subgroup_list = [
            s for s in general_section_list_rsmeval
            if not s.endswith('by_group')
        ]
        section_list = ['header'] + no_subgroup_list + ['footer']

        general_section_plus_extension = [
            '{}.ipynb'.format(s) for s in section_list
        ]
        expected_notebook_files = [
            join(notebook_path_dict['general']['rsmeval'], s)
            for s in general_section_plus_extension
        ]
        eq_(notebook_files, expected_notebook_files)

    def test_get_ordered_notebook_files_custom_rsmeval(self):

        # custom and general sections, custom order and subgroups

        general_sections = ['evaluation', 'consistency', 'evaluation_by_group']
        custom_sections = ['/test_path/custom.ipynb']
        subgroups = ['prompt']
        section_order = [
            'evaluation', 'consistency', 'custom', 'evaluation_by_group'
        ]
        notebook_path = notebook_path_dict['general']['rsmeval']
        notebook_files = self.reporter.get_ordered_notebook_files(
            general_sections,
            custom_sections=custom_sections,
            section_order=section_order,
            subgroups=subgroups,
            context='rsmeval')

        expected_notebook_files = (
            [join(notebook_path, 'header.ipynb')] + [
                join(notebook_path, s) + '.ipynb'
                for s in ['evaluation', 'consistency']
            ] + ['/test_path/custom.ipynb'] +
            [join(notebook_path, 'evaluation_by_group.ipynb')] +
            [join(notebook_path, 'footer.ipynb')])
        eq_(notebook_files, expected_notebook_files)

    def test_get_ordered_notebook_files_default_rsmcompare(self):
        general_sections = ['all']
        comparison_notebook_path = notebook_path_dict['general']['rsmcompare']
        notebook_files = self.reporter.get_ordered_notebook_files(
            general_sections, context='rsmcompare')
        no_subgroup_list = [
            s for s in general_section_list_rsmcompare
            if not s.endswith('by_group')
        ]
        section_list = ['header'] + no_subgroup_list + ['footer']

        general_section_plus_extension = [s + '.ipynb' for s in section_list]
        expected_notebook_files = [
            join(comparison_notebook_path, s)
            for s in general_section_plus_extension
        ]
        eq_(notebook_files, expected_notebook_files)

    def test_get_ordered_notebook_files_custom_rsmcompare(self):
        # custom and general sections, custom order and subgroups
        general_sections = [
            'feature_descriptives', 'score_distributions', 'features_by_group'
        ]
        custom_sections = ['/test_path/custom.ipynb']
        subgroups = ['prompt']
        section_order = [
            'feature_descriptives', 'score_distributions', 'custom',
            'features_by_group'
        ]
        comparison_notebook_path = notebook_path_dict['general']['rsmcompare']
        notebook_files = self.reporter.get_ordered_notebook_files(
            general_sections,
            custom_sections=custom_sections,
            section_order=section_order,
            subgroups=subgroups,
            context='rsmcompare')

        expected_notebook_files = (
            [join(comparison_notebook_path, 'header.ipynb')] + [
                join(comparison_notebook_path, s) + '.ipynb'
                for s in ['feature_descriptives', 'score_distributions']
            ] + ['/test_path/custom.ipynb'] +
            [join(comparison_notebook_path, 'features_by_group.ipynb')] +
            [join(comparison_notebook_path, 'footer.ipynb')])
        eq_(notebook_files, expected_notebook_files)

    def test_get_ordered_notebook_files_custom_rsmsummarize(self):
        # custom and general sections, custom order and subgroups
        general_sections = ['evaluation']
        custom_sections = ['/test_path/custom.ipynb']
        subgroups = ['prompt']
        section_order = ['custom', 'evaluation']
        summary_notebook_path = notebook_path_dict['general']['rsmsummarize']
        notebook_files = self.reporter.get_ordered_notebook_files(
            general_sections,
            custom_sections=custom_sections,
            section_order=section_order,
            subgroups=subgroups,
            context='rsmsummarize')

        expected_notebook_files = (
            [join(summary_notebook_path, 'header.ipynb')] +
            ['/test_path/custom.ipynb'] + [
                join(summary_notebook_path, s) + '.ipynb'
                for s in ['evaluation']
            ] + [join(summary_notebook_path, 'footer.ipynb')])
        eq_(notebook_files, expected_notebook_files)

    def test_get_section_file_map_rsmtool(self):
        special_sections = ['placeholder']
        custom_sections = ['/path/notebook.ipynb']
        section_file_map = self.reporter.get_section_file_map(special_sections,
                                                              custom_sections,
                                                              model_type='R')
        eq_(section_file_map['model'], join(notebook_path, 'r_model.ipynb'))
        eq_(section_file_map['notebook'], '/path/notebook.ipynb')
        eq_(section_file_map['placeholder'],
            normpath('special_notebook_path/placeholder.ipynb'))

    def test_get_section_file_map_rsmeval(self):
        special_sections = ['placeholder']
        custom_sections = ['/path/notebook.ipynb']
        section_file_map = self.reporter.get_section_file_map(
            special_sections, custom_sections, context='rsmeval')
        eq_(section_file_map['data_description'],
            join(notebook_path, 'data_description.ipynb'))
        eq_(section_file_map['notebook'], '/path/notebook.ipynb')
        eq_(section_file_map['placeholder'],
            normpath('special_notebook_path/placeholder.ipynb'))

    def test_get_section_file_map_rsmcompare(self):
        special_sections = ['placeholder']
        custom_sections = ['/path/notebook.ipynb']
        section_file_map = self.reporter.get_section_file_map(
            special_sections, custom_sections, context='rsmcompare')
        eq_(section_file_map['evaluation'],
            join(comparison_notebook_path, 'evaluation.ipynb'))
        eq_(section_file_map['notebook'], '/path/notebook.ipynb')
        eq_(section_file_map['placeholder'],
            normpath('special_notebook_path/placeholder.ipynb'))

    def test_get_section_file_map_rsmsummarize(self):
        special_sections = ['placeholder']
        custom_sections = ['/path/notebook.ipynb']
        section_file_map = self.reporter.get_section_file_map(
            special_sections, custom_sections, context='rsmsummarize')
        eq_(section_file_map['evaluation'],
            join(summary_notebook_path, 'evaluation.ipynb'))
        eq_(section_file_map['notebook'], '/path/notebook.ipynb')
        eq_(section_file_map['placeholder'],
            normpath('special_notebook_path/placeholder.ipynb'))
Example #11
def run_evaluation(config_file_or_obj, output_dir):
    """
    Run an `rsmeval` experiment using the given configuration
    file and generate all outputs in the given directory.

    Parameters
    ----------
    config_file_or_obj : str or configuration_parser.Configuration
        Path to the experiment configuration file.
        Users can also pass a `Configuration` object that is in memory.
    output_dir : str
        Path to the experiment output directory.

    Raises
    ------
    ValueError
        If any of the required fields are missing or ill-specified.
    """
    logger = logging.getLogger(__name__)

    # create the 'output' and the 'figure' sub-directories
    # where all the experiment output such as the CSV files
    # and the box plots will be saved
    csvdir = abspath(join(output_dir, 'output'))
    figdir = abspath(join(output_dir, 'figure'))
    reportdir = abspath(join(output_dir, 'report'))
    os.makedirs(csvdir, exist_ok=True)
    os.makedirs(figdir, exist_ok=True)
    os.makedirs(reportdir, exist_ok=True)

    # Allow users to pass a Configuration object to the
    # `config_file_or_obj` argument, rather than reading from file
    if not isinstance(config_file_or_obj, Configuration):

        # Instantiate configuration parser object
        parser = ConfigurationParser.get_configparser(config_file_or_obj)
        configuration = parser.read_normalize_validate_and_process_config(
            config_file_or_obj, context='rsmeval')

        # get the directory where the configuration file lives
        configpath = dirname(config_file_or_obj)

    else:

        configuration = config_file_or_obj
        if configuration.filepath is not None:
            configpath = dirname(configuration.filepath)
        else:
            configpath = os.getcwd()

    logger.info('Saving configuration file.')
    configuration.save(output_dir)

    # Get output format
    file_format = configuration.get('file_format', 'csv')

    # Get DataWriter object
    writer = DataWriter(configuration['experiment_id'])

    # Make sure prediction file can be located
    if not DataReader.locate_files(configuration['predictions_file'],
                                   configpath):
        raise FileNotFoundError('Error: Predictions file {} '
                                'not found.\n'.format(
                                    configuration['predictions_file']))

    scale_with = configuration.get('scale_with')

    # scale_with can be one of the following:
    # (a) None       : the predictions are assumed to be 'raw' and should be used as is
    #                  when computing the metrics; the names for the final columns are
    #                  'raw', 'raw_trim' and 'raw_trim_round'.
    # (b) 'asis'     : the predictions are assumed to be pre-scaled and should be used as is
    #                  when computing the metrics; the names for the final columns are
    #                  'scale', 'scale_trim' and 'scale_trim_round'.
    # (c) a CSV file : the predictions are assumed to be 'raw' and should be scaled
    #                  before computing the metrics; the names for the final columns are
    #                  'scale', 'scale_trim' and 'scale_trim_round'.
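    #
    # For illustration (hypothetical values): a configuration might set
    # "scale_with": "asis" to use pre-scaled predictions as is, or
    # "scale_with": "some_scaling_data.csv" to scale raw predictions first.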

    # Check whether we want to do scaling
    do_scaling = (scale_with is not None and scale_with != 'asis')

    # The paths to files and names for data container properties
    paths = ['predictions_file']
    names = ['predictions']

    # If we want to do scaling, get the scale file
    if do_scaling:

        # Make sure scale file can be located
        scale_file_location = DataReader.locate_files(scale_with, configpath)
        if not scale_file_location:
            raise FileNotFoundError('Could not find scaling file {}.'
                                    ''.format(scale_file_location))

        paths.append('scale_with')
        names.append('scale')

    # Get the paths, names, and converters for the DataReader
    (file_names, file_paths) = configuration.get_names_and_paths(paths, names)

    file_paths = DataReader.locate_files(file_paths, configpath)

    converters = {'predictions': configuration.get_default_converter()}

    logger.info('Reading predictions: {}.'.format(
        configuration['predictions_file']))

    # Initialize the reader
    reader = DataReader(file_paths, file_names, converters)
    data_container = reader.read()

    logger.info('Preprocessing predictions.')

    # Initialize the processor
    processor = FeaturePreprocessor()

    (processed_config,
     processed_container) = processor.process_data(configuration,
                                                   data_container,
                                                   context='rsmeval')

    logger.info('Saving pre-processed predictions and metadata to disk.')
    writer.write_experiment_output(csvdir,
                                   processed_container,
                                   new_names_dict={
                                       'pred_test': 'pred_processed',
                                       'test_excluded':
                                       'test_excluded_responses'
                                   },
                                   file_format=file_format)

    # Initialize the analyzer
    analyzer = Analyzer()

    # do the data composition stats
    (analyzed_config,
     analyzed_container) = analyzer.run_data_composition_analyses_for_rsmeval(
         processed_container, processed_config)
    # Write out files
    writer.write_experiment_output(csvdir,
                                   analyzed_container,
                                   file_format=file_format)

    for_pred_data_container = analyzed_container + processed_container

    # run the analyses on the predictions of the model
    logger.info('Running analyses on predictions.')
    (pred_analysis_config,
     pred_analysis_data_container) = analyzer.run_prediction_analyses(
         for_pred_data_container, analyzed_config)

    writer.write_experiment_output(csvdir,
                                   pred_analysis_data_container,
                                   reset_index=True,
                                   file_format=file_format)

    # Initialize reporter
    reporter = Reporter()

    # generate the report
    logger.info('Starting report generation.')
    reporter.create_report(processed_config, csvdir, figdir, context='rsmeval')
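
A minimal usage sketch of run_evaluation(); the paths are hypothetical and the configuration must point at an existing predictions_file:

run_evaluation('evaluation_config.json', 'evaluation_output')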
Example #12
def setUp(self):
    self.reporter = Reporter()
Example #13
class TestReporter:

    def setUp(self):
        self.reporter = Reporter()

    def check_section_lists(self, context):
        general_sections = master_section_dict['general'][context]
        special_sections = master_section_dict['special'][context]
        overlap = set(general_sections) & set(special_sections)
        # check that there are general sections
        ok_(len(general_sections) > 0)
        # check that there is no overlap between the general and special
        # section lists
        eq_(len(overlap), 0)

    def test_check_section_lists_rsmtool(self):
        # sanity checks to make sure nothing went wrong when generating
        # master section list
        for context in ['rsmtool', 'rsmeval', 'rsmcompare']:
            yield self.check_section_lists, context

    @raises(ValueError)
    def test_check_section_order_not_enough_sections(self):
        general_sections = ['evaluation', 'sysinfo']
        special_sections = ['placeholder_special_section']
        custom_sections = ['custom.ipynb']
        subgroups = ['prompt', 'gender']
        section_order = general_sections
        self.reporter.get_ordered_notebook_files(general_sections,
                                                 special_sections=special_sections,
                                                 custom_sections=custom_sections,
                                                 section_order=section_order,
                                                 subgroups=subgroups)

    @raises(ValueError)
    def test_check_section_order_extra_sections(self):
        general_sections = ['evaluation', 'sysinfo']
        special_sections = ['placeholder_special_section']
        custom_sections = ['custom.ipynb']
        subgroups = []
        section_order = general_sections + special_sections + custom_sections + ['extra_section']
        self.reporter.get_ordered_notebook_files(general_sections,
                                                 special_sections=special_sections,
                                                 custom_sections=custom_sections,
                                                 section_order=section_order,
                                                 subgroups=subgroups)

    @raises(ValueError)
    def test_check_section_order_wrong_sections(self):
        general_sections = ['evaluation', 'sysinfo']
        special_sections = ['placeholder_special_section']
        custom_sections = ['custom.ipynb']
        subgroups = []
        section_order = ['extra_section1', 'extra_section2']
        self.reporter.get_ordered_notebook_files(general_sections,
                                                 special_sections=special_sections,
                                                 custom_sections=custom_sections,
                                                 section_order=section_order,
                                                 subgroups=subgroups)

    def test_check_section_order(self):
        general_sections = ['evaluation', 'sysinfo']
        special_sections = ['placeholder_special_section']
        custom_sections = ['foobar']
        section_order = (['foobar'] +
                         special_sections +
                         general_sections)
        self.reporter.check_section_order(general_sections +
                                          special_sections +
                                          custom_sections,
                                          section_order)

    def test_check_general_section_names_rsmtool(self):
        specified_list = ['data_description', 'preprocessed_features']
        self.reporter.check_section_names(specified_list, 'general')

    @raises(ValueError)
    def test_check_general_section_names_wrong_names_1(self):
        specified_list = ['data_description', 'feature_stats']
        self.reporter.check_section_names(specified_list, 'general')

    def test_check_general_section_names_rsmeval_1(self):
        specified_list = ['data_description', 'evaluation']
        self.reporter.check_section_names(specified_list, 'general', context='rsmeval')

    @raises(ValueError)
    def test_check_general_section_names_rsmeval_2(self):
        specified_list = ['data_description', 'preprocessed_features']
        self.reporter.check_section_names(specified_list, 'general', context='rsmeval')

    def test_check_general_section_names_rsmcompare(self):
        specified_list = ['feature_descriptives', 'evaluation']
        self.reporter.check_section_names(specified_list, 'general', context='rsmcompare')

    @raises(ValueError)
    def test_check_general_section_names_wrong_names_2(self):
        specified_list = ['data_description', 'evaluation']
        self.reporter.check_section_names(specified_list, 'general', context='rsmcompare')

    def test_determine_chosen_sections_default_general(self):
        general_sections = ['all']
        special_sections = []
        custom_sections = []
        subgroups = ['prompt']
        chosen_sections = self.reporter.determine_chosen_sections(general_sections,
                                                                  special_sections,
                                                                  custom_sections,
                                                                  subgroups)
        eq_(chosen_sections, general_section_list_rsmtool)

    def test_determine_chosen_sections_default_general_no_subgroups(self):
        general_sections = ['all']
        special_sections = []
        custom_sections = []
        subgroups = []
        no_subgroup_list = [s for s in general_section_list_rsmtool
                            if not s.endswith('by_group')]
        chosen_sections = self.reporter.determine_chosen_sections(general_sections,
                                                                  special_sections,
                                                                  custom_sections,
                                                                  subgroups)
        eq_(chosen_sections, no_subgroup_list)

    @raises(ValueError)
    def test_determine_chosen_sections_invalid_general(self):
        general_sections = ['data_description', 'foobar']
        special_sections = []
        custom_sections = []
        subgroups = []
        chosen_sections = self.reporter.determine_chosen_sections(general_sections,
                                                                  special_sections,
                                                                  custom_sections,
                                                                  subgroups)
        eq_(chosen_sections, general_section_list_rsmtool)

    @raises(ValueError)
    def test_determine_chosen_sections_no_subgroups(self):
        general_sections = ['data_description', 'data_description_by_group']
        special_sections = []
        custom_sections = []
        subgroups = []
        chosen_sections = self.reporter.determine_chosen_sections(general_sections,
                                                                  special_sections,
                                                                  custom_sections,
                                                                  subgroups)
        eq_(chosen_sections, general_section_list_rsmtool)

    def test_determine_chosen_sections_custom_general(self):
        general_sections = ['data_description', 'evaluation']
        special_sections = []
        custom_sections = []
        subgroups = []
        chosen_sections = self.reporter.determine_chosen_sections(general_sections,
                                                                  special_sections,
                                                                  custom_sections,
                                                                  subgroups)
        eq_(sorted(chosen_sections), sorted(general_sections))

    def test_determine_chosen_sections_default_general_with_special(self):
        general_sections = ['all']
        special_sections = ['placeholder_special_section']
        custom_sections = []
        subgroups = ['prompt']
        chosen_sections = self.reporter.determine_chosen_sections(general_sections,
                                                                  special_sections,
                                                                  custom_sections,
                                                                  subgroups)
        eq_(sorted(chosen_sections),
            sorted(general_section_list_rsmtool + special_sections))

    @raises(ValueError)
    def test_determine_chosen_sections_invalid_special(self):
        general_sections = ['all']
        special_sections = ['placeholder_special_section', 'foobar']
        custom_sections = []
        subgroups = ['prompt']
        self.reporter.determine_chosen_sections(general_sections,
                                                special_sections,
                                                custom_sections,
                                                subgroups)

    def test_determine_chosen_sections_custom_general_with_special(self):
        general_sections = ['data_description', 'evaluation']
        special_sections = ['placeholder_special_section']
        custom_sections = []
        subgroups = []
        chosen_sections = self.reporter.determine_chosen_sections(general_sections,
                                                                  special_sections,
                                                                  custom_sections,
                                                                  subgroups)
        eq_(sorted(chosen_sections),
            sorted(general_sections + special_sections))

    def test_determine_chosen_sections_default_general_with_subgroups(self):
        general_sections = ['all']
        special_sections = []
        custom_sections = []
        subgroups = ['prompt', 'gender']
        chosen_sections = self.reporter.determine_chosen_sections(general_sections,
                                                                  special_sections,
                                                                  custom_sections,
                                                                  subgroups)
        eq_(sorted(chosen_sections),
            sorted(general_section_list_rsmtool))

    def test_determine_chosen_sections_custom_general_with_special_subgroups_and_custom(self):
        general_sections = ['evaluation', 'sysinfo', 'evaluation_by_group']
        special_sections = ['placeholder_special_section']
        custom_sections = ['foobar.ipynb']
        subgroups = ['prompt', 'gender']
        chosen_sections = self.reporter.determine_chosen_sections(general_sections,
                                                                  special_sections,
                                                                  custom_sections,
                                                                  subgroups)
        eq_(sorted(chosen_sections),
            sorted(general_sections +
                   special_sections +
                   ['foobar']))

    def test_determine_chosen_sections_eval_default_general(self):
        general_sections = ['all']
        special_sections = []
        custom_sections = []
        subgroups = ['prompt']
        chosen_sections = self.reporter.determine_chosen_sections(general_sections,
                                                                  special_sections,
                                                                  custom_sections,
                                                                  subgroups,
                                                                  context='rsmeval')
        eq_(sorted(chosen_sections), sorted(general_section_list_rsmeval))

    def test_determine_chosen_sections_eval_custom_general(self):
        general_sections = ['data_description', 'consistency']
        special_sections = []
        custom_sections = []
        subgroups = []
        chosen_sections = self.reporter.determine_chosen_sections(general_sections,
                                                                  special_sections,
                                                                  custom_sections,
                                                                  subgroups,
                                                                  context='rsmeval')
        eq_(sorted(chosen_sections), sorted(general_sections))

    def test_determine_chosen_sections_eval_default_general_with_no_subgroups(self):
        general_sections = ['all']
        special_sections = []
        custom_sections = []
        subgroups = []
        no_subgroup_list = [s for s in general_section_list_rsmeval
                            if not s.endswith('by_group')]
        chosen_sections = self.reporter.determine_chosen_sections(general_sections,
                                                                  special_sections,
                                                                  custom_sections,
                                                                  subgroups,
                                                                  context='rsmeval')
        eq_(sorted(chosen_sections), sorted(no_subgroup_list))

    def test_determine_chosen_sections_eval_custom_general_with_special_and_subgroups(self):
        general_sections = ['data_description', 'consistency', 'data_description_by_group']
        special_sections = ['placeholder_special_section']
        custom_sections = []
        subgroups = ['prompt', 'gender']
        chosen_sections = self.reporter.determine_chosen_sections(general_sections,
                                                                  special_sections,
                                                                  custom_sections,
                                                                  subgroups,
                                                                  context='rsmeval')
        eq_(sorted(chosen_sections), sorted(general_sections +
                                            special_sections))

    def test_determine_chosen_sections_compare_default_general(self):
        general_sections = ['all']
        special_sections = []
        custom_sections = []
        subgroups = ['prompt']
        chosen_sections = self.reporter.determine_chosen_sections(general_sections,
                                                                  special_sections,
                                                                  custom_sections,
                                                                  subgroups,
                                                                  context='rsmcompare')
        eq_(sorted(chosen_sections), sorted(general_section_list_rsmcompare))

    def test_determine_chosen_sections_rsmcompare_custom_general(self):
        general_sections = ['feature_descriptives',
                            'evaluation']
        special_sections = []
        custom_sections = []
        subgroups = []
        chosen_sections = self.reporter.determine_chosen_sections(general_sections,
                                                                  special_sections,
                                                                  custom_sections,
                                                                  subgroups,
                                                                  context='rsmcompare')
        eq_(sorted(chosen_sections), sorted(general_sections))

    def test_determine_chosen_sections_rsmcompare_default_general_with_no_subgroups(self):

        general_sections = ['all']
        special_sections = []
        custom_sections = []
        subgroups = []
        no_subgroup_list = [s for s in general_section_list_rsmcompare
                            if not s.endswith('by_group')]
        chosen_sections = self.reporter.determine_chosen_sections(general_sections,
                                                                  special_sections,
                                                                  custom_sections,
                                                                  subgroups,
                                                                  context='rsmcompare')
        eq_(sorted(chosen_sections), sorted(no_subgroup_list))

    def test_determine_chosen_sections_rsmcompare_custom_general_with_special_and_subgroups(self):
        general_sections = ['feature_descriptives',
                            'evaluation']
        special_sections = ['placeholder_special_section']
        custom_sections = []
        subgroups = ['prompt', 'gender']
        chosen_sections = self.reporter.determine_chosen_sections(general_sections,
                                                                  special_sections,
                                                                  custom_sections,
                                                                  subgroups,
                                                                  context='rsmcompare')
        eq_(sorted(chosen_sections), sorted(general_sections +
                                            special_sections))

    def test_determine_chosen_sections_rsmsummarize_default_general(self):
        general_sections = ['all']
        special_sections = []
        custom_sections = []
        subgroups = ['prompt']
        chosen_sections = self.reporter.determine_chosen_sections(general_sections,
                                                                  special_sections,
                                                                  custom_sections,
                                                                  subgroups,
                                                                  context='rsmsummarize')
        eq_(sorted(chosen_sections), sorted(general_section_list_rsmsummarize))

    def test_determine_chosen_sections_rsmsummarize_custom_general(self):
        general_sections = ['evaluation']
        special_sections = []
        custom_sections = []
        subgroups = []
        chosen_sections = self.reporter.determine_chosen_sections(general_sections,
                                                                  special_sections,
                                                                  custom_sections,
                                                                  subgroups,
                                                                  context='rsmsummarize')
        eq_(sorted(chosen_sections), sorted(general_sections))

    def test_determine_chosen_sections_compare_custom_general_with_special_subgroups_and_custom(self):
        general_sections = ['feature_descriptives',
                            'evaluation']
        special_sections = ['placeholder_special_section']
        custom_sections = ['foobar.ipynb']
        subgroups = ['prompt', 'gender']
        chosen_sections = self.reporter.determine_chosen_sections(general_sections,
                                                                  special_sections,
                                                                  custom_sections,
                                                                  subgroups,
                                                                  context='rsmcompare')
        eq_(sorted(chosen_sections), sorted(general_sections +
                                            special_sections +
                                            ['foobar']))

    def test_get_ordered_notebook_files_default_rsmtool(self):
        general_sections = ['all']
        notebook_files = self.reporter.get_ordered_notebook_files(general_sections,
                                                                  model_type='skll',
                                                                  context='rsmtool')
        no_subgroup_list = [s for s in general_section_list_rsmtool
                            if not s.endswith('by_group')]
        section_list = ['header'] + no_subgroup_list + ['footer']

        # replace model section with skll_model.
        updated_section_list = ['skll_' + sname if sname == 'model' else sname for sname in section_list]
        general_section_plus_extension = [s + '.ipynb' for s in updated_section_list]
        expected_notebook_files = [join(notebook_path, s)
                                   for s in general_section_plus_extension]
        eq_(notebook_files, expected_notebook_files)

    def test_get_ordered_notebook_files_custom_rsmtool(self):

        # custom and general sections, custom order and subgroups
        general_sections = ['data_description', 'pca', 'data_description_by_group']
        custom_sections = ['/test_path/custom.ipynb']
        special_sections = ['placeholder_special_section']
        subgroups = ['prompt']
        section_order = ['custom',
                         'data_description',
                         'pca',
                         'data_description_by_group',
                         'placeholder_special_section']
        special_notebook_path = notebook_path_dict['special']['rsmtool']
        notebook_files = self.reporter.get_ordered_notebook_files(general_sections,
                                                                  custom_sections=custom_sections,
                                                                  special_sections=special_sections,
                                                                  section_order=section_order,
                                                                  subgroups=subgroups,
                                                                  model_type='skll',
                                                                  context='rsmtool')

        expected_notebook_files = ([join(notebook_path, 'header.ipynb')] +
                                   ['/test_path/custom.ipynb'] +
                                   [join(notebook_path, s) + '.ipynb' for s in ['data_description',
                                                                                'pca',
                                                                                'data_description_by_group']] +
                                   [join(special_notebook_path, 'placeholder_special_section.ipynb')] +
                                   [join(notebook_path, 'footer.ipynb')])
        eq_(notebook_files, expected_notebook_files)

    def test_get_ordered_notebook_files_default_rsmeval(self):
        general_sections = ['all']
        notebook_files = self.reporter.get_ordered_notebook_files(general_sections,
                                                                  context='rsmeval')
        no_subgroup_list = [s for s in general_section_list_rsmeval
                            if not s.endswith('by_group')]
        section_list = ['header'] + no_subgroup_list + ['footer']

        general_section_plus_extension = ['{}.ipynb'.format(s) for s in section_list]
        expected_notebook_files = [join(notebook_path_dict['general']['rsmeval'], s)
                                   for s in
                                   general_section_plus_extension]
        eq_(notebook_files, expected_notebook_files)

    def test_get_ordered_notebook_files_custom_rsmeval(self):

        # custom and general sections, custom order and subgroups

        general_sections = ['evaluation', 'consistency', 'evaluation_by_group']
        custom_sections = ['/test_path/custom.ipynb']
        subgroups = ['prompt']
        section_order = ['evaluation',
                         'consistency',
                         'custom',
                         'evaluation_by_group']
        notebook_path = notebook_path_dict['general']['rsmeval']
        notebook_files = self.reporter.get_ordered_notebook_files(general_sections,
                                                                  custom_sections=custom_sections,
                                                                  section_order=section_order,
                                                                  subgroups=subgroups,
                                                                  context='rsmeval')

        expected_notebook_files = ([join(notebook_path, 'header.ipynb')] +
                                   [join(notebook_path, s) + '.ipynb' for s in ['evaluation',
                                                                                'consistency']] +
                                   ['/test_path/custom.ipynb'] +
                                   [join(notebook_path, 'evaluation_by_group.ipynb')] +
                                   [join(notebook_path, 'footer.ipynb')])
        eq_(notebook_files, expected_notebook_files)

    def test_get_ordered_notebook_files_default_rsmcompare(self):
        general_sections = ['all']
        comparison_notebook_path = notebook_path_dict['general']['rsmcompare']
        notebook_files = self.reporter.get_ordered_notebook_files(general_sections,
                                                                  context='rsmcompare')
        no_subgroup_list = [s for s in general_section_list_rsmcompare
                            if not s.endswith('by_group')]
        section_list = ['header'] + no_subgroup_list + ['footer']

        general_section_plus_extension = [s + '.ipynb' for s in section_list]
        expected_notebook_files = [join(comparison_notebook_path, s)
                                   for s in general_section_plus_extension]
        eq_(notebook_files, expected_notebook_files)

    def test_get_ordered_notebook_files_custom_rsmcompare(self):
        # custom and general sections, custom order and subgroups
        general_sections = ['feature_descriptives',
                            'score_distributions',
                            'features_by_group']
        custom_sections = ['/test_path/custom.ipynb']
        subgroups = ['prompt']
        section_order = ['feature_descriptives',
                         'score_distributions',
                         'custom',
                         'features_by_group']
        comparison_notebook_path = notebook_path_dict['general']['rsmcompare']
        notebook_files = self.reporter.get_ordered_notebook_files(general_sections,
                                                                  custom_sections=custom_sections,
                                                                  section_order=section_order,
                                                                  subgroups=subgroups,
                                                                  context='rsmcompare')

        expected_notebook_files = ([join(comparison_notebook_path, 'header.ipynb')] +
                                   [join(comparison_notebook_path, s) + '.ipynb' for s in ['feature_descriptives',
                                                                                           'score_distributions']] +
                                   ['/test_path/custom.ipynb'] +
                                   [join(comparison_notebook_path, 'features_by_group.ipynb')] +
                                   [join(comparison_notebook_path, 'footer.ipynb')])
        eq_(notebook_files, expected_notebook_files)

    def test_get_ordered_notebook_files_custom_rsmsummarize(self):
        # custom and general sections, custom order and subgroups
        general_sections = ['evaluation']
        custom_sections = ['/test_path/custom.ipynb']
        subgroups = ['prompt']
        section_order = ['custom',
                         'evaluation']
        summary_notebook_path = notebook_path_dict['general']['rsmsummarize']
        notebook_files = self.reporter.get_ordered_notebook_files(general_sections,
                                                                  custom_sections=custom_sections,
                                                                  section_order=section_order,
                                                                  subgroups=subgroups,
                                                                  context='rsmsummarize')

        expected_notebook_files = ([join(summary_notebook_path, 'header.ipynb')] +
                                   ['/test_path/custom.ipynb'] +
                                   [join(summary_notebook_path, s) + '.ipynb' for s in ['evaluation']] +
                                   [join(summary_notebook_path, 'footer.ipynb')])
        eq_(notebook_files, expected_notebook_files)

    def test_get_section_file_map_rsmtool(self):
        special_sections = ['placeholder']
        custom_sections = ['/path/notebook.ipynb']
        section_file_map = self.reporter.get_section_file_map(special_sections,
                                                              custom_sections,
                                                              model_type='R')
        eq_(section_file_map['model'], join(notebook_path, 'r_model.ipynb'))
        eq_(section_file_map['notebook'], '/path/notebook.ipynb')
        eq_(section_file_map['placeholder'], normpath('special_notebook_path/placeholder.ipynb'))

    def test_get_section_file_map_rsmeval(self):
        special_sections = ['placeholder']
        custom_sections = ['/path/notebook.ipynb']
        section_file_map = self.reporter.get_section_file_map(special_sections,
                                                              custom_sections,
                                                              context='rsmeval')
        eq_(section_file_map['data_description'], join(notebook_path, 'data_description.ipynb'))
        eq_(section_file_map['notebook'], '/path/notebook.ipynb')
        eq_(section_file_map['placeholder'], normpath('special_notebook_path/placeholder.ipynb'))

    def test_get_section_file_map_rsmcompare(self):
        special_sections = ['placeholder']
        custom_sections = ['/path/notebook.ipynb']
        section_file_map = self.reporter.get_section_file_map(special_sections,
                                                              custom_sections,
                                                              context='rsmcompare')
        eq_(section_file_map['evaluation'], join(comparison_notebook_path, 'evaluation.ipynb'))
        eq_(section_file_map['notebook'], '/path/notebook.ipynb')
        eq_(section_file_map['placeholder'], normpath('special_notebook_path/placeholder.ipynb'))

    def test_get_section_file_map_rsmsummarize(self):
        special_sections = ['placeholder']
        custom_sections = ['/path/notebook.ipynb']
        section_file_map = self.reporter.get_section_file_map(special_sections,
                                                              custom_sections,
                                                              context='rsmsummarize')
        eq_(section_file_map['evaluation'], join(summary_notebook_path, 'evaluation.ipynb'))
        eq_(section_file_map['notebook'], '/path/notebook.ipynb')
        eq_(section_file_map['placeholder'], normpath('special_notebook_path/placeholder.ipynb'))
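
The class above is written against nose-style helpers (`eq_`, `ok_`, `@raises`) and generator tests. A minimal sketch of exercising one check directly, outside a test runner, assuming the module-level fixtures such as `master_section_dict` and the nose helpers are importable:

suite = TestReporter()
suite.setUp()
suite.check_section_lists('rsmeval')  # raises AssertionError if the lists overlap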
Example #14
0
def run_experiment(config_file_or_obj,
                   output_dir):
    """
    Run an RSMTool experiment using the given configuration
    file and generate all outputs in the given directory.

    Parameters
    ----------
    config_file_or_obj : str or Configuration
        Path to the experiment configuration file.
        Users can also pass a `Configuration` object that is in memory.
    output_dir : str
        Path to the experiment output directory.

    Raises
    ------
    ValueError
        If any of the required fields are missing or ill-specified.
    """

    logger = logging.getLogger(__name__)

    # create the 'output', 'figure', and 'report' sub-directories
    # where all the experiment output such as the CSV files
    # and the box plots will be saved

    # Get absolute paths to output directories
    csvdir = abspath(join(output_dir, 'output'))
    figdir = abspath(join(output_dir, 'figure'))
    reportdir = abspath(join(output_dir, 'report'))
    featuredir = abspath(join(output_dir, 'feature'))

    # Make directories, if necessary
    makedirs(csvdir, exist_ok=True)
    makedirs(figdir, exist_ok=True)
    makedirs(reportdir, exist_ok=True)

    # Allow users to pass a Configuration object to the
    # `config_file_or_obj` argument, rather than read from file
    if not isinstance(config_file_or_obj, Configuration):

        # Instantiate configuration parser object
        parser = ConfigurationParser.get_configparser(config_file_or_obj)
        configuration = parser.read_normalize_validate_and_process_config(config_file_or_obj)

        # get the directory where the configuration file lives
        configpath = dirname(config_file_or_obj)

    else:

        configuration = config_file_or_obj
        if configuration.filepath is not None:
            configpath = dirname(configuration.filepath)
        else:
            configpath = getcwd()

    logger.info('Saving configuration file.')
    configuration.save(output_dir)

    # Get output format
    file_format = configuration.get('file_format', 'csv')

    # Get DataWriter object
    writer = DataWriter(configuration['experiment_id'])

    # Get the paths and names for the DataReader

    (file_names,
     file_paths_org) = configuration.get_names_and_paths(['train_file', 'test_file',
                                                          'features',
                                                          'feature_subset_file'],
                                                         ['train', 'test',
                                                          'feature_specs',
                                                          'feature_subset_specs'])

    file_paths = DataReader.locate_files(file_paths_org, configpath)

    # if there are any missing files after trying to locate
    # all expected files, raise an error
    if None in file_paths:
        missing_file_paths = [file_paths_org[idx] for idx, path in enumerate(file_paths)
                              if path is None]
        raise FileNotFoundError('The following files were not found: '
                                '{}'.format(repr(missing_file_paths)))

    # Use the default converter for both train and test
    converters = {'train': configuration.get_default_converter(),
                  'test': configuration.get_default_converter()}

    logger.info('Reading in all data from files.')

    # Initialize the reader
    reader = DataReader(file_paths, file_names, converters)
    data_container = reader.read()

    logger.info('Preprocessing all features.')

    # Initialize the processor
    processor = FeaturePreprocessor()

    (processed_config,
     processed_container) = processor.process_data(configuration,
                                                   data_container)

    # Rename certain frames with more descriptive names
    # for writing out experiment files
    rename_dict = {'train_excluded': 'train_excluded_responses',
                   'test_excluded': 'test_excluded_responses',
                   'train_length': 'train_response_lengths',
                   'train_flagged': 'train_responses_with_excluded_flags',
                   'test_flagged': 'test_responses_with_excluded_flags'}

    logger.info('Saving training and test set data to disk.')

    # Write out files
    writer.write_experiment_output(csvdir,
                                   processed_container,
                                   ['train_features',
                                    'test_features',
                                    'train_metadata',
                                    'test_metadata',
                                    'train_other_columns',
                                    'test_other_columns',
                                    'train_preprocessed_features',
                                    'test_preprocessed_features',
                                    'train_excluded',
                                    'test_excluded',
                                    'train_length',
                                    'test_human_scores',
                                    'train_flagged',
                                    'test_flagged'],
                                   rename_dict,
                                   file_format=file_format)

    # Initialize the analyzer
    analyzer = Analyzer()

    (analyzed_config,
     analyzed_container) = analyzer.run_data_composition_analyses_for_rsmtool(processed_container,
                                                                              processed_config)

    # Write out files
    writer.write_experiment_output(csvdir,
                                   analyzed_container,
                                   file_format=file_format)

    logger.info('Training {} model.'.format(processed_config['model_name']))

    # Initialize modeler
    modeler = Modeler()

    modeler.train(processed_config,
                  processed_container,
                  csvdir,
                  figdir,
                  file_format)

    # Identify the features used by the model
    selected_features = modeler.get_feature_names()

    # Add selected features to processed configuration
    processed_config['selected_features'] = selected_features

    # Write out files
    writer.write_feature_csv(featuredir,
                             processed_container,
                             selected_features,
                             file_format=file_format)

    features_data_container = processed_container.copy()

    # Get selected feature info, and write out to file
    df_feature_info = features_data_container.feature_info.copy()
    df_selected_feature_info = df_feature_info[df_feature_info['feature'].isin(selected_features)]
    selected_feature_dataset_dict = {'name': 'selected_feature_info',
                                     'frame': df_selected_feature_info}

    features_data_container.add_dataset(selected_feature_dataset_dict,
                                        update=True)

    writer.write_experiment_output(csvdir,
                                   features_data_container,
                                   dataframe_names=['selected_feature_info'],
                                   new_names_dict={'selected_feature_info': 'feature'},
                                   file_format=file_format)

    logger.info('Running analyses on training set.')

    (train_analyzed_config,
     train_analyzed_container) = analyzer.run_training_analyses(processed_container,
                                                                processed_config)

    # Write out files
    writer.write_experiment_output(csvdir,
                                   train_analyzed_container,
                                   reset_index=True,
                                   file_format=file_format)

    # Use only selected features for predictions
    columns_for_prediction = ['spkitemid', 'sc1'] + selected_features
    train_for_prediction = processed_container.train_preprocessed_features[columns_for_prediction]
    test_for_prediction = processed_container.test_preprocessed_features[columns_for_prediction]

    logged_str = 'Generating training and test set predictions'
    logged_str += ' (expected scores).' if configuration['predict_expected_scores'] else '.'
    logger.info(logged_str)
    (pred_config,
     pred_data_container) = modeler.predict_train_and_test(train_for_prediction,
                                                           test_for_prediction,
                                                           processed_config)

    # Write out files
    writer.write_experiment_output(csvdir,
                                   pred_data_container,
                                   new_names_dict={'pred_test': 'pred_processed'},
                                   file_format=file_format)

    original_coef_file = join(csvdir, '{}_coefficients.{}'.format(pred_config['experiment_id'],
                                                                  file_format))

    # If the coefficients file exists, generate
    # scaled coefficients and save them to disk
    if exists(original_coef_file):
        logger.info('Scaling the coefficients and saving them to disk')
        try:

            # Scale coefficients, and return DataContainer w/ scaled coefficients
            scaled_data_container = modeler.scale_coefficients(pred_config)

            # Write out files to disk
            writer.write_experiment_output(csvdir,
                                           scaled_data_container,
                                           file_format=file_format)

        except AttributeError:
            raise ValueError("It appears you are trying to save two different "
                             "experiments to the same directory using the same "
                             "ID. Please clear the content of the directory and "
                             "rerun both experiments using different "
                             "experiment IDs.")

    # Add processed data_container frames to pred_data_container
    new_pred_data_container = pred_data_container + processed_container

    logger.info('Running prediction analyses.')
    (pred_analysis_config,
     pred_analysis_data_container) = analyzer.run_prediction_analyses(new_pred_data_container,
                                                                      pred_config)

    # Write out files
    writer.write_experiment_output(csvdir,
                                   pred_analysis_data_container,
                                   reset_index=True,
                                   file_format=file_format)
    # Initialize reporter
    reporter = Reporter()

    # generate the report
    logger.info('Starting report generation.')
    reporter.create_report(processed_config,
                           csvdir,
                           figdir)
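
A minimal usage sketch for `run_experiment`, showing both call styles the docstring allows. The import paths, file paths, and configuration fields below are illustrative assumptions, not a complete or verified setup:

from rsmtool import run_experiment
from rsmtool.configuration_parser import Configuration

# call with a configuration file on disk (hypothetical paths)
run_experiment('/path/to/lr.json', '/path/to/output_dir')

# or with an in-memory Configuration object; the fields below are
# illustrative only, not a complete configuration
config = Configuration({'experiment_id': 'lr',
                        'model': 'LinearRegression',
                        'train_file': 'train.csv',
                        'test_file': 'test.csv'})
run_experiment(config, '/path/to/output_dir')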
def run_summary(config_file_or_obj, output_dir):
    """
    Run an rsmsummarize experiment using the given configuration
    file and generate all outputs in the given directory.

    Parameters
    ----------
    config_file_or_obj : str or configuration_parser.Configuration
        Path to the experiment configuration file.
        Users can also pass a `Configuration` object that is in memory.
    output_dir : str
        Path to the experiment output directory.

    Raises
    ------
    ValueError
        If any of the required fields are missing or ill-specified.
    """
    logger = logging.getLogger(__name__)

    # create the 'output', 'figure', and 'report' sub-directories
    # where all the experiment output such as the CSV files
    # and the box plots will be saved
    csvdir = abspath(join(output_dir, 'output'))
    figdir = abspath(join(output_dir, 'figure'))
    reportdir = abspath(join(output_dir, 'report'))

    os.makedirs(csvdir, exist_ok=True)
    os.makedirs(figdir, exist_ok=True)
    os.makedirs(reportdir, exist_ok=True)

    # Allow users to pass a Configuration object to the
    # `config_file_or_obj` argument, rather than read from file
    if not isinstance(config_file_or_obj, Configuration):

        # Instantiate configuration parser object
        parser = ConfigurationParser.get_configparser(config_file_or_obj)
        configuration = parser.read_normalize_validate_and_process_config(config_file_or_obj,
                                                                          context='rsmsummarize')

        # get the directory where the configuration file lives
        configpath = dirname(config_file_or_obj)

    else:

        configuration = config_file_or_obj
        if configuration.filepath is not None:
            configpath = dirname(configuration.filepath)
        else:
            configpath = os.getcwd()

    # get the list of the experiment dirs
    experiment_dirs = configuration['experiment_dirs']

    # check the experiment dirs and assemble the list of output
    # directories and configuration JSONs
    all_experiments = []
    for experiment_dir in experiment_dirs:
        experiments = check_experiment_dir(experiment_dir, configpath)
        all_experiments.extend(experiments)

    # get the subgroups if any
    # Note: at the moment no comparisons are reported for subgroups.
    # This option is included to make it easier to add subgroup
    # comparisons in future versions.
    subgroups = configuration.get('subgroups')

    general_report_sections = configuration['general_sections']

    # get any special sections that the user might have specified
    special_report_sections = configuration['special_sections']

    # get any custom sections and locate them to make sure
    # that they exist, otherwise raise an exception
    custom_report_section_paths = configuration['custom_sections']
    if custom_report_section_paths:
        logger.info('Locating custom report sections')
        custom_report_sections = Reporter.locate_custom_sections(custom_report_section_paths,
                                                                 configpath)
    else:
        custom_report_sections = []

    section_order = configuration['section_order']

    # Initialize reporter
    reporter = Reporter()

    # check all section values and the section order, and get
    # the ordered list of notebook files
    chosen_notebook_files = reporter.get_ordered_notebook_files(general_report_sections,
                                                                special_report_sections,
                                                                custom_report_sections,
                                                                section_order,
                                                                subgroups,
                                                                model_type=None,
                                                                context='rsmsummarize')

    # add chosen notebook files to configuration
    configuration['chosen_notebook_files'] = chosen_notebook_files

    # now generate the comparison report
    logger.info('Starting report generation')
    reporter.create_summary_report(configuration,
                                   all_experiments,
                                   csvdir)
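
A minimal usage sketch for `run_summary`. The import path and file paths are assumptions, and the configuration's `experiment_dirs` field must list completed experiment output directories, as checked above via `check_experiment_dir`:

from rsmtool.rsmsummarize import run_summary  # import path may vary by version

# hypothetical paths; the JSON configuration's `experiment_dirs`
# should point at previously completed experiment output directories
run_summary('/path/to/summary.json', '/path/to/summary_output')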