def test_data_container_save_wrong_format(self):
        # (name, shape, column labels) for each random frame, in creation order
        frame_specs = [('dataset1', (100, 2), ['A', 'B']),
                       ('dataset2', (120, 3), ['A', 'B', 'C'])]

        container = DataContainer(
            [{'name': name,
              'frame': pd.DataFrame(np.random.normal(size=shape),
                                    columns=labels)}
             for name, shape, labels in frame_specs])

        target_dir = 'temp_directory_container_save_wrong_format_xyz'

        # NOTE(review): the test name suggests that ``html`` is expected to be
        # rejected by the writer; the expected exception is presumably declared
        # by a decorator that is not visible here -- confirm.
        DataWriter().write_experiment_output(target_dir,
                                             container,
                                             dataframe_names=['dataset1'],
                                             file_format='html')
Пример #2
0
    def test_data_container_save_wrong_format(self):
        # two random frames; only ``dataset1`` is handed to the writer
        first_frame = pd.DataFrame(np.random.normal(size=(100, 2)),
                                   columns=['A', 'B'])
        second_frame = pd.DataFrame(np.random.normal(size=(120, 3)),
                                    columns=['A', 'B', 'C'])
        container = DataContainer([{'name': 'dataset1', 'frame': first_frame},
                                   {'name': 'dataset2', 'frame': second_frame}])

        # NOTE(review): the test name suggests that ``html`` is expected to be
        # rejected by the writer; the expected exception is presumably declared
        # by a decorator that is not visible here -- confirm.
        html_writer = DataWriter()
        html_writer.write_experiment_output(
            'temp_directory_container_save_wrong_format_xyz',
            container,
            dataframe_names=['dataset1'],
            file_format='html')
    def test_data_container_save_files_with_id(self):
        # Write ``dataset1`` as JSON, CSV and XLSX with the experiment ID
        # ``test`` and check both the file names and the round-tripped content.

        data_sets = [{'name': 'dataset1', 'frame': pd.DataFrame(np.random.normal(size=(100, 2)),
                                                                columns=['A', 'B'])},
                     {'name': 'dataset2', 'frame': pd.DataFrame(np.random.normal(size=(120, 3)),
                                                                columns=['A', 'B', 'C'])}]

        container = DataContainer(data_sets)

        directory = 'temp_directory_save_files_with_id_xyz'
        os.makedirs(directory, exist_ok=True)

        # Everything that touches the temporary directory runs inside
        # ``try`` so that the directory is removed even when a write or a
        # read fails; otherwise a failing run would leave files behind.
        try:
            writer = DataWriter('test')
            for file_type in ['json', 'csv', 'xlsx']:

                # only the JSON copy is renamed (``dataset1`` -> ``aaa``)
                extra_kwargs = {}
                if file_type == 'json':
                    extra_kwargs['new_names_dict'] = {'dataset1': 'aaa'}

                writer.write_experiment_output(directory,
                                               container,
                                               dataframe_names=['dataset1'],
                                               file_format=file_type,
                                               **extra_kwargs)

            aaa_json = pd.read_json(os.path.join(directory, 'test_aaa.json'))
            ds_1_csv = pd.read_csv(os.path.join(directory, 'test_dataset1.csv'))
            ds_1_xls = pd.read_excel(os.path.join(directory, 'test_dataset1.xlsx'))

            output_dir = os.listdir(directory)
        finally:
            rmtree(directory)

        assert sorted(output_dir) == sorted(['test_aaa.json',
                                             'test_dataset1.csv',
                                             'test_dataset1.xlsx'])

        assert_frame_equal(container.dataset1, aaa_json)
        assert_frame_equal(container.dataset1, ds_1_csv)
        assert_frame_equal(container.dataset1, ds_1_xls)
Пример #4
0
def write_fairness_results(fit_dictionary, fairness_container, group,
                           output_dir, experiment_id, file_format):
    """
    Write the fitted fairness models and the fairness data frames to disk.

    For every entry of ``fit_dictionary`` two files are created in
    ``output_dir``: a pickle of the fitted model
    (``<experiment_id>_<model>_by_<group>.ols``) and a text file with the
    string form of its ``summary()``
    (``<experiment_id>_<model>_by_<group>_ols_summary.txt``). The frames
    in ``fairness_container`` are then written through a ``DataWriter``.

    Parameters
    ----------
    fit_dictionary: dict
        A dictionary of fitted models generated by ``get_fairness_analyses()``.
    fairness_container: container.DataContainer
        A data container with the results of fairness analysis generated by
        ``get_fairness_analyses()``.
    group: str
        The subgroup considered in this analysis.
    output_dir: str
        The directory where the results will be saved.
    experiment_id: str
        Experiment ID.
    file_format: str
        File format to use for data files.
    """
    # model files and their summaries come first
    for model_name, fitted_model in fit_dictionary.items():
        pickle_path = join(
            output_dir,
            '{}_{}_by_{}.ols'.format(experiment_id, model_name, group))
        summary_path = join(
            output_dir,
            '{}_{}_by_{}_ols_summary.txt'.format(experiment_id, model_name, group))

        # both files are opened up front, in the same order as they are written
        with open(pickle_path, 'wb') as pickle_handle, \
                open(summary_path, 'w') as summary_handle:
            pickle.dump(fitted_model, pickle_handle)
            summary_handle.write(str(fitted_model.summary()))

    # then the data frames held by the container
    DataWriter(experiment_id).write_experiment_output(output_dir,
                                                      fairness_container,
                                                      file_format=file_format,
                                                      index=True)
Пример #5
0
    def test_data_container_save_files_with_id(self):
        # Write ``dataset1`` as JSON, CSV and XLSX with the experiment ID
        # ``test`` and check both the file names and the round-tripped content.

        data_sets = [{
            'name':
            'dataset1',
            'frame':
            pd.DataFrame(np.random.normal(size=(100, 2)), columns=['A', 'B'])
        }, {
            'name':
            'dataset2',
            'frame':
            pd.DataFrame(np.random.normal(size=(120, 3)),
                         columns=['A', 'B', 'C'])
        }]

        container = DataContainer(data_sets)

        directory = 'temp_directory_save_files_with_id_xyz'
        os.makedirs(directory, exist_ok=True)

        # The temporary directory is removed in ``finally`` so that a failing
        # write or read does not leave stale files behind for later runs.
        try:
            writer = DataWriter('test')
            for file_type in ['json', 'csv', 'xlsx']:

                if file_type != 'json':

                    writer.write_experiment_output(directory,
                                                   container,
                                                   dataframe_names=['dataset1'],
                                                   file_format=file_type)
                else:
                    # only the JSON copy is renamed (``dataset1`` -> ``aaa``)
                    writer.write_experiment_output(
                        directory,
                        container,
                        new_names_dict={'dataset1': 'aaa'},
                        dataframe_names=['dataset1'],
                        file_format=file_type)

            aaa_json = pd.read_json(os.path.join(directory, 'test_aaa.json'))
            ds_1_csv = pd.read_csv(os.path.join(directory, 'test_dataset1.csv'))
            ds_1_xls = pd.read_excel(
                os.path.join(directory, 'test_dataset1.xlsx'))

            output_dir = os.listdir(directory)
        finally:
            rmtree(directory)

        assert sorted(output_dir) == sorted(
            ['test_aaa.json', 'test_dataset1.csv', 'test_dataset1.xlsx'])

        assert_frame_equal(container.dataset1, aaa_json)
        assert_frame_equal(container.dataset1, ds_1_csv)
        assert_frame_equal(container.dataset1, ds_1_xls)
Пример #6
0
def run_experiment(config_file_or_obj, output_dir):
    """
    Run RSMTool experiment using the given configuration
    file and generate all outputs in the given directory.

    The function reads and pre-processes the training and test data,
    trains the model, generates predictions for both sets, runs the
    analyses and finally creates the report. Intermediate results are
    written with a ``DataWriter`` into the ``output`` sub-directory of
    ``output_dir``; the ``figure`` and ``report`` sub-directories are
    created here as well.

    Parameters
    ----------
    config_file_or_obj : str or Configuration
        Path to the experiment configuration file.
        Users can also pass a `Configuration` object that is in memory.
    output_dir : str
        Path to the experiment output directory.

    Raises
    ------
    FileNotFoundError
        If any of the files named in the configuration cannot be located.
    ValueError
        If any of the required fields are missing or ill-specified.
    ValueError
        If scaling the coefficients fails with an ``AttributeError``,
        which is reported as two experiments sharing the same directory
        and experiment ID.
    """

    logger = logging.getLogger(__name__)

    # create the 'output' and the 'figure' sub-directories
    # where all the experiment output such as the CSV files
    # and the box plots will be saved

    # Get absolute paths to output directories
    csvdir = abspath(join(output_dir, 'output'))
    figdir = abspath(join(output_dir, 'figure'))
    reportdir = abspath(join(output_dir, 'report'))
    featuredir = abspath(join(output_dir, 'feature'))

    # Make directories, if necessary
    # NOTE(review): `featuredir` is not created here; presumably
    # `write_feature_csv()` takes care of that -- confirm.
    makedirs(csvdir, exist_ok=True)
    makedirs(figdir, exist_ok=True)
    makedirs(reportdir, exist_ok=True)

    # Allow users to pass Configuration object to the
    # `config_file_or_obj` argument, rather than read from file
    if not isinstance(config_file_or_obj, Configuration):

        # Instantiate configuration parser object
        parser = ConfigurationParser.get_configparser(config_file_or_obj)
        configuration = parser.read_normalize_validate_and_process_config(
            config_file_or_obj)

        # get the directory where the configuration file lives
        configpath = dirname(config_file_or_obj)

    else:

        # an in-memory configuration: use the directory of its source file
        # if it has one, and the current working directory otherwise
        configuration = config_file_or_obj
        if configuration.filepath is not None:
            configpath = dirname(configuration.filepath)
        else:
            configpath = getcwd()

    logger.info('Saving configuration file.')
    configuration.save(output_dir)

    # Get output format (defaults to CSV when not set in the configuration)
    file_format = configuration.get('file_format', 'csv')

    # Get DataWriter object
    writer = DataWriter(configuration['experiment_id'])

    # Get the paths and names for the DataReader

    (file_names, file_paths_org) = configuration.get_names_and_paths(
        ['train_file', 'test_file', 'features', 'feature_subset_file'],
        ['train', 'test', 'feature_specs', 'feature_subset_specs'])

    # `configpath` is passed along as the location of the configuration
    file_paths = DataReader.locate_files(file_paths_org, configpath)

    # if there are any missing files after trying to locate
    # all expected files, raise an error
    if None in file_paths:
        missing_file_paths = [
            file_paths_org[idx] for idx, path in enumerate(file_paths)
            if path is None
        ]
        raise FileNotFoundError('The following files were not found: '
                                '{}'.format(repr(missing_file_paths)))

    # Use the default converter for both train and test
    converters = {
        'train': configuration.get_default_converter(),
        'test': configuration.get_default_converter()
    }

    logger.info('Reading in all data from files.')

    # Initialize the reader
    reader = DataReader(file_paths, file_names, converters)
    data_container = reader.read()

    logger.info('Preprocessing all features.')

    # Initialize the processor
    processor = FeaturePreprocessor()

    (processed_config,
     processed_container) = processor.process_data(configuration,
                                                   data_container)

    # Rename certain frames with more descriptive names
    # for writing out experiment files
    rename_dict = {
        'train_excluded': 'train_excluded_responses',
        'test_excluded': 'test_excluded_responses',
        'train_length': 'train_response_lengths',
        'train_flagged': 'train_responses_with_excluded_flags',
        'test_flagged': 'test_responses_with_excluded_flags'
    }

    logger.info('Saving training and test set data to disk.')

    # Write out the listed pre-processed frames, applying `rename_dict`
    writer.write_experiment_output(
        csvdir,
        processed_container, [
            'train_features', 'test_features', 'train_metadata',
            'test_metadata', 'train_other_columns', 'test_other_columns',
            'train_preprocessed_features', 'test_preprocessed_features',
            'train_excluded', 'test_excluded', 'train_length',
            'test_human_scores', 'train_flagged', 'test_flagged'
        ],
        rename_dict,
        file_format=file_format)

    # Initialize the analyzer
    analyzer = Analyzer()

    (analyzed_config,
     analyzed_container) = analyzer.run_data_composition_analyses_for_rsmtool(
         processed_container, processed_config)

    # Write out the data composition analyses
    writer.write_experiment_output(csvdir,
                                   analyzed_container,
                                   file_format=file_format)

    logger.info('Training {} model.'.format(processed_config['model_name']))

    # Initialize modeler
    modeler = Modeler()

    modeler.train(processed_config, processed_container, csvdir, figdir,
                  file_format)

    # Identify the features used by the model
    selected_features = modeler.get_feature_names()

    # Add selected features to processed configuration
    processed_config['selected_features'] = selected_features

    # Write out the feature file for the selected features
    writer.write_feature_csv(featuredir,
                             processed_container,
                             selected_features,
                             file_format=file_format)

    # work on a copy so that the dataset added below does not
    # end up in `processed_container`
    features_data_container = processed_container.copy()

    # Get selected feature info, and write out to file
    df_feature_info = features_data_container.feature_info.copy()
    df_selected_feature_info = df_feature_info[df_feature_info['feature'].isin(
        selected_features)]
    selected_feature_dataset_dict = {
        'name': 'selected_feature_info',
        'frame': df_selected_feature_info
    }

    features_data_container.add_dataset(selected_feature_dataset_dict,
                                        update=True)

    # the selected feature info is saved under the name 'feature'
    writer.write_experiment_output(
        csvdir,
        features_data_container,
        dataframe_names=['selected_feature_info'],
        new_names_dict={'selected_feature_info': 'feature'},
        file_format=file_format)

    logger.info('Running analyses on training set.')

    (train_analyzed_config,
     train_analyzed_container) = analyzer.run_training_analyses(
         processed_container, processed_config)

    # Write out the training set analyses
    writer.write_experiment_output(csvdir,
                                   train_analyzed_container,
                                   reset_index=True,
                                   file_format=file_format)

    # Use only selected features for predictions
    columns_for_prediction = ['spkitemid', 'sc1'] + selected_features
    train_for_prediction = processed_container.train_preprocessed_features[
        columns_for_prediction]
    test_for_prediction = processed_container.test_preprocessed_features[
        columns_for_prediction]

    # the log message mentions expected scores only if they were requested
    logged_str = 'Generating training and test set predictions'
    logged_str += ' (expected scores).' if configuration[
        'predict_expected_scores'] else '.'
    logger.info(logged_str)
    (pred_config, pred_data_container) = modeler.predict_train_and_test(
        train_for_prediction, test_for_prediction, processed_config)

    # Write out the predictions; 'pred_test' is saved as 'pred_processed'
    writer.write_experiment_output(
        csvdir,
        pred_data_container,
        new_names_dict={'pred_test': 'pred_processed'},
        file_format=file_format)

    # NOTE(review): this file is not written in this function; presumably
    # it is produced by `modeler.train()` for some models -- confirm.
    original_coef_file = join(
        csvdir, '{}_coefficients.{}'.format(pred_config['experiment_id'],
                                            file_format))

    # If coefficients file exists, then generate
    # scaled coefficients and save to file
    if exists(original_coef_file):
        logger.info('Scaling the coefficients and saving them to disk')
        try:

            # Scale coefficients, and return DataContainer w/ scaled coefficients
            scaled_data_container = modeler.scale_coefficients(pred_config)

            # Write out files to disk
            writer.write_experiment_output(csvdir,
                                           scaled_data_container,
                                           file_format=file_format)

        # an `AttributeError` here is reported to the user as a clash
        # between two experiments sharing a directory and an ID
        except AttributeError:
            raise ValueError(
                "It appears you are trying to save two different "
                "experiments to the same directory using the same "
                "ID. Please clear the content of the directory and "
                "rerun both experiments using different "
                "experiment IDs.")

    # Add processed data_container frames to pred_data_container
    new_pred_data_container = pred_data_container + processed_container

    logger.info('Running prediction analyses.')
    (pred_analysis_config,
     pred_analysis_data_container) = analyzer.run_prediction_analyses(
         new_pred_data_container, pred_config)

    # Write out the prediction analyses
    writer.write_experiment_output(csvdir,
                                   pred_analysis_data_container,
                                   reset_index=True,
                                   file_format=file_format)
    # Initialize reporter
    reporter = Reporter()

    # generate the report
    logger.info('Starting report generation.')
    reporter.create_report(processed_config, csvdir, figdir)
def compute_and_save_predictions(config_file_or_obj, output_file, feats_file=None):
    """
    Run ``rsmpredict`` with given configuration file and generate
    predictions (and, optionally, pre-processed feature values).

    Parameters
    ----------
    config_file_or_obj : str or configuration_parser.Configuration
        Path to the experiment configuration file.
        Users can also pass a `Configuration` object that is in memory.
    output_file : str
        Where to save the predictions. If the value ends in ``.csv`` or
        ``.xlsx`` (case-insensitive), its directory and its base name
        (without the extension) are used. Otherwise the value is treated
        as a directory and the predictions are saved under the name
        ``predictions_with_metadata``. In both cases the writer is given
        the ``file_format`` from the configuration.
    feats_file : str, optional
        Path to the output file for saving preprocessed feature values.
        Only its directory and its base name (without the extension) are
        used. Nothing is saved if this is ``None``.

    Raises
    ------
    FileNotFoundError
        If the input features file, the experiment directory, its
        ``output`` sub-directory, the model for the experiment ID, or any
        of the other required experiment files cannot be found.
    ValueError
        If any of the required fields are missing or ill-specified.
    """

    logger = logging.getLogger(__name__)

    # Allow users to pass Configuration object to the
    # `config_file_or_obj` argument, rather than read file
    if not isinstance(config_file_or_obj, Configuration):

        # Instantiate configuration parser object
        parser = ConfigurationParser.get_configparser(config_file_or_obj)
        config = parser.read_normalize_validate_and_process_config(config_file_or_obj,
                                                                   context='rsmpredict')

        # get the directory where the config file lives
        configpath = dirname(config_file_or_obj)

    else:

        # an in-memory configuration: use the directory of its source file
        # if it has one, and the current working directory otherwise
        config = config_file_or_obj
        if config.filepath is not None:
            configpath = dirname(config.filepath)
        else:
            configpath = os.getcwd()

    # get the experiment ID
    experiment_id = config['experiment_id']

    # Get output format (defaults to CSV when not set in the configuration)
    file_format = config.get('file_format', 'csv')

    # Get DataWriter object
    writer = DataWriter(experiment_id)

    # get the input file containing the feature values
    # for which we want to generate the predictions
    input_features_file = DataReader.locate_files(config['input_features_file'], configpath)
    if not input_features_file:
        raise FileNotFoundError('Input file {} does not exist'
                                ''.format(config['input_features_file']))

    experiment_dir = DataReader.locate_files(config['experiment_dir'], configpath)
    if not experiment_dir:
        raise FileNotFoundError('The directory {} does not exist.'
                                ''.format(config['experiment_dir']))

    experiment_output_dir = normpath(join(experiment_dir, 'output'))
    if not exists(experiment_output_dir):
        raise FileNotFoundError('The directory {} does not contain '
                                'the output of an rsmtool experiment.'.format(experiment_dir))

    # find all the .model files in the experiment output directory
    model_files = glob.glob(join(experiment_output_dir, '*.model'))
    if not model_files:
        raise FileNotFoundError('The directory {} does not contain any rsmtool models.'
                                ''.format(experiment_output_dir))

    # the model file names (without extension) are the known experiment IDs
    experiment_ids = [splitext(basename(mf))[0] for mf in model_files]
    if experiment_id not in experiment_ids:
        raise FileNotFoundError('{} does not contain a model for the experiment "{}". '
                                'The following experiments are contained in this '
                                'directory: {}'.format(experiment_output_dir,
                                                       experiment_id,
                                                       experiment_ids))

    # check that the directory contains other required files
    # NOTE(review): these are looked up with a `.csv` extension regardless
    # of `file_format` -- confirm that this is intended.
    required_file_types = ['feature', 'postprocessing_params']
    for file_type in required_file_types:
        expected_file_name = "{}_{}.csv".format(experiment_id, file_type)
        if not exists(join(experiment_output_dir, expected_file_name)):
            raise FileNotFoundError('{} does not contain the required file '
                                    '{} that was generated during the '
                                    'original model training'.format(experiment_output_dir,
                                                                     expected_file_name))

    logger.info('Reading input files.')

    feature_info = join(experiment_output_dir,
                        '{}_feature.csv'.format(experiment_id))

    post_processing = join(experiment_output_dir,
                           '{}_postprocessing_params.csv'.format(experiment_id))

    file_paths = [input_features_file, feature_info, post_processing]
    file_names = ['input_features',
                  'feature_info',
                  'postprocessing_params']

    converters = {'input_features': config.get_default_converter()}

    # Initialize the reader
    reader = DataReader(file_paths, file_names, converters)
    data_container = reader.read(kwargs_dict={'feature_info': {'index_col': 0}})

    # load the Modeler to generate the predictions
    model = Modeler.load_from_file(join(experiment_output_dir,
                                        '{}.model'.format(experiment_id)))

    # Add the model to the configuration object
    config['model'] = model

    # Initialize the processor
    processor = FeaturePreprocessor()

    (processed_config,
     processed_container) = processor.process_data(config,
                                                   data_container,
                                                   context='rsmpredict')

    # save the pre-processed features to disk if we were asked to
    if feats_file is not None:
        logger.info('Saving pre-processed feature values to {}'.format(feats_file))

        # `dirname()` is empty for a bare file name and `os.makedirs('')`
        # raises `FileNotFoundError`, so fall back to the current directory
        feats_dir = dirname(feats_file) or os.curdir

        # create any directories needed for the output file
        os.makedirs(feats_dir, exist_ok=True)

        # the writer receives the name without its extension
        feats_filename = splitext(basename(feats_file))[0]

        # Write out files
        writer.write_experiment_output(feats_dir,
                                       processed_container,
                                       include_experiment_id=False,
                                       dataframe_names=['features_processed'],
                                       new_names_dict={'features_processed':
                                                       feats_filename},
                                       file_format=file_format)

    if output_file.lower().endswith(('.csv', '.xlsx')):

        # `output_file` names a file: split it into directory and base name,
        # again falling back to the current directory for a bare file name
        output_dir = dirname(output_file) or os.curdir
        filename = splitext(basename(output_file))[0]

    else:
        # otherwise `output_file` is treated as a directory
        output_dir = output_file
        filename = 'predictions_with_metadata'

    # create any directories needed for the output file
    os.makedirs(output_dir, exist_ok=True)

    # save the predictions to disk
    logger.info('Saving predictions.')

    # Write out files
    writer.write_experiment_output(output_dir,
                                   processed_container,
                                   include_experiment_id=False,
                                   dataframe_names=['predictions_with_metadata'],
                                   new_names_dict={'predictions_with_metadata':
                                                   filename},
                                   file_format=file_format)

    # save excluded responses to disk
    if not processed_container.excluded.empty:

        excluded_name = '{}_excluded_responses'.format(filename)

        # report the file with the format that is actually passed to the
        # writer rather than a hard-coded `.csv`
        logger.info('Saving excluded responses to {}'.format(join(output_dir,
                                                                  '{}.{}'
                                                                  ''.format(excluded_name,
                                                                            file_format))))

        # Write out files
        writer.write_experiment_output(output_dir,
                                       processed_container,
                                       include_experiment_id=False,
                                       dataframe_names=['excluded'],
                                       new_names_dict={'excluded': excluded_name},
                                       file_format=file_format)
Пример #8
0
def run_evaluation(config_file_or_obj, output_dir):
    """
    Run an `rsmeval` experiment using the given configuration
    file and generate all outputs in the given directory.

    The predictions are read and pre-processed, the data composition
    and prediction analyses are run, and the report is created.
    Intermediate results are written with a ``DataWriter`` into the
    ``output`` sub-directory of ``output_dir``; the ``figure`` and
    ``report`` sub-directories are created here as well.

    Parameters
    ----------
    config_file_or_obj : str or configuration_parser.Configuration
        Path to the experiment configuration file.
        Users can also pass a `Configuration` object that is in memory.
    output_dir : str
        Path to the experiment output directory.

    Raises
    ------
    FileNotFoundError
        If the predictions file, or the scaling file named by the
        ``scale_with`` field, cannot be located.
    ValueError
        If any of the required fields are missing or ill-specified.
    """
    logger = logging.getLogger(__name__)

    # create the 'output' and the 'figure' sub-directories
    # where all the experiment output such as the CSV files
    # and the box plots will be saved
    csvdir = abspath(join(output_dir, 'output'))
    figdir = abspath(join(output_dir, 'figure'))
    reportdir = abspath(join(output_dir, 'report'))
    os.makedirs(csvdir, exist_ok=True)
    os.makedirs(figdir, exist_ok=True)
    os.makedirs(reportdir, exist_ok=True)

    # Allow users to pass Configuration object to the
    # `config_file_or_obj` argument, rather than read file
    if not isinstance(config_file_or_obj, Configuration):

        # Instantiate configuration parser object
        parser = ConfigurationParser.get_configparser(config_file_or_obj)
        configuration = parser.read_normalize_validate_and_process_config(
            config_file_or_obj, context='rsmeval')

        # get the directory where the configuration file lives
        configpath = dirname(config_file_or_obj)

    else:

        # an in-memory configuration: use the directory of its source file
        # if it has one, and the current working directory otherwise
        configuration = config_file_or_obj
        if configuration.filepath is not None:
            configpath = dirname(configuration.filepath)
        else:
            configpath = os.getcwd()

    logger.info('Saving configuration file.')
    configuration.save(output_dir)

    # Get output format (defaults to CSV when not set in the configuration)
    file_format = configuration.get('file_format', 'csv')

    # Get DataWriter object
    writer = DataWriter(configuration['experiment_id'])

    # Make sure prediction file can be located
    if not DataReader.locate_files(configuration['predictions_file'],
                                   configpath):
        raise FileNotFoundError('Error: Predictions file {} '
                                'not found.\n'.format(
                                    configuration['predictions_file']))

    scale_with = configuration.get('scale_with')

    # scale_with can be one of the following:
    # (a) None       : the predictions are assumed to be 'raw' and should be used as is
    #                  when computing the metrics; the names for the final columns are
    #                  'raw', 'raw_trim' and 'raw_trim_round'.
    # (b) 'asis'     : the predictions are assumed to be pre-scaled and should be used as is
    #                  when computing the metrics; the names for the final columns are
    #                  'scale', 'scale_trim' and 'scale_trim_round'.
    # (c) a CSV file : the predictions are assumed to be 'raw' and should be scaled
    #                  before computing the metrics; the names for the final columns are
    #                  'scale', 'scale_trim' and 'scale_trim_round'.

    # Check whether we want to do scaling
    do_scaling = (scale_with is not None and scale_with != 'asis')

    # The paths to files and names for data container properties
    paths = ['predictions_file']
    names = ['predictions']

    # If we want to do scaling, get the scale file
    if do_scaling:

        # Make sure scale file can be located
        scale_file_location = DataReader.locate_files(scale_with, configpath)
        if not scale_file_location:
            # report the file that was asked for: `scale_file_location`
            # is always empty in this branch and would hide its name
            raise FileNotFoundError('Could not find scaling file {}.'
                                    ''.format(scale_with))

        paths.append('scale_with')
        names.append('scale')

    # Get the paths, names, and converters for the DataReader
    (file_names, file_paths) = configuration.get_names_and_paths(paths, names)

    file_paths = DataReader.locate_files(file_paths, configpath)

    converters = {'predictions': configuration.get_default_converter()}

    logger.info('Reading predictions: {}.'.format(
        configuration['predictions_file']))

    # Initialize the reader
    reader = DataReader(file_paths, file_names, converters)
    data_container = reader.read()

    logger.info('Preprocessing predictions.')

    # Initialize the processor
    processor = FeaturePreprocessor()

    (processed_config,
     processed_container) = processor.process_data(configuration,
                                                   data_container,
                                                   context='rsmeval')

    logger.info('Saving pre-processed predictions and metadata to disk.')
    writer.write_experiment_output(csvdir,
                                   processed_container,
                                   new_names_dict={
                                       'pred_test': 'pred_processed',
                                       'test_excluded':
                                       'test_excluded_responses'
                                   },
                                   file_format=file_format)

    # Initialize the analyzer
    analyzer = Analyzer()

    # do the data composition stats
    (analyzed_config,
     analyzed_container) = analyzer.run_data_composition_analyses_for_rsmeval(
         processed_container, processed_config)
    # Write out the data composition analyses
    writer.write_experiment_output(csvdir,
                                   analyzed_container,
                                   file_format=file_format)

    # the prediction analyses get the analysis frames together
    # with the pre-processed frames
    for_pred_data_container = analyzed_container + processed_container

    # run the analyses on the predictions of the model
    logger.info('Running analyses on predictions.')
    (pred_analysis_config,
     pred_analysis_data_container) = analyzer.run_prediction_analyses(
         for_pred_data_container, analyzed_config)

    writer.write_experiment_output(csvdir,
                                   pred_analysis_data_container,
                                   reset_index=True,
                                   file_format=file_format)

    # Initialize reporter
    reporter = Reporter()

    # generate the report
    logger.info('Starting report generation.')
    reporter.create_report(processed_config, csvdir, figdir, context='rsmeval')
Пример #9
0
def run_experiment(config_file_or_obj,
                   output_dir):
    """
    Run RSMTool experiment using the given configuration
    file and generate all outputs in the given directory.

    The function reads and pre-processes the training and test data,
    trains the model, generates predictions for both sets, runs the
    data composition, training and prediction analyses and, finally,
    generates the report. Intermediate results are written to disk
    after every step using the file format given in the configuration
    (``csv`` if not specified).

    Parameters
    ----------
    config_file_or_obj : str or Configuration
        Path to the experiment configuration file.
        Users can also pass a `Configuration` object that is in memory.
    output_dir : str
        Path to the experiment output directory. The ``output``,
        ``figure`` and ``report`` sub-directories are created under it
        if they do not exist yet.

    Raises
    ------
    FileNotFoundError
        If any of the files specified in the configuration
        cannot be located.
    ValueError
        If any of the required fields are missing or ill-specified,
        or if scaling the coefficients fails with an ``AttributeError``
        (treated as a sign that two different experiments were saved
        to the same directory under the same ID).
    """

    logger = logging.getLogger(__name__)

    # create the 'output' and the 'figure' sub-directories
    # where all the experiment output such as the CSV files
    # and the box plots will be saved

    # Get absolute paths to output directories
    csvdir = abspath(join(output_dir, 'output'))
    figdir = abspath(join(output_dir, 'figure'))
    reportdir = abspath(join(output_dir, 'report'))
    featuredir = abspath(join(output_dir, 'feature'))

    # Make directories, if necessary; `featuredir` is not created
    # here, it is only handed to `writer.write_feature_csv()` below
    for directory in (csvdir, figdir, reportdir):
        makedirs(directory, exist_ok=True)

    # Allow users to pass Configuration object to the
    # `config_file_or_obj` argument, rather than read from file
    if not isinstance(config_file_or_obj, Configuration):

        # Instantiate configuration parser object
        parser = ConfigurationParser.get_configparser(config_file_or_obj)
        configuration = parser.read_normalize_validate_and_process_config(config_file_or_obj)

        # get the directory where the configuration file lives
        configpath = dirname(config_file_or_obj)

    else:

        configuration = config_file_or_obj

        # an in-memory configuration may not be backed by a file; in
        # that case input files are located relative to the current
        # working directory
        if configuration.filepath is not None:
            configpath = dirname(configuration.filepath)
        else:
            configpath = getcwd()

    logger.info('Saving configuration file.')
    configuration.save(output_dir)

    # Get output format
    file_format = configuration.get('file_format', 'csv')

    # Get DataWriter object
    writer = DataWriter(configuration['experiment_id'])

    # Get the paths and names for the DataReader
    (file_names,
     file_paths_org) = configuration.get_names_and_paths(['train_file', 'test_file',
                                                          'features',
                                                          'feature_subset_file'],
                                                         ['train', 'test',
                                                          'feature_specs',
                                                          'feature_subset_specs'])

    file_paths = DataReader.locate_files(file_paths_org, configpath)

    # if there are any missing files after trying to locate
    # all expected files, raise an error
    if None in file_paths:
        missing_file_paths = [file_paths_org[idx] for idx, path in enumerate(file_paths)
                              if path is None]
        raise FileNotFoundError('The following files were not found: '
                                '{}'.format(repr(missing_file_paths)))

    # Use the default converter for both train and test
    converters = {'train': configuration.get_default_converter(),
                  'test': configuration.get_default_converter()}

    logger.info('Reading in all data from files.')

    # Initialize the reader
    reader = DataReader(file_paths, file_names, converters)
    data_container = reader.read()

    logger.info('Preprocessing all features.')

    # Initialize the processor
    processor = FeaturePreprocessor()

    (processed_config,
     processed_container) = processor.process_data(configuration,
                                                   data_container)

    # Rename certain frames with more descriptive names
    # for writing out experiment files
    rename_dict = {'train_excluded': 'train_excluded_responses',
                   'test_excluded': 'test_excluded_responses',
                   'train_length': 'train_response_lengths',
                   'train_flagged': 'train_responses_with_excluded_flags',
                   'test_flagged': 'test_responses_with_excluded_flags'}

    logger.info('Saving training and test set data to disk.')

    # Write out files
    writer.write_experiment_output(csvdir,
                                   processed_container,
                                   ['train_features',
                                    'test_features',
                                    'train_metadata',
                                    'test_metadata',
                                    'train_other_columns',
                                    'test_other_columns',
                                    'train_preprocessed_features',
                                    'test_preprocessed_features',
                                    'train_excluded',
                                    'test_excluded',
                                    'train_length',
                                    'test_human_scores',
                                    'train_flagged',
                                    'test_flagged'],
                                   rename_dict,
                                   file_format=file_format)

    # Initialize the analyzer
    analyzer = Analyzer()

    # only the container is needed here; the returned
    # configuration is not used by any later step
    (_,
     analyzed_container) = analyzer.run_data_composition_analyses_for_rsmtool(processed_container,
                                                                              processed_config)

    # Write out files
    writer.write_experiment_output(csvdir,
                                   analyzed_container,
                                   file_format=file_format)

    logger.info('Training {} model.'.format(processed_config['model_name']))

    # Initialize modeler
    modeler = Modeler()

    modeler.train(processed_config,
                  processed_container,
                  csvdir,
                  figdir,
                  file_format)

    # Identify the features used by the model
    selected_features = modeler.get_feature_names()

    # Add selected features to processed configuration
    processed_config['selected_features'] = selected_features

    # Write out files
    writer.write_feature_csv(featuredir,
                             processed_container,
                             selected_features,
                             file_format=file_format)

    # work on a copy so that the extra `selected_feature_info`
    # frame is not added to `processed_container` itself
    features_data_container = processed_container.copy()

    # Get selected feature info, and write out to file
    df_feature_info = features_data_container.feature_info.copy()
    df_selected_feature_info = df_feature_info[df_feature_info['feature'].isin(selected_features)]
    selected_feature_dataset_dict = {'name': 'selected_feature_info',
                                     'frame': df_selected_feature_info}

    features_data_container.add_dataset(selected_feature_dataset_dict,
                                        update=True)

    writer.write_experiment_output(csvdir,
                                   features_data_container,
                                   dataframe_names=['selected_feature_info'],
                                   new_names_dict={'selected_feature_info': 'feature'},
                                   file_format=file_format)

    logger.info('Running analyses on training set.')

    (_,
     train_analyzed_container) = analyzer.run_training_analyses(processed_container,
                                                                processed_config)

    # Write out files
    writer.write_experiment_output(csvdir,
                                   train_analyzed_container,
                                   reset_index=True,
                                   file_format=file_format)

    # Use only selected features for predictions
    columns_for_prediction = ['spkitemid', 'sc1'] + selected_features
    train_for_prediction = processed_container.train_preprocessed_features[columns_for_prediction]
    test_for_prediction = processed_container.test_preprocessed_features[columns_for_prediction]

    logged_str = 'Generating training and test set predictions'
    logged_str += ' (expected scores).' if configuration['predict_expected_scores'] else '.'
    logger.info(logged_str)
    (pred_config,
     pred_data_container) = modeler.predict_train_and_test(train_for_prediction,
                                                           test_for_prediction,
                                                           processed_config)

    # Write out files
    writer.write_experiment_output(csvdir,
                                   pred_data_container,
                                   new_names_dict={'pred_test': 'pred_processed'},
                                   file_format=file_format)

    original_coef_file = join(csvdir, '{}_coefficients.{}'.format(pred_config['experiment_id'],
                                                                  file_format))

    # If coefficients file exists, then generate
    # scaled coefficients and save to file
    if exists(original_coef_file):
        logger.info('Scaling the coefficients and saving them to disk')
        try:

            # Scale coefficients, and return DataContainer w/ scaled coefficients
            scaled_data_container = modeler.scale_coefficients(pred_config)

            # Write out files to disk
            writer.write_experiment_output(csvdir,
                                           scaled_data_container,
                                           file_format=file_format)

        except AttributeError as error:
            # keep the original exception attached as the cause so
            # that the underlying failure remains visible in tracebacks
            raise ValueError("It appears you are trying to save two different "
                             "experiments to the same directory using the same "
                             "ID. Please clear the content of the directory and "
                             "rerun both experiments using different "
                             "experiment IDs.") from error

    # Add processed data_container frames to pred_data_container
    new_pred_data_container = pred_data_container + processed_container

    logger.info('Running prediction analyses.')
    (_,
     pred_analysis_data_container) = analyzer.run_prediction_analyses(new_pred_data_container,
                                                                      pred_config)

    # Write out files
    writer.write_experiment_output(csvdir,
                                   pred_analysis_data_container,
                                   reset_index=True,
                                   file_format=file_format)

    # Initialize reporter
    reporter = Reporter()

    # generate the report
    logger.info('Starting report generation.')
    reporter.create_report(processed_config,
                           csvdir,
                           figdir)
Пример #10
0
def compute_and_save_predictions(config_file_or_obj,
                                 output_file,
                                 feats_file=None):
    """
    Run ``rsmpredict`` with given configuration file and generate
    predictions (and, optionally, pre-processed feature values).

    Parameters
    ----------
    config_file_or_obj : str or configuration_parser.Configuration
        Path to the experiment configuration file.
        Users can also pass a `Configuration` object that is in memory.
    output_file : str
        Path to the output file for saving the predictions. If the
        path ends in ``.csv`` or ``.xlsx`` (case-insensitive), its
        directory and base name are used for the output. Otherwise,
        the path is treated as a directory and the predictions are
        saved there under the name ``predictions_with_metadata``.
        The ``file_format`` configuration field (``csv`` if not
        specified) is what gets passed to the writer as the format.
    feats_file : str, optional
        Path to the output file for saving preprocessed feature values.
        Nothing is saved if this is ``None``.

    Raises
    ------
    FileNotFoundError
        If the input features file, the experiment directory, its
        ``output`` sub-directory, the model for the experiment ID, or
        any of the other files required from the original experiment
        cannot be found.
    ValueError
        If any of the required fields are missing or ill-specified.
    """

    logger = logging.getLogger(__name__)

    # Allow users to pass Configuration object to the
    # `config_file_or_obj` argument, rather than read file
    if not isinstance(config_file_or_obj, Configuration):

        # Instantiate configuration parser object
        parser = ConfigurationParser.get_configparser(config_file_or_obj)
        config = parser.read_normalize_validate_and_process_config(
            config_file_or_obj, context='rsmpredict')

        # get the directory where the config file lives
        configpath = dirname(config_file_or_obj)

    else:

        config = config_file_or_obj

        # an in-memory configuration may not be backed by a file; in
        # that case input files are located relative to the current
        # working directory
        if config.filepath is not None:
            configpath = dirname(config.filepath)
        else:
            configpath = os.getcwd()

    # get the experiment ID
    experiment_id = config['experiment_id']

    # Get output format
    file_format = config.get('file_format', 'csv')

    # Get DataWriter object
    writer = DataWriter(experiment_id)

    # get the input file containing the feature values
    # for which we want to generate the predictions
    input_features_file = DataReader.locate_files(
        config['input_features_file'], configpath)
    if not input_features_file:
        raise FileNotFoundError('Input file {} does not exist'
                                ''.format(config['input_features_file']))

    experiment_dir = DataReader.locate_files(config['experiment_dir'],
                                             configpath)
    if not experiment_dir:
        raise FileNotFoundError('The directory {} does not exist.'
                                ''.format(config['experiment_dir']))

    experiment_output_dir = normpath(join(experiment_dir, 'output'))
    if not exists(experiment_output_dir):
        raise FileNotFoundError(
            'The directory {} does not contain '
            'the output of an rsmtool experiment.'.format(experiment_dir))

    # find all the .model files in the experiment output directory
    model_files = glob.glob(join(experiment_output_dir, '*.model'))
    if not model_files:
        raise FileNotFoundError(
            'The directory {} does not contain any rsmtool models.'
            ''.format(experiment_output_dir))

    # model files are named after the experiment that produced them
    experiment_ids = [splitext(basename(mf))[0] for mf in model_files]
    if experiment_id not in experiment_ids:
        raise FileNotFoundError(
            '{} does not contain a model for the experiment "{}". '
            'The following experiments are contained in this '
            'directory: {}'.format(experiment_output_dir, experiment_id,
                                   experiment_ids))

    # check that the directory contains other required files
    # NOTE(review): these are looked up as CSV files regardless of
    # `file_format` -- confirm this matches how they were written
    required_file_types = ['feature', 'postprocessing_params']
    for file_type in required_file_types:
        expected_file_name = "{}_{}.csv".format(experiment_id, file_type)
        if not exists(join(experiment_output_dir, expected_file_name)):
            raise FileNotFoundError('{} does not contain the required file '
                                    '{} that was generated during the '
                                    'original model training'.format(
                                        experiment_output_dir,
                                        expected_file_name))

    logger.info('Reading input files.')

    feature_info = join(experiment_output_dir,
                        '{}_feature.csv'.format(experiment_id))

    post_processing = join(
        experiment_output_dir,
        '{}_postprocessing_params.csv'.format(experiment_id))

    file_paths = [input_features_file, feature_info, post_processing]
    file_names = ['input_features', 'feature_info', 'postprocessing_params']

    converters = {'input_features': config.get_default_converter()}

    # Initialize the reader
    reader = DataReader(file_paths, file_names, converters)
    data_container = reader.read(
        kwargs_dict={'feature_info': {
            'index_col': 0
        }})

    # load the Modeler to generate the predictions
    model = Modeler.load_from_file(
        join(experiment_output_dir, '{}.model'.format(experiment_id)))

    # Add the model to the configuration object
    config['model'] = model

    # Initialize the processor
    processor = FeaturePreprocessor()

    # only the container is used below
    (_,
     processed_container) = processor.process_data(config,
                                                   data_container,
                                                   context='rsmpredict')

    # save the pre-processed features to disk if we were asked to
    if feats_file is not None:
        logger.info(
            'Saving pre-processed feature values to {}'.format(feats_file))

        # `dirname()` returns an empty string for a bare file name and
        # `os.makedirs('')` raises `FileNotFoundError`, so fall back
        # to the current directory in that case
        feats_dir = dirname(feats_file) or os.curdir

        # create any directories needed for the output file
        os.makedirs(feats_dir, exist_ok=True)

        feats_filename, _ = splitext(basename(feats_file))

        # Write out files
        writer.write_experiment_output(
            feats_dir,
            processed_container,
            include_experiment_id=False,
            dataframe_names=['features_processed'],
            new_names_dict={'features_processed': feats_filename},
            file_format=file_format)

    if output_file.lower().endswith(('.csv', '.xlsx')):

        # same fallback as above for a bare file name
        output_dir = dirname(output_file) or os.curdir
        filename, _ = splitext(basename(output_file))

    else:
        # anything else is treated as the output directory itself
        output_dir = output_file
        filename = 'predictions_with_metadata'

    # create any directories needed for the output file
    os.makedirs(output_dir, exist_ok=True)

    # save the predictions to disk
    logger.info('Saving predictions.')

    # Write out files
    writer.write_experiment_output(
        output_dir,
        processed_container,
        include_experiment_id=False,
        dataframe_names=['predictions_with_metadata'],
        new_names_dict={'predictions_with_metadata': filename},
        file_format=file_format)

    # save excluded responses to disk
    if not processed_container.excluded.empty:

        excluded_name = '{}_excluded_responses'.format(filename)

        # report the file using the requested output format
        # rather than a hard-coded `.csv` extension
        logger.info('Saving excluded responses to {}'.format(
            join(output_dir, '{}.{}'.format(excluded_name, file_format))))

        # Write out files
        writer.write_experiment_output(output_dir,
                                       processed_container,
                                       include_experiment_id=False,
                                       dataframe_names=['excluded'],
                                       new_names_dict={
                                           'excluded': excluded_name
                                       },
                                       file_format=file_format)