def test_run_experiment_lr_with_object():

    # basic experiment with a LinearRegression model

    source = 'lr-object'
    experiment_id = 'lr_object'

    config_file = join(rsmtool_test_dir,
                       'data',
                       'experiments',
                       source,
                       '{}.json'.format(experiment_id))

    config_dict = {"train_file": "../../files/train.csv",
                   "id_column": "ID",
                   "use_scaled_predictions": True,
                   "test_label_column": "score",
                   "train_label_column": "score",
                   "test_file": "../../files/test.csv",
                   "trim_max": 6,
                   "features": "features.csv",
                   "trim_min": 1,
                   "model": "LinearRegression",
                   "experiment_id": "lr_object",
                   "description": "Using all features with an LinearRegression model."}

    config_parser = ConfigurationParser()
    config_parser.load_config_from_dict(config_dict)
    config_obj = config_parser.normalize_validate_and_process_config()

    # attach the path of the original configuration file so that the
    # relative paths in the dictionary above are resolved correctly;
    # assigning the path directly to `config_obj` would discard the
    # Configuration object this test is meant to exercise
    config_obj.filepath = config_file

    do_run_experiment(source, experiment_id, config_obj)

    output_dir = join('test_outputs', source, 'output')
    expected_output_dir = join(rsmtool_test_dir, 'data', 'experiments', source, 'output')
    html_report = join('test_outputs', source, 'report', '{}_report.html'.format(experiment_id))

    csv_files = glob(join(output_dir, '*.csv'))
    for csv_file in csv_files:
        csv_filename = basename(csv_file)
        expected_csv_file = join(expected_output_dir, csv_filename)

        if exists(expected_csv_file):
            yield check_file_output, csv_file, expected_csv_file

    yield check_generated_output, csv_files, experiment_id, 'rsmtool'
    yield check_scaled_coefficients, source, experiment_id
    yield check_report, html_report
# expected to fail validation (assumed, following the @raises pattern
# used by the other invalid-config tests in this suite)
@raises(ValueError)
def test_validate_config_too_few_experiment_names(self):
    data = {'summary_id': 'summary',
            'experiment_dirs': ["dir1", "dir2", "dir3"],
            'experiment_names': ['exp1', 'exp2']}

    _ = ConfigurationParser.validate_config(data, context='rsmsummarize')
@raises(ValueError)
def test_validate_config_experiment_id_1(self):
    data = {'experiment_id': 'test experiment',
            'train_file': 'data/rsmtool_smTrain.csv',
            'test_file': 'data/rsmtool_smEval.csv',
            'model': 'LinearRegression'}

    _ = ConfigurationParser.validate_config(data)
@raises(ValueError)
def test_validate_config_experiment_id_2(self):
    data = {'experiment_id': 'test experiment',
            'predictions_file': 'data/foo',
            'system_score_column': 'h1',
            'trim_min': 1,
            'trim_max': 5}

    _ = ConfigurationParser.validate_config(data, context='rsmeval')
@raises(ValueError)
def test_validate_config_experiment_id_4(self):
    data = {'comparison_id': 'old vs new',
            'experiment_id_old': 'old experiment',
            'experiment_dir_old': 'data/old',
            'experiment_id_new': 'new_experiment',
            'experiment_dir_new': 'data/new'}

    _ = ConfigurationParser.validate_config(data, context='rsmcompare')
@raises(ValueError)
def test_validate_config_min_responses_but_no_candidate(self):
    data = {'experiment_id': 'experiment_1',
            'train_file': 'data/rsmtool_smTrain.csv',
            'test_file': 'data/rsmtool_smEval.csv',
            'model': 'LinearRegression',
            'min_responses_per_candidate': 5}

    _ = ConfigurationParser.validate_config(data)
@raises(ValueError)
def test_invalid_skll_objective(self):
    data = {'experiment_id': 'experiment_1',
            'train_file': 'data/rsmtool_smTrain.csv',
            'test_file': 'data/rsmtool_smEval.csv',
            'description': 'Test',
            'model': 'LinearSVR',
            'skll_objective': 'squared_error'}

    _ = ConfigurationParser.validate_config(data)
@raises(ValueError)
def test_builtin_model_for_expected_scores(self):
    data = {'experiment_id': 'experiment_1',
            'train_file': 'data/rsmtool_smTrain.csv',
            'test_file': 'data/rsmtool_smEval.csv',
            'description': 'Test',
            'model': 'NNLR',
            'predict_expected_scores': 'true'}

    _ = ConfigurationParser.validate_config(data)
@raises(ValueError)
def test_validate_config_unknown_fields(self):
    data = {'experiment_id': 'experiment_1',
            'train_file': 'data/rsmtool_smTrain.csv',
            'test_file': 'data/rsmtool_smEval.csv',
            'description': 'Test',
            'model': 'LinearRegression',
            'output': 'foobar'}

    _ = ConfigurationParser.validate_config(data)
@raises(ValueError)
def test_validate_config_experiment_id_9(self):
    data = {'summary_id': 'this_is_a_really_really_really_'
                          'really_really_really_really_really_really_really_'
                          'really_really_really_really_really_really_really_'
                          'really_really_really_really_really_really_really_'
                          'really_really_really_long_id',
            'experiment_dirs': []}

    _ = ConfigurationParser.validate_config(data, context='rsmsummarize')
# expected to fail validation (assumed, following the @raises pattern
# used by the other invalid-config tests): min_n_per_group does not
# cover all of the listed subgroups
@raises(ValueError)
def test_validate_config_too_few_subgroup_keys(self):
    data = {'experiment_id': 'experiment_1',
            'train_file': 'data/rsmtool_smTrain.csv',
            'test_file': 'data/rsmtool_smEval.csv',
            'model': 'LinearRegression',
            'subgroups': ['L1', 'L2'],
            'min_n_per_group': {"L1": 100}}

    _ = ConfigurationParser.validate_config(data)
def test_process_fields_rsmsummarize(self):
    data = {'summary_id': 'summary',
            'experiment_dirs': 'home/dir1, home/dir2, home/dir3',
            'experiment_names': 'exp1, exp2, exp3'}

    newdata = ConfigurationParser.process_config(data)

    assert_array_equal(newdata['experiment_dirs'],
                       ['home/dir1', 'home/dir2', 'home/dir3'])
    assert_array_equal(newdata['experiment_names'],
                       ['exp1', 'exp2', 'exp3'])
# expected to fail validation (assumed, following the @raises pattern
# used by the other invalid-config tests): min_n_per_group is given
# without any subgroups
@raises(ValueError)
def test_validate_config_min_n_without_subgroups(self):
    data = {'experiment_id': 'experiment_1',
            'train_file': 'data/rsmtool_smTrain.csv',
            'test_file': 'data/rsmtool_smEval.csv',
            'model': 'LinearRegression',
            'min_n_per_group': {"L1": 100,
                                "L2": 50}}

    _ = ConfigurationParser.validate_config(data)
def test_run_experiment_lr_summary_with_object():

    # basic rsmsummarize experiment comparing several rsmtool experiments

    source = 'lr-self-summary-object'

    config_file = join(rsmtool_test_dir,
                       'data',
                       'experiments',
                       source,
                       'rsmsummarize.json')

    config_dict = {"summary_id": "model_comparison",
                   "experiment_dirs": ["lr-subgroups", "lr-subgroups", "lr-subgroups"],
                   "description": "Comparison of rsmtool experiment with itself."}

    config_parser = ConfigurationParser()
    config_parser.load_config_from_dict(config_dict)
    config_obj = config_parser.normalize_validate_and_process_config(context='rsmsummarize')

    # attach the path of the original configuration file so that the
    # relative experiment directories are resolved correctly; assigning
    # the path directly to `config_obj` would discard the Configuration
    # object this test is meant to exercise
    config_obj.filepath = config_file

    do_run_summary(source, config_obj)

    html_report = join('test_outputs', source, 'report', 'model_comparison_report.html')

    output_dir = join('test_outputs', source, 'output')
    expected_output_dir = join(rsmtool_test_dir, 'data', 'experiments', source, 'output')

    csv_files = glob(join(output_dir, '*.csv'))
    for csv_file in csv_files:
        csv_filename = basename(csv_file)
        expected_csv_file = join(expected_output_dir, csv_filename)

        if exists(expected_csv_file):
            yield check_file_output, csv_file, expected_csv_file

    yield check_report, html_report
@raises(ValueError)
def test_process_fields_with_integer(self):
    data = {'experiment_id': 'experiment_1',
            'train_file': 'data/rsmtool_smTrain.csv',
            'test_file': 'data/rsmtool_smEval.csv',
            'description': 'Test',
            'model': 'empWt',
            'use_scaled_predictions': 'True',
            'feature_prefix': '1gram, 2gram',
            'subgroups': 'native language, GPA_range',
            'exclude_zero_scores': 1}

    _ = ConfigurationParser.process_config(data)
def test_validate_config_unspecified_fields(self):
    data = {'experiment_id': 'experiment_1',
            'train_file': 'data/rsmtool_smTrain.csv',
            'test_file': 'data/rsmtool_smEval.csv',
            'model': 'LinearRegression'}

    newdata = ConfigurationParser.validate_config(data)

    # unspecified fields should be filled in with their default values
    assert_equal(newdata['id_column'], 'spkitemid')
    assert_equal(newdata['use_scaled_predictions'], False)
    assert_equal(newdata['select_transformations'], False)
    assert_array_equal(newdata['general_sections'], ['all'])
    assert_equal(newdata['description'], '')
def test_validate_config_warning_feature_file_and_transformations(self):
    data = {'experiment_id': 'experiment_1',
            'train_file': 'data/rsmtool_smTrain.csv',
            'test_file': 'data/rsmtool_smEval.csv',
            'model': 'LinearRegression',
            'select_transformations': True,
            'features': 'some_file.csv'}

    with warnings.catch_warnings(record=True) as warning_list:
        _ = ConfigurationParser.validate_config(data)

    eq_(len(warning_list), 1)
    ok_(issubclass(warning_list[0].category, UserWarning))
def test_validate_config_numeric_subgroup_threshold(self):
    data = {'experiment_id': 'experiment_1',
            'train_file': 'data/rsmtool_smTrain.csv',
            'test_file': 'data/rsmtool_smEval.csv',
            'model': 'LinearRegression',
            'subgroups': ['L2', 'L1'],
            'min_n_per_group': 100}

    newdata = ConfigurationParser.validate_config(data)

    # a single numeric threshold should be expanded into a
    # dictionary with one entry per subgroup
    eq_(type(newdata['min_n_per_group']), dict)
    assert_equal(newdata['min_n_per_group']['L1'], 100)
    assert_equal(newdata['min_n_per_group']['L2'], 100)
def test_validate_config_warning_feature_list_and_transformations(self):
    # this should not show any warnings
    data = {'experiment_id': 'experiment_1',
            'train_file': 'data/rsmtool_smTrain.csv',
            'test_file': 'data/rsmtool_smEval.csv',
            'model': 'LinearRegression',
            'select_transformations': True,
            'features': ['feature1', 'feature2']}

    with warnings.catch_warnings(record=True) as warning_list:
        _ = ConfigurationParser.validate_config(data)

    eq_(len(warning_list), 0)
@raises(ValueError)
def test_validate_config_experiment_id_5(self):
    data = {'experiment_id': 'this_is_a_really_really_really_'
                             'really_really_really_really_really_really_really_'
                             'really_really_really_really_really_really_really_'
                             'really_really_really_really_really_really_really_'
                             'really_really_really_long_id',
            'train_file': 'data/rsmtool_smTrain.csv',
            'test_file': 'data/rsmtool_smEval.csv',
            'model': 'LinearRegression'}

    _ = ConfigurationParser.validate_config(data)
def test_process_fields(self):
    data = {'experiment_id': 'experiment_1',
            'train_file': 'data/rsmtool_smTrain.csv',
            'test_file': 'data/rsmtool_smEval.csv',
            'description': 'Test',
            'model': 'empWt',
            'use_scaled_predictions': 'True',
            'subgroups': 'native language, GPA_range',
            'exclude_zero_scores': 'false'}

    newdata = ConfigurationParser.process_config(data)

    assert_array_equal(newdata['subgroups'], ['native language', 'GPA_range'])
    eq_(type(newdata['use_scaled_predictions']), bool)
    eq_(newdata['use_scaled_predictions'], True)
    eq_(newdata['exclude_zero_scores'], False)
@raises(ValueError)
def test_validate_config_experiment_id_7(self):
    data = {'comparison_id': 'this_is_a_really_really_really_'
                             'really_really_really_really_really_really_really_'
                             'really_really_really_really_really_really_really_'
                             'really_really_really_really_really_really_really_'
                             'really_really_really_long_id',
            'experiment_id_old': 'old_experiment',
            'experiment_dir_old': 'data/old',
            'experiment_id_new': 'new_experiment',
            'experiment_dir_new': 'data/new'}

    _ = ConfigurationParser.validate_config(data, context='rsmcompare')
@raises(ValueError)
def test_validate_config_missing_fields(self):
    data = {'experiment_id': 'test'}

    _ = ConfigurationParser.validate_config(data)
@raises(ValueError)
def test_validate_config_experiment_id_8(self):
    data = {'summary_id': 'model summary',
            'experiment_dirs': []}

    _ = ConfigurationParser.validate_config(data, context='rsmsummarize')
def compute_and_save_predictions(config_file_or_obj, output_file, feats_file=None):
    """
    Run ``rsmpredict`` with the given configuration file and generate
    predictions (and, optionally, pre-processed feature values).

    Parameters
    ----------
    config_file_or_obj : str or configuration_parser.Configuration
        Path to the experiment configuration file.
        Users can also pass a `Configuration` object that is in memory.
    output_file : str
        Path to the output file for saving predictions.
    feats_file : str, optional
        Path to the output file for saving preprocessed feature values.

    Raises
    ------
    ValueError
        If any of the required fields are missing or ill-specified.
    """
    logger = logging.getLogger(__name__)

    # Allow users to pass a Configuration object to the
    # `config_file_or_obj` argument, rather than read from file
    if not isinstance(config_file_or_obj, Configuration):

        # Instantiate configuration parser object
        parser = ConfigurationParser.get_configparser(config_file_or_obj)
        config = parser.read_normalize_validate_and_process_config(config_file_or_obj,
                                                                   context='rsmpredict')

        # get the directory where the config file lives
        configpath = dirname(config_file_or_obj)

    else:
        config = config_file_or_obj
        if config.filepath is not None:
            configpath = dirname(config.filepath)
        else:
            configpath = os.getcwd()

    # get the experiment ID
    experiment_id = config['experiment_id']

    # Get output format
    file_format = config.get('file_format', 'csv')

    # Get DataWriter object
    writer = DataWriter(experiment_id)

    # get the input file containing the feature values
    # for which we want to generate the predictions
    input_features_file = DataReader.locate_files(config['input_features_file'], configpath)
    if not input_features_file:
        raise FileNotFoundError('Input file {} does not exist'
                                ''.format(config['input_features_file']))

    experiment_dir = DataReader.locate_files(config['experiment_dir'], configpath)
    if not experiment_dir:
        raise FileNotFoundError('The directory {} does not exist.'
                                ''.format(config['experiment_dir']))
    else:
        experiment_output_dir = normpath(join(experiment_dir, 'output'))
        if not exists(experiment_output_dir):
            raise FileNotFoundError('The directory {} does not contain '
                                    'the output of an rsmtool experiment.'.format(experiment_dir))

    # find all the .model files in the experiment output directory
    model_files = glob.glob(join(experiment_output_dir, '*.model'))
    if not model_files:
        raise FileNotFoundError('The directory {} does not contain any rsmtool models.'
                                ''.format(experiment_output_dir))

    experiment_ids = [splitext(basename(mf))[0] for mf in model_files]
    if experiment_id not in experiment_ids:
        raise FileNotFoundError('{} does not contain a model for the experiment "{}". '
                                'The following experiments are contained in this '
                                'directory: {}'.format(experiment_output_dir,
                                                       experiment_id,
                                                       experiment_ids))

    # check that the directory contains the other required files
    required_file_types = ['feature', 'postprocessing_params']
    for file_type in required_file_types:
        expected_file_name = "{}_{}.csv".format(experiment_id, file_type)
        if not exists(join(experiment_output_dir, expected_file_name)):
            raise FileNotFoundError('{} does not contain the required file '
                                    '{} that was generated during the '
                                    'original model training'.format(experiment_output_dir,
                                                                     expected_file_name))

    logger.info('Reading input files.')

    feature_info = join(experiment_output_dir,
                        '{}_feature.csv'.format(experiment_id))

    post_processing = join(experiment_output_dir,
                           '{}_postprocessing_params.csv'.format(experiment_id))

    file_paths = [input_features_file, feature_info, post_processing]
    file_names = ['input_features',
                  'feature_info',
                  'postprocessing_params']

    converters = {'input_features': config.get_default_converter()}

    # Initialize the reader
    reader = DataReader(file_paths, file_names, converters)
    data_container = reader.read(kwargs_dict={'feature_info': {'index_col': 0}})

    # load the Modeler to generate the predictions
    model = Modeler.load_from_file(join(experiment_output_dir,
                                        '{}.model'.format(experiment_id)))

    # Add the model to the configuration object
    config['model'] = model

    # Initialize the processor
    processor = FeaturePreprocessor()

    (processed_config,
     processed_container) = processor.process_data(config,
                                                   data_container,
                                                   context='rsmpredict')

    # save the pre-processed features to disk if we were asked to
    if feats_file is not None:
        logger.info('Saving pre-processed feature values to {}'.format(feats_file))

        feats_dir = dirname(feats_file)

        # create any directories needed for the output file
        os.makedirs(feats_dir, exist_ok=True)

        _, feats_filename = split(feats_file)
        feats_filename, _ = splitext(feats_filename)

        # Write out files
        writer.write_experiment_output(feats_dir,
                                       processed_container,
                                       include_experiment_id=False,
                                       dataframe_names=['features_processed'],
                                       new_names_dict={'features_processed': feats_filename},
                                       file_format=file_format)

    if (output_file.lower().endswith('.csv') or
            output_file.lower().endswith('.xlsx')):

        output_dir = dirname(output_file)
        _, filename = split(output_file)
        filename, _ = splitext(filename)

    else:
        output_dir = output_file
        filename = 'predictions_with_metadata'

    # create any directories needed for the output file
    os.makedirs(output_dir, exist_ok=True)

    # save the predictions to disk
    logger.info('Saving predictions.')

    # Write out files
    writer.write_experiment_output(output_dir,
                                   processed_container,
                                   include_experiment_id=False,
                                   dataframe_names=['predictions_with_metadata'],
                                   new_names_dict={'predictions_with_metadata': filename},
                                   file_format=file_format)

    # save excluded responses to disk
    if not processed_container.excluded.empty:

        # save the excluded responses to disk
        logger.info('Saving excluded responses to {}'.format(join(output_dir,
                                                                  '{}_excluded_responses.csv'
                                                                  ''.format(filename))))

        # Write out files
        writer.write_experiment_output(output_dir,
                                       processed_container,
                                       include_experiment_id=False,
                                       dataframe_names=['excluded'],
                                       new_names_dict={'excluded': '{}_excluded_responses'
                                                                   ''.format(filename)},
                                       file_format=file_format)
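# A minimal usage sketch for `compute_and_save_predictions`. All paths below
# are hypothetical, and the `_example_` wrapper is illustrative only, not
# part of the rsmpredict API; the call signature matches the function above.
def _example_compute_and_save_predictions():
    # generate predictions for new responses using a previously trained
    # rsmtool model, and also save the preprocessed feature values
    compute_and_save_predictions('rsmpredict.json',
                                 'predictions/predictions.csv',
                                 feats_file='predictions/preprocessed_features.csv')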
def run_summary(config_file_or_obj, output_dir):
    """
    Run an rsmsummarize experiment using the given configuration
    file and generate all outputs in the given directory.

    Parameters
    ----------
    config_file_or_obj : str or configuration_parser.Configuration
        Path to the experiment configuration file.
        Users can also pass a `Configuration` object that is in memory.
    output_dir : str
        Path to the experiment output directory.

    Raises
    ------
    ValueError
        If any of the required fields are missing or ill-specified.
    """
    logger = logging.getLogger(__name__)

    # create the 'output' and the 'figure' sub-directories
    # where all the experiment output such as the CSV files
    # and the box plots will be saved
    csvdir = abspath(join(output_dir, 'output'))
    figdir = abspath(join(output_dir, 'figure'))
    reportdir = abspath(join(output_dir, 'report'))

    os.makedirs(csvdir, exist_ok=True)
    os.makedirs(figdir, exist_ok=True)
    os.makedirs(reportdir, exist_ok=True)

    # Allow users to pass a Configuration object to the
    # `config_file_or_obj` argument, rather than read from file
    if not isinstance(config_file_or_obj, Configuration):

        # Instantiate configuration parser object
        parser = ConfigurationParser.get_configparser(config_file_or_obj)
        configuration = parser.read_normalize_validate_and_process_config(config_file_or_obj,
                                                                          context='rsmsummarize')

        # get the directory where the configuration file lives
        configpath = dirname(config_file_or_obj)

    else:
        configuration = config_file_or_obj
        if configuration.filepath is not None:
            configpath = dirname(configuration.filepath)
        else:
            configpath = os.getcwd()

    logger.info('Saving configuration file.')
    configuration.save(output_dir)

    # get the list of the experiment dirs
    experiment_dirs = configuration['experiment_dirs']

    # get the experiment names, if any
    experiment_names = configuration.get('experiment_names')
    experiment_names = experiment_names if experiment_names else [None] * len(experiment_dirs)
    dirs_with_names = zip(experiment_dirs, experiment_names)

    # check the experiment dirs and assemble the list of csvdirs and jsons
    all_experiments = []
    for (experiment_dir, experiment_name) in dirs_with_names:
        experiments = check_experiment_dir(experiment_dir,
                                           experiment_name,
                                           configpath)
        all_experiments.extend(experiments)

    # get the subgroups if any
    # Note: at the moment no comparisons are reported for subgroups.
    # This option is added to the code to make it easier to add
    # subgroup comparisons in future versions.
    subgroups = configuration.get('subgroups')

    general_report_sections = configuration['general_sections']

    # get any special sections that the user might have specified
    special_report_sections = configuration['special_sections']

    # get any custom sections and locate them to make sure
    # that they exist, otherwise raise an exception
    custom_report_section_paths = configuration['custom_sections']
    if custom_report_section_paths:
        logger.info('Locating custom report sections')
        custom_report_sections = Reporter.locate_custom_sections(custom_report_section_paths,
                                                                 configpath)
    else:
        custom_report_sections = []

    section_order = configuration['section_order']

    # Initialize reporter
    reporter = Reporter()

    # check all section values and their order and get the
    # ordered list of notebook files
    chosen_notebook_files = reporter.get_ordered_notebook_files(general_report_sections,
                                                                special_report_sections,
                                                                custom_report_sections,
                                                                section_order,
                                                                subgroups,
                                                                model_type=None,
                                                                context='rsmsummarize')

    # add chosen notebook files to configuration
    configuration['chosen_notebook_files'] = chosen_notebook_files

    # now generate the summary report
    logger.info('Starting report generation')
    reporter.create_summary_report(configuration,
                                   all_experiments,
                                   csvdir)
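# A minimal usage sketch for `run_summary`. The paths are hypothetical and
# the `_example_` wrapper is illustrative only; the experiment directories
# listed in the configuration file are resolved relative to that file's
# location, as in the function above.
def _example_run_summary():
    # summarize several existing rsmtool experiments into a single report
    run_summary('rsmsummarize.json', 'summary_output')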
class TestConfigurationParser:

    def setUp(self):
        self.parser = ConfigurationParser()

    def test_normalize_config(self):
        data = {'expID': 'experiment_1',
                'train': 'data/rsmtool_smTrain.csv',
                'LRmodel': 'empWt',
                'feature': 'feature/feature_list.json',
                'description': 'A sample model with 9 features '
                               'trained using average score and tested using r1.',
                'test': 'data/rsmtool_smEval.csv',
                'train.lab': 'sc1',
                'crossvalidate': 'yes',
                'test.lab': 'r1',
                'scale': 'scale'}

        with warnings.catch_warnings():
            warnings.filterwarnings('ignore', category=DeprecationWarning)

            # Add data to `ConfigurationParser` object
            self.parser.load_config_from_dict(data)

            newdata = self.parser.normalize_config()
            ok_('experiment_id' in newdata.keys())
            assert_equal(newdata['experiment_id'], 'experiment_1')
            assert_equal(newdata['use_scaled_predictions'], True)

        # test for non-standard scaling value
        data = {'expID': 'experiment_1',
                'train': 'data/rsmtool_smTrain.csv',
                'LRmodel': 'LinearRegression',
                'scale': 'Yes'}

        with warnings.catch_warnings():

            # Add data to `ConfigurationParser` object
            self.parser._config = data
            warnings.filterwarnings('ignore', category=DeprecationWarning)
            assert_raises(ValueError, self.parser.normalize_config)

        # test when no scaling is specified
        data = {'expID': 'experiment_1',
                'train': 'data/rsmtool_smTrain.csv',
                'LRmodel': 'LinearRegression',
                'feature': 'feature/feature_list.json',
                'description': 'A sample model with 9 features '
                               'trained using average score and tested using r1.',
                'test': 'data/rsmtool_smEval.csv',
                'train.lab': 'sc1',
                'crossvalidate': 'yes',
                'test.lab': 'r1'}

        with warnings.catch_warnings():
            warnings.filterwarnings('ignore', category=DeprecationWarning)

            # Add data to `ConfigurationParser` object
            self.parser._config = data

            newdata = self.parser.normalize_config()
            ok_('use_scaled_predictions' not in newdata.keys())

    @raises(ValueError)
    def test_validate_config_missing_fields(self):
        data = {'expID': 'test'}

        # Add data to `ConfigurationParser` object
        self.parser.load_config_from_dict(data)
        self.parser.validate_config()

    @raises(ValueError)
    def test_validate_config_min_responses_but_no_candidate(self):
        data = {'experiment_id': 'experiment_1',
                'train_file': 'data/rsmtool_smTrain.csv',
                'test_file': 'data/rsmtool_smEval.csv',
                'model': 'LinearRegression',
                'min_responses_per_candidate': 5}

        # Add data to `ConfigurationParser` object
        self.parser.load_config_from_dict(data)
        self.parser.validate_config()

    def test_validate_config_unspecified_fields(self):
        data = {'experiment_id': 'experiment_1',
                'train_file': 'data/rsmtool_smTrain.csv',
                'test_file': 'data/rsmtool_smEval.csv',
                'model': 'LinearRegression'}

        # Add data to `ConfigurationParser` object
        self.parser._config = data

        newdata = self.parser.validate_config()
        assert_equal(newdata['id_column'], 'spkitemid')
        assert_equal(newdata['use_scaled_predictions'], False)
        assert_equal(newdata['select_transformations'], False)
        assert_equal(newdata['general_sections'], 'all')
        assert_equal(newdata['description'], '')

    @raises(ValueError)
    def test_validate_config_unknown_fields(self):
        data = {'experiment_id': 'experiment_1',
                'train_file': 'data/rsmtool_smTrain.csv',
                'test_file': 'data/rsmtool_smEval.csv',
                'description': 'Test',
                'model': 'LinearRegression',
                'output': 'foobar'}

        # Add data to `ConfigurationParser` object
        self.parser.load_config_from_dict(data)
        self.parser.validate_config()

    @raises(ValueError)
    def test_validate_config_experiment_id_1(self):
        data = {'experiment_id': 'test experiment',
                'train_file': 'data/rsmtool_smTrain.csv',
                'test_file': 'data/rsmtool_smEval.csv',
                'model': 'LinearRegression'}

        # Add data to `ConfigurationParser` object
        self.parser.load_config_from_dict(data)
        self.parser.validate_config()

    @raises(ValueError)
    def test_validate_config_experiment_id_2(self):
        data = {'experiment_id': 'test experiment',
                'predictions_file': 'data/foo',
                'system_score_column': 'h1',
                'trim_min': 1,
                'trim_max': 5}

        # Add data to `ConfigurationParser` object
        self.parser.load_config_from_dict(data)
        self.parser.validate_config(context='rsmeval')

    @raises(ValueError)
    def test_validate_config_experiment_id_3(self):
        data = {'comparison_id': 'old vs new',
                'experiment_id_old': 'old_experiment',
                'experiment_dir_old': 'data/old',
                'experiment_id_new': 'new_experiment',
                'experiment_dir_new': 'data/new'}

        # Add data to `ConfigurationParser` object
        self.parser.load_config_from_dict(data)
        self.parser.validate_config(context='rsmcompare')

    @raises(ValueError)
    def test_validate_config_experiment_id_4(self):
        data = {'comparison_id': 'old vs new',
                'experiment_id_old': 'old experiment',
                'experiment_dir_old': 'data/old',
                'experiment_id_new': 'new_experiment',
                'experiment_dir_new': 'data/new'}

        # Add data to `ConfigurationParser` object
        self.parser.load_config_from_dict(data)
        self.parser.validate_config(context='rsmcompare')

    @raises(ValueError)
    def test_validate_config_experiment_id_5(self):
        data = {'experiment_id': 'this_is_a_really_really_really_'
                                 'really_really_really_really_really_really_really_'
                                 'really_really_really_really_really_really_really_'
                                 'really_really_really_really_really_really_really_'
                                 'really_really_really_long_id',
                'train_file': 'data/rsmtool_smTrain.csv',
                'test_file': 'data/rsmtool_smEval.csv',
                'model': 'LinearRegression'}

        # Add data to `ConfigurationParser` object
        self.parser.load_config_from_dict(data)
        self.parser.validate_config()

    @raises(ValueError)
    def test_validate_config_experiment_id_6(self):
        data = {'experiment_id': 'this_is_a_really_really_really_'
                                 'really_really_really_really_really_really_really_'
                                 'really_really_really_really_really_really_really_'
                                 'really_really_really_really_really_really_really_'
                                 'really_really_really_long_id',
                'predictions_file': 'data/foo',
                'system_score_column': 'h1',
                'trim_min': 1,
                'trim_max': 5}

        # Add data to `ConfigurationParser` object
        self.parser.load_config_from_dict(data)
        self.parser.validate_config(context='rsmeval')

    @raises(ValueError)
    def test_validate_config_experiment_id_7(self):
        data = {'comparison_id': 'this_is_a_really_really_really_'
                                 'really_really_really_really_really_really_really_'
                                 'really_really_really_really_really_really_really_'
                                 'really_really_really_really_really_really_really_'
                                 'really_really_really_long_id',
                'experiment_id_old': 'old_experiment',
                'experiment_dir_old': 'data/old',
                'experiment_id_new': 'new_experiment',
                'experiment_dir_new': 'data/new'}

        # Add data to `ConfigurationParser` object
        self.parser.load_config_from_dict(data)
        self.parser.validate_config(context='rsmcompare')

    @raises(ValueError)
    def test_validate_config_experiment_id_8(self):
        data = {'summary_id': 'model summary',
                'experiment_dirs': []}

        # Add data to `ConfigurationParser` object
        self.parser.load_config_from_dict(data)
        self.parser.validate_config(context='rsmsummarize')

    @raises(ValueError)
    def test_validate_config_experiment_id_9(self):
        data = {'summary_id': 'this_is_a_really_really_really_'
                              'really_really_really_really_really_really_really_'
                              'really_really_really_really_really_really_really_'
                              'really_really_really_really_really_really_really_'
                              'really_really_really_long_id',
                'experiment_dirs': []}

        # Add data to `ConfigurationParser` object
        self.parser.load_config_from_dict(data)
        self.parser.validate_config(context='rsmsummarize')

    def test_process_fields(self):
        data = {'experiment_id': 'experiment_1',
                'train_file': 'data/rsmtool_smTrain.csv',
                'test_file': 'data/rsmtool_smEval.csv',
                'description': 'Test',
                'model': 'empWt',
                'use_scaled_predictions': 'True',
                'feature_prefix': '1gram, 2gram',
                'subgroups': 'native language, GPA_range',
                'exclude_zero_scores': 'false'}

        # Add data to `ConfigurationParser` object
        self.parser.load_config_from_dict(data)
        newdata = self.parser.validate_config(inplace=False)

        # Add data to `ConfigurationParser` object
        self.parser._config = newdata
        newdata = self.parser.process_config(inplace=False)

        assert_array_equal(newdata['feature_prefix'], ['1gram', '2gram'])
        assert_array_equal(newdata['subgroups'], ['native language', 'GPA_range'])
        eq_(type(newdata['use_scaled_predictions']), bool)
        eq_(newdata['use_scaled_predictions'], True)
        eq_(newdata['exclude_zero_scores'], False)

    @raises(ValueError)
    def test_process_fields_with_non_boolean(self):
        data = {'experiment_id': 'experiment_1',
                'train_file': 'data/rsmtool_smTrain.csv',
                'test_file': 'data/rsmtool_smEval.csv',
                'description': 'Test',
                'model': 'empWt',
                'use_scaled_predictions': 'True',
                'feature_prefix': '1gram, 2gram',
                'subgroups': 'native language, GPA_range',
                'exclude_zero_scores': 'Yes'}

        # Add data to `ConfigurationParser` object
        self.parser.load_config_from_dict(data)
        newdata = self.parser.validate_config()

        # Add data to `ConfigurationParser` object
        self.parser._config = newdata
        newdata = self.parser.process_config()

    @raises(ValueError)
    def test_process_fields_with_integer(self):
        data = {'experiment_id': 'experiment_1',
                'train_file': 'data/rsmtool_smTrain.csv',
                'test_file': 'data/rsmtool_smEval.csv',
                'description': 'Test',
                'model': 'empWt',
                'use_scaled_predictions': 'True',
                'feature_prefix': '1gram, 2gram',
                'subgroups': 'native language, GPA_range',
                'exclude_zero_scores': 1}

        # Add data to `ConfigurationParser` object
        self.parser.load_config_from_dict(data)
        newdata = self.parser.validate_config()

        # Add data to `ConfigurationParser` object
        self.parser._config = newdata
        newdata = self.parser.process_config()

    @raises(ValueError)
    def test_invalid_skll_objective(self):
        data = {'experiment_id': 'experiment_1',
                'train_file': 'data/rsmtool_smTrain.csv',
                'test_file': 'data/rsmtool_smEval.csv',
                'description': 'Test',
                'model': 'LinearSVR',
                'skll_objective': 'squared_error'}

        # Add data to `ConfigurationParser` object
        self.parser.load_config_from_dict(data)
        self.parser.validate_config()

    @raises(ValueError)
    def test_wrong_skll_model_for_expected_scores(self):
        data = {'experiment_id': 'experiment_1',
                'train_file': 'data/rsmtool_smTrain.csv',
                'test_file': 'data/rsmtool_smEval.csv',
                'description': 'Test',
                'model': 'LinearSVR',
                'predict_expected_scores': 'true'}

        # Add data to `ConfigurationParser` object
        self.parser.load_config_from_dict(data)
        self.parser.validate_config()

    @raises(ValueError)
    def test_builtin_model_for_expected_scores(self):
        data = {'experiment_id': 'experiment_1',
                'train_file': 'data/rsmtool_smTrain.csv',
                'test_file': 'data/rsmtool_smEval.csv',
                'description': 'Test',
                'model': 'NNLR',
                'predict_expected_scores': 'true'}

        # Add data to `ConfigurationParser` object
        self.parser.load_config_from_dict(data)
        self.parser.validate_config()

    def test_get_correct_configparser_cfg(self):
        config_parser = ConfigurationParser.get_configparser('config.cfg')
        assert isinstance(config_parser, CFGConfigurationParser)

    def test_get_correct_configparser_json(self):
        config_parser = ConfigurationParser.get_configparser('config.json')
        assert isinstance(config_parser, JSONConfigurationParser)
def run_experiment(config_file_or_obj, output_dir):
    """
    Run an RSMTool experiment using the given configuration
    file and generate all outputs in the given directory.

    Parameters
    ----------
    config_file_or_obj : str or Configuration
        Path to the experiment configuration file.
        Users can also pass a `Configuration` object that is in memory.
    output_dir : str
        Path to the experiment output directory.

    Raises
    ------
    ValueError
        If any of the required fields are missing or ill-specified.
    """
    logger = logging.getLogger(__name__)

    # create the 'output' and the 'figure' sub-directories
    # where all the experiment output such as the CSV files
    # and the box plots will be saved

    # Get absolute paths to output directories
    csvdir = abspath(join(output_dir, 'output'))
    figdir = abspath(join(output_dir, 'figure'))
    reportdir = abspath(join(output_dir, 'report'))
    featuredir = abspath(join(output_dir, 'feature'))

    # Make directories, if necessary
    makedirs(csvdir, exist_ok=True)
    makedirs(figdir, exist_ok=True)
    makedirs(reportdir, exist_ok=True)

    # Allow users to pass a Configuration object to the
    # `config_file_or_obj` argument, rather than read from file
    if not isinstance(config_file_or_obj, Configuration):

        # Instantiate configuration parser object
        parser = ConfigurationParser.get_configparser(config_file_or_obj)
        configuration = parser.read_normalize_validate_and_process_config(config_file_or_obj)

        # get the directory where the configuration file lives
        configpath = dirname(config_file_or_obj)

    else:
        configuration = config_file_or_obj
        if configuration.filepath is not None:
            configpath = dirname(configuration.filepath)
        else:
            configpath = getcwd()

    logger.info('Saving configuration file.')
    configuration.save(output_dir)

    # Get output format
    file_format = configuration.get('file_format', 'csv')

    # Get DataWriter object
    writer = DataWriter(configuration['experiment_id'])

    # Get the paths and names for the DataReader
    (file_names,
     file_paths_org) = configuration.get_names_and_paths(['train_file', 'test_file',
                                                          'features',
                                                          'feature_subset_file'],
                                                         ['train', 'test',
                                                          'feature_specs',
                                                          'feature_subset_specs'])

    file_paths = DataReader.locate_files(file_paths_org, configpath)

    # if there are any missing files after trying to locate
    # all expected files, raise an error
    if None in file_paths:
        missing_file_paths = [file_paths_org[idx]
                              for idx, path in enumerate(file_paths)
                              if path is None]
        raise FileNotFoundError('The following files were not found: '
                                '{}'.format(repr(missing_file_paths)))

    # Use the default converter for both train and test
    converters = {'train': configuration.get_default_converter(),
                  'test': configuration.get_default_converter()}

    logger.info('Reading in all data from files.')

    # Initialize the reader
    reader = DataReader(file_paths, file_names, converters)
    data_container = reader.read()

    logger.info('Preprocessing all features.')

    # Initialize the processor
    processor = FeaturePreprocessor()

    (processed_config,
     processed_container) = processor.process_data(configuration,
                                                   data_container)

    # Rename certain frames with more descriptive names
    # for writing out experiment files
    rename_dict = {'train_excluded': 'train_excluded_responses',
                   'test_excluded': 'test_excluded_responses',
                   'train_length': 'train_response_lengths',
                   'train_flagged': 'train_responses_with_excluded_flags',
                   'test_flagged': 'test_responses_with_excluded_flags'}

    logger.info('Saving training and test set data to disk.')

    # Write out files
    writer.write_experiment_output(csvdir,
                                   processed_container,
                                   ['train_features',
                                    'test_features',
                                    'train_metadata',
                                    'test_metadata',
                                    'train_other_columns',
                                    'test_other_columns',
                                    'train_preprocessed_features',
                                    'test_preprocessed_features',
                                    'train_excluded',
                                    'test_excluded',
                                    'train_length',
                                    'test_human_scores',
                                    'train_flagged',
                                    'test_flagged'],
                                   rename_dict,
                                   file_format=file_format)

    # Initialize the analyzer
    analyzer = Analyzer()

    (analyzed_config,
     analyzed_container) = analyzer.run_data_composition_analyses_for_rsmtool(processed_container,
                                                                              processed_config)

    # Write out files
    writer.write_experiment_output(csvdir,
                                   analyzed_container,
                                   file_format=file_format)

    logger.info('Training {} model.'.format(processed_config['model_name']))

    # Initialize modeler
    modeler = Modeler()

    modeler.train(processed_config,
                  processed_container,
                  csvdir,
                  figdir,
                  file_format)

    # Identify the features used by the model
    selected_features = modeler.get_feature_names()

    # Add selected features to processed configuration
    processed_config['selected_features'] = selected_features

    # Write out files
    writer.write_feature_csv(featuredir,
                             processed_container,
                             selected_features,
                             file_format=file_format)

    features_data_container = processed_container.copy()

    # Get selected feature info, and write out to file
    df_feature_info = features_data_container.feature_info.copy()
    df_selected_feature_info = df_feature_info[df_feature_info['feature'].isin(selected_features)]
    selected_feature_dataset_dict = {'name': 'selected_feature_info',
                                     'frame': df_selected_feature_info}

    features_data_container.add_dataset(selected_feature_dataset_dict,
                                        update=True)

    writer.write_experiment_output(csvdir,
                                   features_data_container,
                                   dataframe_names=['selected_feature_info'],
                                   new_names_dict={'selected_feature_info': 'feature'},
                                   file_format=file_format)

    logger.info('Running analyses on training set.')

    (train_analyzed_config,
     train_analyzed_container) = analyzer.run_training_analyses(processed_container,
                                                                processed_config)

    # Write out files
    writer.write_experiment_output(csvdir,
                                   train_analyzed_container,
                                   reset_index=True,
                                   file_format=file_format)

    # Use only selected features for predictions
    columns_for_prediction = ['spkitemid', 'sc1'] + selected_features
    train_for_prediction = processed_container.train_preprocessed_features[columns_for_prediction]
    test_for_prediction = processed_container.test_preprocessed_features[columns_for_prediction]

    logged_str = 'Generating training and test set predictions'
    logged_str += ' (expected scores).' if configuration['predict_expected_scores'] else '.'
    logger.info(logged_str)

    (pred_config,
     pred_data_container) = modeler.predict_train_and_test(train_for_prediction,
                                                           test_for_prediction,
                                                           processed_config)

    # Write out files
    writer.write_experiment_output(csvdir,
                                   pred_data_container,
                                   new_names_dict={'pred_test': 'pred_processed'},
                                   file_format=file_format)

    original_coef_file = join(csvdir, '{}_coefficients.{}'.format(pred_config['experiment_id'],
                                                                  file_format))

    # If the coefficients file exists, then generate
    # scaled coefficients and save them to a file
    if exists(original_coef_file):
        logger.info('Scaling the coefficients and saving them to disk')
        try:

            # Scale coefficients, and return a DataContainer with the scaled coefficients
            scaled_data_container = modeler.scale_coefficients(pred_config)

            # Write out files to disk
            writer.write_experiment_output(csvdir,
                                           scaled_data_container,
                                           file_format=file_format)

        except AttributeError:
            raise ValueError("It appears you are trying to save two different "
                             "experiments to the same directory using the same "
                             "ID. Please clear the content of the directory and "
                             "rerun both experiments using different "
                             "experiment IDs.")

    # Add processed data_container frames to pred_data_container
    new_pred_data_container = pred_data_container + processed_container

    logger.info('Running prediction analyses.')

    (pred_analysis_config,
     pred_analysis_data_container) = analyzer.run_prediction_analyses(new_pred_data_container,
                                                                      pred_config)

    # Write out files
    writer.write_experiment_output(csvdir,
                                   pred_analysis_data_container,
                                   reset_index=True,
                                   file_format=file_format)

    # Initialize reporter
    reporter = Reporter()

    # generate the report
    logger.info('Starting report generation.')
    reporter.create_report(processed_config,
                           csvdir,
                           figdir)
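# A minimal usage sketch for `run_experiment`. All paths and field values are
# hypothetical, and the `_example_` wrapper is illustrative only. The first
# call reads the configuration from disk; the second builds a Configuration
# object in memory, mirroring the object-based tests above. Note that when a
# Configuration object has no `filepath`, relative paths inside it are
# resolved against the current working directory.
def _example_run_experiment():
    # from a configuration file on disk
    run_experiment('lr_baseline.json', 'lr_baseline_output')

    # from an in-memory Configuration object
    config_parser = ConfigurationParser()
    config_parser.load_config_from_dict({'experiment_id': 'lr_baseline',
                                         'train_file': 'train.csv',
                                         'test_file': 'test.csv',
                                         'model': 'LinearRegression'})
    config_obj = config_parser.normalize_validate_and_process_config()
    run_experiment(config_obj, 'lr_baseline_output')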
def setUp(self):
    self.parser = ConfigurationParser()
def test_init_non_json_file(self):
    with tempfile.NamedTemporaryFile(suffix=".txt") as tempf:
        _ = ConfigurationParser(tempf.name)
def test_get_correct_configparser_cfg(self):
    config_parser = ConfigurationParser.get_configparser('config.cfg')
    assert isinstance(config_parser, CFGConfigurationParser)
def test_get_correct_configparser_json(self):
    config_parser = ConfigurationParser.get_configparser('config.json')
    assert isinstance(config_parser, JSONConfigurationParser)
def test_init_directory_instead_of_file(self):
    with tempfile.TemporaryDirectory() as tempd:
        _ = ConfigurationParser(tempd)
def test_init_nonexistent_file(self):
    non_existent_file = "/x/y.json"
    _ = ConfigurationParser(non_existent_file)
def run_comparison(config_file_or_obj, output_dir):
    """
    Run an ``rsmcompare`` experiment using the given configuration
    file and generate the report in the given directory.

    Parameters
    ----------
    config_file_or_obj : str or Configuration
        Path to the experiment configuration file.
        Users can also pass a `Configuration` object that is in memory.
    output_dir : str
        Path to the experiment output directory.

    Raises
    ------
    ValueError
        If any of the required fields are missing or ill-specified.
    """
    logger = logging.getLogger(__name__)

    # Allow users to pass a Configuration object to the
    # `config_file_or_obj` argument, rather than read from file
    if not isinstance(config_file_or_obj, Configuration):

        # Instantiate configuration parser object
        parser = ConfigurationParser.get_configparser(config_file_or_obj)
        configuration = parser.read_normalize_validate_and_process_config(config_file_or_obj,
                                                                          context='rsmcompare')

        # get the directory where the configuration file lives
        configpath = dirname(config_file_or_obj)

    else:
        configuration = config_file_or_obj
        if configuration.filepath is not None:
            configpath = dirname(configuration.filepath)
        else:
            configpath = os.getcwd()

    logger.info('Saving configuration file.')
    configuration.save(output_dir)

    # get the information about the "old" experiment
    experiment_id_old = configuration['experiment_id_old']
    experiment_dir_old = DataReader.locate_files(configuration['experiment_dir_old'],
                                                 configpath)
    if not experiment_dir_old:
        raise FileNotFoundError("The directory {} "
                                "does not exist.".format(configuration['experiment_dir_old']))
    else:
        csvdir_old = normpath(join(experiment_dir_old, 'output'))
        figdir_old = normpath(join(experiment_dir_old, 'figure'))
        if not exists(csvdir_old) or not exists(figdir_old):
            raise FileNotFoundError("The directory {} does not contain "
                                    "the output of an rsmtool "
                                    "experiment.".format(experiment_dir_old))

    check_experiment_id(experiment_dir_old, experiment_id_old)

    # get the information about the "new" experiment
    experiment_id_new = configuration['experiment_id_new']
    experiment_dir_new = DataReader.locate_files(configuration['experiment_dir_new'],
                                                 configpath)
    if not experiment_dir_new:
        raise FileNotFoundError("The directory {} "
                                "does not exist.".format(configuration['experiment_dir_new']))
    else:
        csvdir_new = normpath(join(experiment_dir_new, 'output'))
        figdir_new = normpath(join(experiment_dir_new, 'figure'))
        if not exists(csvdir_new) or not exists(figdir_new):
            raise FileNotFoundError("The directory {} does not contain "
                                    "the output of an rsmtool "
                                    "experiment.".format(experiment_dir_new))

    check_experiment_id(experiment_dir_new, experiment_id_new)

    # are there specific general report sections we want to include?
    general_report_sections = configuration['general_sections']

    # what about the special or custom sections?
    special_report_sections = configuration['special_sections']

    custom_report_section_paths = configuration['custom_sections']

    # if custom report sections exist, locate sections; otherwise, create empty list
    if custom_report_section_paths:
        logger.info('Locating custom report sections')
        custom_report_sections = Reporter.locate_custom_sections(custom_report_section_paths,
                                                                 configpath)
    else:
        custom_report_sections = []

    # get the section order
    section_order = configuration['section_order']

    # get the subgroups if any
    subgroups = configuration.get('subgroups')

    # Initialize reporter
    reporter = Reporter()

    chosen_notebook_files = reporter.get_ordered_notebook_files(general_report_sections,
                                                                special_report_sections,
                                                                custom_report_sections,
                                                                section_order,
                                                                subgroups,
                                                                model_type=None,
                                                                context='rsmcompare')

    # add chosen notebook files to configuration
    configuration['chosen_notebook_files'] = chosen_notebook_files

    # now generate the comparison report
    logger.info('Starting report generation.')
    reporter.create_comparison_report(configuration,
                                      csvdir_old,
                                      figdir_old,
                                      csvdir_new,
                                      figdir_new,
                                      output_dir)
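# A minimal usage sketch for `run_comparison`. The paths are hypothetical and
# the `_example_` wrapper is illustrative only; the "old" and "new" experiment
# directories named in the configuration file are resolved relative to that
# file's location, as in the function above.
def _example_run_comparison():
    # compare an "old" and a "new" rsmtool experiment and write the
    # comparison report to the given output directory
    run_comparison('rsmcompare.json', 'comparison_output')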
def run_summary(config_file_or_obj, output_dir):
    """
    Run an rsmsummarize experiment using the given configuration
    file and generate all outputs in the given directory.

    Parameters
    ----------
    config_file_or_obj : str or configuration_parser.Configuration
        Path to the experiment configuration file.
        Users can also pass a `Configuration` object that is in memory.
    output_dir : str
        Path to the experiment output directory.

    Raises
    ------
    ValueError
        If any of the required fields are missing or ill-specified.
    """
    logger = logging.getLogger(__name__)

    # create the 'output' and the 'figure' sub-directories
    # where all the experiment output such as the CSV files
    # and the box plots will be saved
    csvdir = abspath(join(output_dir, 'output'))
    figdir = abspath(join(output_dir, 'figure'))
    reportdir = abspath(join(output_dir, 'report'))

    os.makedirs(csvdir, exist_ok=True)
    os.makedirs(figdir, exist_ok=True)
    os.makedirs(reportdir, exist_ok=True)

    # Allow users to pass a Configuration object to the
    # `config_file_or_obj` argument, rather than read from file
    if not isinstance(config_file_or_obj, Configuration):

        # Instantiate configuration parser object
        parser = ConfigurationParser.get_configparser(config_file_or_obj)
        configuration = parser.read_normalize_validate_and_process_config(config_file_or_obj,
                                                                          context='rsmsummarize')

        # get the directory where the configuration file lives
        configpath = dirname(config_file_or_obj)

    else:
        configuration = config_file_or_obj
        if configuration.filepath is not None:
            configpath = dirname(configuration.filepath)
        else:
            configpath = os.getcwd()

    # get the list of the experiment dirs
    experiment_dirs = configuration['experiment_dirs']

    # check the experiment dirs and assemble the list of csvdirs and jsons
    all_experiments = []
    for experiment_dir in experiment_dirs:
        experiments = check_experiment_dir(experiment_dir, configpath)
        all_experiments.extend(experiments)

    # get the subgroups if any
    # Note: at the moment no comparisons are reported for subgroups.
    # This option is added to the code to make it easier to add
    # subgroup comparisons in future versions.
    subgroups = configuration.get('subgroups')

    general_report_sections = configuration['general_sections']

    # get any special sections that the user might have specified
    special_report_sections = configuration['special_sections']

    # get any custom sections and locate them to make sure
    # that they exist, otherwise raise an exception
    custom_report_section_paths = configuration['custom_sections']
    if custom_report_section_paths:
        logger.info('Locating custom report sections')
        custom_report_sections = Reporter.locate_custom_sections(custom_report_section_paths,
                                                                 configpath)
    else:
        custom_report_sections = []

    section_order = configuration['section_order']

    # Initialize reporter
    reporter = Reporter()

    # check all section values and their order and get the
    # ordered list of notebook files
    chosen_notebook_files = reporter.get_ordered_notebook_files(general_report_sections,
                                                                special_report_sections,
                                                                custom_report_sections,
                                                                section_order,
                                                                subgroups,
                                                                model_type=None,
                                                                context='rsmsummarize')

    # add chosen notebook files to configuration
    configuration['chosen_notebook_files'] = chosen_notebook_files

    # now generate the summary report
    logger.info('Starting report generation')
    reporter.create_summary_report(configuration,
                                   all_experiments,
                                   csvdir)