def test_generate_specs_from_data_no_subset_specs():
    """
    Check that when no feature subset specs are given, specs are
    generated for every requested feature with a default sign of 1.
    """
    np.random.seed(10)
    data = {'Grammar': np.random.randn(10),
            'Fluency': np.random.randn(10),
            'Discourse': np.random.randn(10),
            'r1': np.random.choice(4, 10),
            'spkitemlab': ['a-5'] * 10}
    # NOTE: the original built this DataFrame twice; the duplicate
    # statement was redundant and has been removed.
    df = pd.DataFrame(data)

    specs = generate_specs_from_data(['Grammar', 'Fluency', 'Discourse'],
                                     'r1',
                                     df)
    feats = specs['features']

    assert_equal(len(feats), 3)
    assert_array_equal([f['feature'] for f in feats],
                       ['Grammar', 'Fluency', 'Discourse'])
    # with no subset specs, every feature gets the default sign of 1
    assert_equal(feats[0]['sign'], 1)
    assert_equal(feats[1]['sign'], 1)
    assert_equal(feats[2]['sign'], 1)
def test_generate_specs_from_data_with_transformation():
    """
    Check that a feature that is the square of the human score is
    assigned the 'sqrt' transform when specs are generated from data.
    """
    subset_specs = pd.DataFrame({'Feature': ['Grammar', 'Vocabulary', 'Fluency',
                                             'Content_coverage', 'Discourse'],
                                 'Sign_SYS1': ['-', '+', '+', '+', '-']})
    np.random.seed(10)
    scores = np.random.choice(range(1, 5), 10)
    # 'Vocabulary' is the score squared, so the best transform is sqrt
    df = pd.DataFrame({'Grammar': np.random.randn(10),
                       'Vocabulary': scores ** 2,
                       'Discourse': np.random.randn(10),
                       'r1': scores,
                       'spkitemlab': ['a-5'] * 10})

    specs = generate_specs_from_data(['Grammar', 'Vocabulary', 'Discourse'],
                                     'r1',
                                     df,
                                     subset_specs,
                                     'SYS1')
    vocabulary_spec = specs['features'][1]
    assert_equal(vocabulary_spec['feature'], 'Vocabulary')
    assert_equal(vocabulary_spec['transform'], 'sqrt')
def test_generate_specs_from_data_with_negative_sign():
    """
    Check that the signs given in the feature subset specs ('-' / '+')
    are carried through to the generated feature specs as -1 / 1.
    """
    subset_specs = pd.DataFrame({'Feature': ['Grammar', 'Vocabulary', 'Fluency',
                                             'Content_coverage', 'Discourse'],
                                 'Sign_SYS1': ['-', '+', '+', '+', '-']})
    np.random.seed(10)
    df = pd.DataFrame({'Grammar': np.random.randn(10),
                       'Fluency': np.random.randn(10),
                       'Discourse': np.random.randn(10),
                       'r1': np.random.choice(4, 10),
                       'spkitemlab': ['a-5'] * 10})

    specs = generate_specs_from_data(['Grammar', 'Fluency', 'Discourse'],
                                     'r1',
                                     df,
                                     subset_specs,
                                     'SYS1')
    feats = specs['features']

    assert_equal(len(feats), 3)
    assert_array_equal([f['feature'] for f in feats],
                       ['Grammar', 'Fluency', 'Discourse'])
    # expected signs come straight from Sign_SYS1: '-' -> -1, '+' -> 1
    for feat, expected_sign in zip(feats, [-1, 1, -1]):
        assert_equal(feat['sign'], expected_sign)
def load_experiment_data(main_config_file, outdir):
    """
    Set up the experiment by loading the training and evaluation
    data sets and preprocessing them.

    Parameters
    ----------
    main_config_file : str
        Path to the main JSON configuration file for the experiment.
    outdir : str
        Output directory for the experiment.
        NOTE(review): this parameter is not used anywhere in the body;
        it is kept for interface compatibility with callers.

    Returns
    -------
    tuple
        A large tuple of processed training/evaluation data frames,
        configuration values, and the chosen notebook files (see the
        return statement for the exact order).

    Raises
    ------
    ValueError
        If 'test_label_column' and 'second_human_score_column' are the
        same, or if 'length_column'/'second_human_score_column' are also
        requested as model features.
    FileNotFoundError
        If the feature subset file, feature file, training file, or
        evaluation file cannot be located.
    """
    logger = logging.getLogger(__name__)

    # read in the main config file
    logger.info('Reading configuration file: {}'.format(main_config_file))
    config_obj = read_json_file(main_config_file)
    config_obj = check_main_config(config_obj)

    # get the directory where the config file lives; relative paths in
    # the config are resolved against this directory
    configpath = dirname(main_config_file)

    # get the experiment ID
    experiment_id = config_obj['experiment_id']

    # get the description
    description = config_obj['description']

    # get the column name for the labels for the training and testing data
    train_label_column = config_obj['train_label_column']
    test_label_column = config_obj['test_label_column']

    # get the column name that will hold the ID for
    # both the training and the test data
    id_column = config_obj['id_column']

    # get the specified trim min and max values
    spec_trim_min, spec_trim_max = get_trim_min_max(config_obj)

    # get the name of the optional column that
    # contains response length.
    length_column = config_obj['length_column']

    # get the name of the optional column that
    # contains the second human score
    second_human_score_column = config_obj['second_human_score_column']

    # get the name of the optional column that
    # contains the candidate ID
    candidate_column = config_obj['candidate_column']

    # if the test label column is the same as the
    # second human score column, raise an error
    if test_label_column == second_human_score_column:
        raise ValueError("'test_label_column' and "
                         "'second_human_score_column' cannot have the "
                         "same value.")

    # get the name of the model that we want to train and
    # check that it's valid
    model_name = config_obj['model']
    model_type = check_model_name(model_name)

    # are we excluding zero scores?
    exclude_zero_scores = config_obj['exclude_zero_scores']

    # if we are excluding zero scores but trim_min
    # is set to 0, then we need to warn the user
    # (fixed: the message previously contained a double space)
    if exclude_zero_scores and spec_trim_min == 0:
        logger.warning("'exclude_zero_scores' is set to True but "
                       "'trim_min' is set to 0. This may cause "
                       "unexpected behavior.")

    # are we filtering on any other columns?
    flag_column_dict = check_flag_column(config_obj)

    # are we generating fake labels?
    use_fake_train_labels = train_label_column == 'fake'
    use_fake_test_labels = test_label_column == 'fake'

    # are we analyzing scaled or raw prediction values
    use_scaled_predictions = config_obj['use_scaled_predictions']

    # get the subgroups if any
    subgroups = config_obj.get('subgroups')

    # are there specific general report sections we want to include?
    general_report_sections = config_obj['general_sections']

    # what about the special or custom sections?
    special_report_sections = config_obj['special_sections']
    custom_report_section_paths = config_obj['custom_sections']

    if custom_report_section_paths:
        logger.info('Locating custom report sections')
        custom_report_sections = locate_custom_sections(custom_report_section_paths,
                                                        configpath)
    else:
        custom_report_sections = []

    section_order = config_obj['section_order']

    chosen_notebook_files = get_ordered_notebook_files(general_report_sections,
                                                       special_report_sections,
                                                       custom_report_sections,
                                                       section_order,
                                                       subgroups,
                                                       model_type=model_type,
                                                       context='rsmtool')

    # Read in the feature configurations.
    # Location of feature file
    feature_field = config_obj['features']

    # Check whether feature subset file exists and whether we are using
    # feature subset of prefix
    feature_subset_file = config_obj['feature_subset_file']
    if feature_subset_file:
        feature_subset_file_location = locate_file(feature_subset_file, configpath)
        if not feature_subset_file_location:
            raise FileNotFoundError('Feature subset file {} not '
                                    'found.\n'.format(config_obj['feature_subset_file']))
    feature_subset = config_obj['feature_subset']
    feature_prefix = config_obj['feature_prefix']

    # if the user requested feature_subset file and feature subset,
    # read the file and check its format
    if feature_subset_file and feature_subset:
        feature_subset_specs = pd.read_csv(feature_subset_file_location)
        check_feature_subset_file(feature_subset_specs, feature_subset)
    else:
        feature_subset_specs = None

    # Do we need to automatically find the best transformations/change sign?
    select_transformations = config_obj['select_transformations']
    feature_sign = config_obj['sign']
    requested_features = []
    feature_specs = {}
    select_features_automatically = True

    # For backward compatibility, we check whether this field can
    # be set to all and set the select_transformations to true
    # as was done in the previous version.
    if feature_field == 'all':
        select_transformations = True
    elif feature_field is not None:
        feature_file_location = locate_file(feature_field, configpath)
        select_features_automatically = False
        if not feature_file_location:
            raise FileNotFoundError('Feature file {} not '
                                    'found.\n'.format(config_obj['features']))
        else:
            logger.info('Reading feature file: {}'.format(feature_file_location))
            feature_json = read_json_file(feature_file_location)
            feature_specs = normalize_and_validate_feature_file(feature_json)
            requested_features = [fdict['feature']
                                  for fdict in feature_specs['features']]

    # check to make sure that `length_column` or `second_human_score_column`
    # are not also included in the requested features, if they are specified
    if length_column and length_column in requested_features:
        raise ValueError("The value of 'length_column' ('{}') cannot be "
                         "used as a model feature.".format(length_column))
    if second_human_score_column and second_human_score_column in requested_features:
        raise ValueError("The value of 'second_human_score_column' ('{}') cannot be "
                         "used as a model feature.".format(second_human_score_column))

    # Specify column names that cannot be used as features
    reserved_column_names = list(set(['spkitemid', 'spkitemlab',
                                      'itemType', 'r1', 'r2', 'score',
                                      'sc', 'sc1', 'adj',
                                      train_label_column,
                                      test_label_column,
                                      id_column] +
                                     subgroups +
                                     list(flag_column_dict.keys())))

    # if `second_human_score_column` is specified, then
    # we need to add `sc2` to the list of reserved column
    # names. And same for 'length' and 'candidate', if `length_column`
    # and `candidate_column` are specified
    if second_human_score_column:
        reserved_column_names.append('sc2')
    if length_column:
        reserved_column_names.append('length')
    if candidate_column:
        reserved_column_names.append('candidate')

    # Make sure that the training data as specified in the
    # config file actually exists on disk and if it does,
    # load it and filter out the bad rows and features with
    # zero standard deviation. Also double check that the requested
    # features exist in the data or obtain the feature names if
    # no feature file was given.
    train_file_location = locate_file(config_obj['train_file'], configpath)
    if not train_file_location:
        raise FileNotFoundError('Error: Training file {} '
                                'not found.\n'.format(config_obj['train_file']))
    else:
        logger.info('Reading training data: {}'.format(train_file_location))

    (df_train_features,
     df_train_metadata,
     df_train_other_columns,
     df_train_excluded,
     df_train_length,
     _,
     df_train_flagged_responses,
     used_trim_min,
     used_trim_max,
     feature_names) = load_and_filter_data(train_file_location,
                                           train_label_column,
                                           id_column,
                                           length_column,
                                           None,
                                           candidate_column,
                                           requested_features,
                                           reserved_column_names,
                                           spec_trim_min,
                                           spec_trim_max,
                                           flag_column_dict,
                                           subgroups,
                                           exclude_zero_scores=exclude_zero_scores,
                                           exclude_zero_sd=True,
                                           feature_subset_specs=feature_subset_specs,
                                           feature_subset=feature_subset,
                                           feature_prefix=feature_prefix,
                                           use_fake_labels=use_fake_train_labels)

    # Generate feature specifications now that we
    # know what features are selected
    if select_features_automatically:
        if select_transformations is False:
            feature_specs = generate_default_specs(feature_names)
        else:
            feature_specs = generate_specs_from_data(feature_names,
                                                     'sc1',
                                                     df_train_features,
                                                     feature_subset_specs=feature_subset_specs,
                                                     feature_sign=feature_sign)
    else:
        # Sanity check to make sure the function returned the
        # same feature names as specified in feature json file.
        # (fixed: was a redundant `elif not select_features_automatically`)
        assert feature_names == requested_features

    # Do the same for the test data except we can ignore the trim min
    # and max since we already have that from the training data and
    # we have the feature_names when no feature file was specified.
    # We also allow features with 0 standard deviation in the test file.
    test_file_location = locate_file(config_obj['test_file'], configpath)
    if not test_file_location:
        raise FileNotFoundError('Error: Evaluation file '
                                '{} not found.\n'.format(config_obj['test_file']))
    elif (test_file_location == train_file_location and
          train_label_column == test_label_column):
        # fixed: use the function's named logger instead of the root
        # logger (`logging.warning`) for consistency with other log calls
        logger.warning('The same data file and label '
                       'column are used for both training '
                       'and evaluating the model. No second '
                       'score analysis will be performed, even '
                       'if requested.')
        df_test_features = df_train_features.copy()
        df_test_metadata = df_train_metadata.copy()
        df_test_excluded = df_train_excluded.copy()
        df_test_other_columns = df_train_other_columns.copy()
        df_test_flagged_responses = df_train_flagged_responses.copy()
        df_test_human_scores = pd.DataFrame()
    else:
        logger.info('Reading evaluation data: {}'.format(test_file_location))
        (df_test_features,
         df_test_metadata,
         df_test_other_columns,
         df_test_excluded,
         _,
         df_test_human_scores,
         df_test_flagged_responses,
         _,
         _,
         _) = load_and_filter_data(test_file_location,
                                   test_label_column,
                                   id_column,
                                   None,
                                   second_human_score_column,
                                   candidate_column,
                                   feature_names,
                                   reserved_column_names,
                                   used_trim_min,
                                   used_trim_max,
                                   flag_column_dict,
                                   subgroups,
                                   exclude_zero_scores=exclude_zero_scores,
                                   exclude_zero_sd=False,
                                   use_fake_labels=use_fake_test_labels)

    return (df_train_features, df_test_features,
            df_train_metadata, df_test_metadata,
            df_train_other_columns, df_test_other_columns,
            df_train_excluded, df_test_excluded,
            df_train_length, df_test_human_scores,
            df_train_flagged_responses, df_test_flagged_responses,
            experiment_id, description,
            train_file_location, test_file_location,
            feature_specs, model_name, model_type,
            train_label_column, test_label_column,
            id_column, length_column,
            second_human_score_column, candidate_column,
            subgroups, feature_subset_file,
            used_trim_min, used_trim_max,
            use_scaled_predictions, exclude_zero_scores,
            select_features_automatically, chosen_notebook_files)