def check_load_featureset(suffix, numeric_ids):
    """
    Check that merging separate feature files matches the pre-merged file.

    Test data is generated with ``make_merging_data``, the five split files
    (named ``0`` .. ``4``) are loaded together, and the result is compared
    against the single ``all`` file loaded from the same directory.

    :param suffix: file extension of the feature files to load
    :param numeric_ids: forwarded unchanged to ``make_merging_data``
    """
    n_files = 5

    # Generate the fixture files
    make_merging_data(n_files, suffix, numeric_ids)

    merge_dir = os.path.join(_my_dir, 'train', 'test_merging')

    # Loading several names at once makes the loader merge them
    split_names = ['{}'.format(idx) for idx in range(n_files)]
    merged = _load_featureset(merge_dir, split_names, suffix, quiet=True)

    # The reference: a single file that already holds everything
    premerged = _load_featureset(merge_dir, ['all'], suffix, quiet=True)

    assert np.all(merged.ids == premerged.ids)
    assert np.all(merged.classes == premerged.classes)
    assert np.all(merged.features.todense() ==
                  premerged.features.todense())

    # The vectorizers must agree on both feature names and vocabulary
    for attr in ('feature_names_', 'vocabulary_'):
        eq_(getattr(merged.feat_vectorizer, attr),
            getattr(premerged.feat_vectorizer, attr))
def test_input_checking1():
    """
    Try to merge two featuresets that hold different numbers of examples
    """
    train_dir = join(_my_dir, 'train')
    mismatched = ['test_input_2examples_1', 'test_input_3examples_1']
    _load_featureset(train_dir, mismatched, '.jsonlines', quiet=True)
def test_input_checking2():
    """
    Try to join a featureset with itself, so every instance has the same
    features twice
    """
    train_dir = join(_my_dir, 'train')
    duplicated = ['test_input_3examples_1'] * 2
    _load_featureset(train_dir, duplicated, '.jsonlines', quiet=True)
def test_input_checking1():
    '''
    Join featuresets with different numbers of examples, which is expected
    to raise ValueError.

    NOTE(review): nothing in this body asserts the error; presumably an
    expectation decorator is applied to this test - confirm.
    '''
    train_dir = os.path.join(_my_dir, 'train')
    mismatched = ['test_input_2examples_1', 'test_input_3examples_1']
    _load_featureset(train_dir, mismatched, '.jsonlines', quiet=True)
def test_input_checking2():
    '''
    Join a featureset with itself (identical features for every instance),
    which is expected to raise ValueError.

    NOTE(review): nothing in this body asserts the error; presumably an
    expectation decorator is applied to this test - confirm.
    '''
    train_dir = os.path.join(_my_dir, 'train')
    duplicated = ['test_input_3examples_1'] * 2
    _load_featureset(train_dir, duplicated, '.jsonlines', quiet=True)
def test_one_file_load_featureset():
    """
    Loading by full file path must equal loading by directory + name + suffix
    """
    train_dir = join(_my_dir, 'train')

    # Variant 1: hand the loader the complete path and no featureset/suffix
    full_path = join(train_dir, 'test_input_2examples_1.jsonlines')
    loaded_by_path = _load_featureset(full_path, '', '', quiet=True)

    # Variant 2: the usual directory / featureset list / suffix triple
    loaded_by_name = _load_featureset(train_dir,
                                      ['test_input_2examples_1'],
                                      '.jsonlines',
                                      quiet=True)

    eq_(loaded_by_path, loaded_by_name)
def check_convert_featureset(from_suffix, to_suffix):
    """
    Convert split feature files between formats and compare with a
    pre-merged file in the target format.

    Each of the five ``<from>_to_<to>_<n>`` files is converted with
    ``skll_convert.main``; the converted files are then loaded together and
    compared against the ``<from>_to_<to>_all`` file.

    :param from_suffix: extension of the input files (leading dot included)
    :param to_suffix: extension of the converted files (leading dot included)
    """
    n_files = 5

    # Generate the fixture files
    make_conversion_data(n_files, from_suffix, to_suffix)

    conv_dir = os.path.join(_my_dir, 'train', 'test_conversion')

    # e.g. 'jsonlines_to_csv' for '.jsonlines' -> '.csv'
    prefix = '{}_to_{}'.format(from_suffix.lstrip('.'),
                               to_suffix.lstrip('.'))

    # Base names (no extension) of the individual, unmerged files
    stems = ['{}_{}'.format(prefix, idx) for idx in range(n_files)]

    # Convert every unmerged file from the source to the target format
    for stem in stems:
        src_path = os.path.join(conv_dir, '{}{}'.format(stem, from_suffix))
        dst_path = os.path.join(conv_dir, '{}{}'.format(stem, to_suffix))
        skll_convert.main(['--quiet', src_path, dst_path])

    # Load the converted pieces together, then the pre-merged reference
    merged = _load_featureset(conv_dir, stems, to_suffix, quiet=True)
    premerged = _load_featureset(conv_dir, ['{}_all'.format(prefix)],
                                 to_suffix, quiet=True)

    # Converted + merged data must equal the pre-generated merged data
    assert np.all(merged.ids == premerged.ids)
    assert np.all(merged.classes == premerged.classes)
    assert np.all(merged.features.todense() ==
                  premerged.features.todense())
    for attr in ('feature_names_', 'vocabulary_'):
        eq_(getattr(merged.feat_vectorizer, attr),
            getattr(premerged.feat_vectorizer, attr))
def check_specified_cv_folds(numeric_ids):
    """
    Run two experiments, with and without pre-specified CV folds, and check
    the scores, then repeat the comparison by training a learner directly.

    :param numeric_ids: forwarded to ``make_cv_folds_data``; also decides
                        the ``ids_to_floats`` value written into each config
    """
    make_cv_folds_data(numeric_ids)

    # Replacement for any 'ids_to_floats=' line in the generated configs
    if numeric_ids:
        ids_to_floats_line = 'ids_to_floats=true\n'
    else:
        ids_to_floats_line = 'ids_to_floats=false\n'

    # test_cv_folds1.cfg has prespecified folds and should have ~50% accuracy
    # test_cv_folds2.cfg doesn't have prespecified folds and >95% accuracy
    experiments = [('test_cv_folds1', lambda x: x < 0.6, 3),
                   ('test_cv_folds2', lambda x: x > 0.95, 10)]

    for experiment_name, test_func, grid_size in experiments:
        template_name = '{}.template.cfg'.format(experiment_name)
        template_path = os.path.join(_my_dir, 'configs', template_name)
        config_path = os.path.join(_my_dir,
                                   fill_in_config_paths(template_path))

        # Rewrite the config in place: read everything, empty the file,
        # then write the lines back with ids_to_floats adjusted
        with open(config_path, 'r+') as config_file:
            original_lines = config_file.readlines()
            config_file.seek(0)
            config_file.truncate()
            for line in original_lines:
                if line.startswith('ids_to_floats='):
                    line = ids_to_floats_line
                config_file.write(line)

        run_configuration(config_path, quiet=True)

        result_filename = \
            '{}_test_cv_folds_LogisticRegression.results'.format(
                experiment_name)
        result_path = os.path.join(_my_dir, 'output', result_filename)
        with open(result_path) as result_file:
            outstr = result_file.read()

        # check held out scores
        held_out_score = float(SCORE_OUTPUT_RE.search(outstr).groups()[-1])
        assert test_func(held_out_score)

        # every grid-search score must satisfy the same condition
        grid_score_matches = GRID_RE.findall(outstr)
        assert len(grid_score_matches) == grid_size
        for grid_score in grid_score_matches:
            assert test_func(float(grid_score))

    # try the same tests for just training (and specifying the folds for the
    # grid search)
    examples = _load_featureset(os.path.join(_my_dir, 'train'),
                                ['test_cv_folds'],
                                '.jsonlines',
                                quiet=True)
    clf = Learner('LogisticRegression', probability=True)
    cv_folds = _load_cv_folds(os.path.join(_my_dir, 'train',
                                           'test_cv_folds.csv'))

    # folds loaded from the CSV: score is expected to stay below 0.6
    folds_score = clf.train(examples, grid_search_folds=cv_folds,
                            grid_objective='accuracy', grid_jobs=1)
    assert folds_score < 0.6

    # plain 5-fold grid search: score is expected to exceed 0.95
    plain_score = clf.train(examples, grid_search_folds=5,
                            grid_objective='accuracy', grid_jobs=1)
    assert plain_score > 0.95
def check_convert_featureset(from_suffix, to_suffix):
    """
    Convert split feature files between formats and compare with a
    pre-merged file in the target format.

    :param from_suffix: extension of the input files (leading dot included)
    :param to_suffix: extension of the converted files (leading dot included)
    """
    n_files = 5

    # Generate the fixture files
    make_conversion_data(n_files, from_suffix, to_suffix)

    conv_dir = join(_my_dir, 'train', 'test_conversion')

    # e.g. 'jsonlines_to_csv' for '.jsonlines' -> '.csv'
    prefix = '{}_to_{}'.format(from_suffix.lstrip('.'),
                               to_suffix.lstrip('.'))

    # Base names (no extension) of the individual, unmerged files
    stems = ['{}_{}'.format(prefix, idx) for idx in range(n_files)]

    # Convert each unmerged file from the source to the target format
    for stem in stems:
        src_path = join(conv_dir, '{}{}'.format(stem, from_suffix))
        dst_path = join(conv_dir, '{}{}'.format(stem, to_suffix))
        skll_convert.main(['--quiet', src_path, dst_path])

    # Load the converted pieces together, then the pre-merged reference
    merged = _load_featureset(conv_dir, stems, to_suffix, quiet=True)
    premerged = _load_featureset(conv_dir, ['{}_all'.format(prefix)],
                                 to_suffix, quiet=True)

    # Converted + merged data must equal the pre-generated merged data
    assert_array_equal(merged.ids, premerged.ids)
    assert_array_equal(merged.labels, premerged.labels)
    for (_, _, feats_a), (_, _, feats_b) in zip(merged, premerged):
        eq_(feats_a, feats_b)

    # Feature names are compared order-insensitively
    eq_(sorted(merged.vectorizer.feature_names_),
        sorted(premerged.vectorizer.feature_names_))
def test_input_checking3():
    """
    Merge two featuresets and check the merged result has three rows
    """
    train_dir = join(_my_dir, 'train')
    parts = ['test_input_3examples_1', 'test_input_3examples_2']
    merged = _load_featureset(train_dir, parts, '.jsonlines', quiet=True)

    n_rows = merged.features.shape[0]
    eq_(n_rows, 3)
def test_input_checking3():
    '''
    Merge two featuresets and check the merged result has three rows.
    '''
    train_dir = os.path.join(_my_dir, 'train')
    parts = ['test_input_3examples_1', 'test_input_3examples_2']
    merged = _load_featureset(train_dir, parts, '.jsonlines', quiet=True)

    n_rows = merged.features.shape[0]
    assert n_rows == 3
def check_convert_featureset(from_suffix, to_suffix):
    """
    Check that converting and then merging feature files reproduces the
    pre-merged file in the target format.

    :param from_suffix: extension of the input files (leading dot included)
    :param to_suffix: extension of the converted files (leading dot included)
    """
    file_count = 5

    # Generate the fixture files
    make_conversion_data(file_count, from_suffix, to_suffix)

    work_dir = join(_my_dir, 'train', 'test_conversion')

    # Shared name prefix, built from both suffixes without their dots
    source_tag = from_suffix.lstrip('.')
    target_tag = to_suffix.lstrip('.')
    name_prefix = '{}_to_{}'.format(source_tag, target_tag)

    # Run the converter once per unmerged feature file
    for file_num in range(file_count):
        base = '{}_{}'.format(name_prefix, file_num)
        skll_convert.main(['--quiet',
                           join(work_dir, '{}{}'.format(base, from_suffix)),
                           join(work_dir, '{}{}'.format(base, to_suffix))])

    # Load all converted files together so the loader merges them
    converted_names = ['{}_{}'.format(name_prefix, file_num)
                       for file_num in range(file_count)]
    merged_set = _load_featureset(work_dir, converted_names, to_suffix,
                                  quiet=True)

    # Load the pre-merged reference in the target format
    reference_set = _load_featureset(work_dir,
                                     ['{}_all'.format(name_prefix)],
                                     to_suffix,
                                     quiet=True)

    # IDs, labels and per-example features must all match
    assert_array_equal(merged_set.ids, reference_set.ids)
    assert_array_equal(merged_set.labels, reference_set.labels)
    paired = zip(merged_set, reference_set)
    for (_, _, got_feats), (_, _, want_feats) in paired:
        eq_(got_feats, want_feats)

    # Feature names are compared order-insensitively
    got_names = sorted(merged_set.vectorizer.feature_names_)
    want_names = sorted(reference_set.vectorizer.feature_names_)
    eq_(got_names, want_names)
def check_convert_featureset(from_suffix, to_suffix):
    """
    Check that converting and then merging feature files reproduces the
    pre-merged file in the target format.

    :param from_suffix: extension of the input files (leading dot included)
    :param to_suffix: extension of the converted files (leading dot included)
    """
    file_count = 5

    # Generate the fixture files
    make_conversion_data(file_count, from_suffix, to_suffix)

    work_dir = os.path.join(_my_dir, 'train', 'test_conversion')

    # Shared name prefix, built from both suffixes without their dots
    source_tag = from_suffix.lstrip('.')
    target_tag = to_suffix.lstrip('.')
    name_prefix = '{}_to_{}'.format(source_tag, target_tag)

    # Run the converter once per unmerged feature file
    for file_num in range(file_count):
        base = '{}_{}'.format(name_prefix, file_num)
        skll_convert.main(
            ['--quiet',
             os.path.join(work_dir, '{}{}'.format(base, from_suffix)),
             os.path.join(work_dir, '{}{}'.format(base, to_suffix))])

    # Load all converted files together so the loader merges them
    converted_names = ['{}_{}'.format(name_prefix, file_num)
                       for file_num in range(file_count)]
    merged_set = _load_featureset(work_dir, converted_names, to_suffix,
                                  quiet=True)

    # Load the pre-merged reference in the target format
    reference_set = _load_featureset(work_dir,
                                     ['{}_all'.format(name_prefix)],
                                     to_suffix,
                                     quiet=True)

    # IDs, classes and the dense feature matrices must all match
    assert np.all(merged_set.ids == reference_set.ids)
    assert np.all(merged_set.classes == reference_set.classes)
    assert np.all(merged_set.features.todense() ==
                  reference_set.features.todense())

    # Both vectorizers must expose the same names and vocabulary
    got_vec = merged_set.feat_vectorizer
    want_vec = reference_set.feat_vectorizer
    eq_(got_vec.feature_names_, want_vec.feature_names_)
    eq_(got_vec.vocabulary_, want_vec.vocabulary_)
def check_load_featureset(suffix, numeric_ids):
    """
    Check that merging separate feature files matches the pre-merged file.

    :param suffix: file extension of the feature files to load
    :param numeric_ids: forwarded unchanged to ``make_merging_data``
    """
    n_files = 5

    # Generate the fixture files
    make_merging_data(n_files, suffix, numeric_ids)

    merge_dir = join(_my_dir, 'train', 'test_merging')

    # Loading the split files (named '0' .. '4') together merges them
    split_names = ['{}'.format(idx) for idx in range(n_files)]
    merged = _load_featureset(merge_dir, split_names, suffix, quiet=True)

    # The reference: a single file that already holds everything
    premerged = _load_featureset(merge_dir, ['all'], suffix, quiet=True)

    assert_array_equal(merged.ids, premerged.ids)
    assert_array_equal(merged.labels, premerged.labels)

    # Compare the feature part of every example pairwise
    for (_, _, feats_a), (_, _, feats_b) in zip(merged, premerged):
        eq_(feats_a, feats_b)

    # Feature names are compared order-insensitively
    eq_(sorted(merged.vectorizer.feature_names_),
        sorted(premerged.vectorizer.feature_names_))
def test_cross_validate_task():
    """
    Run a cross_validate experiment, check its final score and verify that
    the saved fold ids match a freshly computed stratified 10-fold split.
    """

    def _flatten(mapping):
        # Key-sorted 'keyvalue' string, used for a quick dict comparison
        return ''.join('{}{}'.format(key, val)
                       for key, val in sorted(mapping.items()))

    # Run experiment
    suffix = '.jsonlines'
    train_path = join(_my_dir, 'train', 'f0{}'.format(suffix))
    template_path = join(_my_dir, "configs",
                         "test_save_cv_folds.template.cfg")
    config_path = fill_in_config_paths_for_single_file(template_path,
                                                       train_path,
                                                       None)
    run_configuration(config_path, quiet=True)

    # Check final average results (entry 10 of the results list)
    results_path = join(_my_dir, 'output',
                        'test_save_cv_folds_train_f0.'
                        'jsonlines_LogisticRegression.results.json')
    with open(results_path) as results_file:
        result_dict = json.load(results_file)[10]
    assert_almost_equal(result_dict['score'], 0.517)

    # Recompute which fold each example id should be tested in
    examples = _load_featureset(train_path, '', suffix, quiet=True)
    kfold = StratifiedKFold(n_splits=10)
    splits = kfold.split(examples.features, examples.labels)
    expected_skll_ids = {examples.ids[index]: fold_num
                         for fold_num, (_, test_indices) in enumerate(splits)
                         for index in test_indices}

    # Read back the fold ids that the experiment saved
    fold_ids_path = join(_my_dir, 'output',
                         'test_save_cv_folds_skll_fold_ids.csv')
    with open(fold_ids_path) as fold_ids_file:
        skll_fold_ids = {row['id']: row['cv_test_fold']
                         for row in csv.DictReader(fold_ids_file)}

    assert_equal(_flatten(skll_fold_ids), _flatten(expected_skll_ids))
def test_cross_validate_task():
    """
    Run a cross_validate experiment, check its final score and verify that
    the saved fold ids match a freshly computed stratified 10-fold split.
    """

    def _flatten(mapping):
        # Key-sorted 'keyvalue' string, used for a quick dict comparison
        return ''.join('{}{}'.format(key, val)
                       for key, val in sorted(mapping.items()))

    # Run experiment
    suffix = '.jsonlines'
    train_path = join(_my_dir, 'train', 'f0{}'.format(suffix))
    template_path = join(_my_dir, "configs",
                         "test_save_cv_folds.template.cfg")
    config_path = fill_in_config_paths_for_single_file(template_path,
                                                       train_path,
                                                       None)
    run_configuration(config_path, quiet=True)

    # Check final average results (entry 10 of the results list)
    results_path = join(_my_dir, 'output',
                        'test_save_cv_folds_train_f0.'
                        'jsonlines_LogisticRegression.results.json')
    with open(results_path) as results_file:
        result_dict = json.load(results_file)[10]
    assert_almost_equal(result_dict['score'], 0.517)

    # Recompute which fold each example id should be tested in.
    # NOTE(review): this is the constructor-takes-labels StratifiedKFold
    # form, iterated directly - confirm it matches the imported class.
    examples = _load_featureset(train_path, '', suffix, quiet=True)
    kfold = StratifiedKFold(examples.labels, n_folds=10)
    expected_skll_ids = {examples.ids[index]: fold_num
                         for fold_num, (_, test_indices) in enumerate(kfold)
                         for index in test_indices}

    # Read back the fold ids that the experiment saved
    fold_ids_path = join(_my_dir, 'output',
                         'test_save_cv_folds_skll_fold_ids.csv')
    with open(fold_ids_path) as fold_ids_file:
        skll_fold_ids = {row['id']: row['cv_test_fold']
                         for row in csv.DictReader(fold_ids_file)}

    assert_equal(_flatten(skll_fold_ids), _flatten(expected_skll_ids))
def check_convert_featureset(from_suffix, to_suffix, with_labels=True):
    """
    Convert split feature files between formats, optionally without labels,
    and compare the merged result with a pre-merged file.

    :param from_suffix: extension of the input files (leading dot included)
    :param to_suffix: extension of the converted files (leading dot included)
    :param with_labels: when false, the '_unlabeled' file variants are used,
                        '--no_labels' is passed to the converter and the
                        featuresets are loaded with ``label_col=None``
    """
    n_files = 5

    # Generate the fixture files
    make_conversion_data(n_files, from_suffix, to_suffix,
                         with_labels=with_labels)

    conv_dir = join(_my_dir, 'train', 'test_conversion')

    # e.g. 'jsonlines_to_csv' for '.jsonlines' -> '.csv'
    prefix = '{}_to_{}'.format(from_suffix.lstrip('.'),
                               to_suffix.lstrip('.'))

    # Everything that depends on whether labels are in play
    if with_labels:
        name_part = ''
        label_col = 'y'
        extra_flags = []
    else:
        name_part = '_unlabeled'
        label_col = None
        extra_flags = ['--no_labels']

    # Base names (no extension) of the individual, unmerged files
    stems = ['{}_{}{}'.format(prefix, idx, name_part)
             for idx in range(n_files)]

    # Convert each unmerged file from the source to the target format
    for stem in stems:
        src_path = join(conv_dir, '{}{}'.format(stem, from_suffix))
        dst_path = join(conv_dir, '{}{}'.format(stem, to_suffix))
        skll_convert.main(['--quiet', src_path, dst_path] + extra_flags)

    # Load the converted pieces together, then the pre-merged reference
    merged = _load_featureset(conv_dir, stems, to_suffix,
                              label_col=label_col, quiet=True)
    premerged = _load_featureset(conv_dir,
                                 ['{}{}_all'.format(prefix, name_part)],
                                 to_suffix,
                                 label_col=label_col,
                                 quiet=True)

    # Converted + merged data must equal the pre-generated merged data
    assert_array_equal(merged.ids, premerged.ids)
    assert_array_equal(merged.labels, premerged.labels)
    for (_, _, feats_a), (_, _, feats_b) in zip(merged, premerged):
        eq_(feats_a, feats_b)

    # Feature names are compared order-insensitively
    eq_(sorted(merged.vectorizer.feature_names_),
        sorted(premerged.vectorizer.feature_names_))
def check_convert_featureset(from_suffix, to_suffix, with_labels=True):
    """
    Check that converting and then merging feature files, with or without
    labels, reproduces the pre-merged file in the target format.

    :param from_suffix: extension of the input files (leading dot included)
    :param to_suffix: extension of the converted files (leading dot included)
    :param with_labels: when false, the '_unlabeled' file variants are used,
                        '--no_labels' is passed to the converter and the
                        featuresets are loaded with ``label_col=None``
    """
    file_count = 5

    # Generate the fixture files
    make_conversion_data(file_count, from_suffix, to_suffix,
                         with_labels=with_labels)

    work_dir = join(_my_dir, 'train', 'test_conversion')

    # Shared name prefix, built from both suffixes without their dots
    name_prefix = '{}_to_{}'.format(from_suffix.lstrip('.'),
                                    to_suffix.lstrip('.'))

    # File names carry '_unlabeled' when labels are left out
    unlabeled_tag = '_unlabeled' if not with_labels else ''

    # Run the converter once per unmerged feature file
    for file_num in range(file_count):
        base = '{}_{}{}'.format(name_prefix, file_num, unlabeled_tag)
        converter_args = ['--quiet',
                          join(work_dir, '{}{}'.format(base, from_suffix)),
                          join(work_dir, '{}{}'.format(base, to_suffix))]
        if not with_labels:
            converter_args += ['--no_labels']
        skll_convert.main(converter_args)

    # Load all converted files together so the loader merges them
    label_col = None if not with_labels else 'y'
    converted_names = ['{}_{}{}'.format(name_prefix, file_num, unlabeled_tag)
                       for file_num in range(file_count)]
    merged_set = _load_featureset(work_dir, converted_names, to_suffix,
                                  label_col=label_col, quiet=True)

    # Load the pre-merged reference in the target format
    reference_name = '{}{}_all'.format(name_prefix, unlabeled_tag)
    reference_set = _load_featureset(work_dir, [reference_name], to_suffix,
                                     label_col=label_col, quiet=True)

    # IDs, labels and per-example features must all match
    assert_array_equal(merged_set.ids, reference_set.ids)
    assert_array_equal(merged_set.labels, reference_set.labels)
    paired = zip(merged_set, reference_set)
    for (_, _, got_feats), (_, _, want_feats) in paired:
        eq_(got_feats, want_feats)

    # Feature names are compared order-insensitively
    got_names = sorted(merged_set.vectorizer.feature_names_)
    want_names = sorted(reference_set.vectorizer.feature_names_)
    eq_(got_names, want_names)