示例#1
0
 def _yaml_config_run(self, predict_yaml_config_path,
                      tmp_output_dir, model_meta_file):
     """Run hvc.predict with a rewritten .yaml config, then check the output.

     Helper with assertions shared by the tests that run hvc.predict
     with config.yml files. The config is passed through ``rewrite_config``
     together with replacement entries for ``model_meta_file`` and
     ``output_dir``. The one new ``features_created*`` file found under a
     ``predict_output*`` directory of ``tmp_output_dir`` is loaded with
     joblib and handed to ``self._generic_predict_asserts``.

     Raises
     ------
     ValueError
         if the run did not add exactly one new output file.
     """
     output_root = str(tmp_output_dir)
     substitutions = {
         'model_meta_file': ('replace with model_meta_file', model_meta_file),
         'output_dir': ('replace with tmp_output_dir', output_root),
     }
     predict_config_rewritten = rewrite_config(predict_yaml_config_path,
                                               tmp_output_dir,
                                               replace_dict=substitutions)

     output_pattern = os.path.join(output_root,
                                   'predict_output*',
                                   'features_created*')
     # snapshot the matches before and after the run, so that only
     # files that appeared during this call are considered
     found_before = glob(output_pattern)
     hvc.predict(predict_config_rewritten)
     found_after = glob(output_pattern)
     new_outputs = [path for path in found_after
                    if path not in found_before]

     # should only be one new output file
     if len(new_outputs) != 1:
         raise ValueError('found wrong number of predict outputs after '
                          'running .yaml config {}.\n'
                          'This was the output found: {}'
                          .format(predict_config_rewritten, new_outputs))

     predict = joblib.load(new_outputs[0])
     self._generic_predict_asserts(predict)
示例#2
0
 def test_predict_flatwindow_data_dirs(self, tmp_output_dir, test_data_dir):
     """Test predict with the 'flatwindow.meta' model file, using data dirs."""
     relative_dirs = ('cbins/gy6or6/032312', 'cbins/gy6or6/032412')
     data_dirs = [os.path.join(test_data_dir, os.path.normpath(relative_dir))
                  for relative_dir in relative_dirs]
     model_meta_file = os.path.join(test_data_dir,
                                    'model_files',
                                    'flatwindow.meta')
     # segment is passed explicitly as None because we want to test
     # the default behavior that happens when we supply an argument
     # for the data_dirs parameter **and** segment is None
     # (as it should be by default)
     predict = hvc.predict(data_dirs=data_dirs,
                           file_format='cbin',
                           model_meta_file=model_meta_file,
                           segment=None,
                           output_dir=str(tmp_output_dir),
                           predict_proba=False,
                           return_predictions=True)
     self._generic_predict_asserts(predict)
示例#3
0
 def test_data_dirs_cbins(self, tmp_output_dir, test_data_dir):
     """Test that calling predict doesn't fail when we pass a
     data_dirs list whose directories contain cbin audio files.

     ``tmp_output_dir`` is passed to predict as ``output_dir``.
     """
     relative_dirs = ('cbins/gy6or6/032312', 'cbins/gy6or6/032412')
     data_dirs = [os.path.join(test_data_dir, os.path.normpath(relative_dir))
                  for relative_dir in relative_dirs]
     file_format = 'cbin'
     # NOTE(review): an empty string is passed as the model meta file here,
     # whereas the other data-dir tests point at a file under 'model_files'
     # -- confirm this is intended
     model_meta_file = ''
     # explicitly set segment to None because we want to test
     # that default behavior works that happens when
     # we supply argument for data_dirs parameter, **and**
     # segment is set to None (as it should be by default)
     segment = None
     predict_proba = False
     # NOTE(review): 'cbin' is the input audio format in this test;
     # confirm it is also a valid value for convert_to
     convert_to = 'cbin'  # to check that this works
     return_predictions = True
     predict = hvc.predict(data_dirs=data_dirs,
                           file_format=file_format,
                           model_meta_file=model_meta_file,
                           segment=segment,
                           output_dir=str(tmp_output_dir),
                           predict_proba=predict_proba,
                           convert_to=convert_to,
                           return_predictions=return_predictions)
     assert isinstance(predict, dict)
     expected_keys = ('labels', 'pred_labels', 'songfile_IDs',
                      'onsets_Hz', 'offsets_Hz', 'features')
     for key in expected_keys:
         assert key in predict
示例#4
0
 def test_predict_knn_data_dirs_notmat(self, tmp_output_dir, test_data_dir):
     """Test predict with the 'knn.meta' model file, using data dirs,
     and converting output to notmat files.

     Asserts that predict returns a dict containing the expected keys.
     """
     relative_dirs = ('cbins/gy6or6/032312', 'cbins/gy6or6/032412')
     data_dirs = [os.path.join(test_data_dir, os.path.normpath(relative_dir))
                  for relative_dir in relative_dirs]
     file_format = 'cbin'
     model_meta_file = os.path.join(test_data_dir,
                                    'model_files',
                                    'knn.meta')
     # explicitly set segment to None because we want to test
     # that default behavior works that happens when
     # we supply argument for data_dirs parameter, **and**
     # segment is set to None (as it should be by default)
     segment = None
     predict_proba = False
     convert_to = 'notmat'  # to check that this works
     return_predictions = True
     predict = hvc.predict(data_dirs=data_dirs,
                           file_format=file_format,
                           model_meta_file=model_meta_file,
                           segment=segment,
                           output_dir=str(tmp_output_dir),
                           predict_proba=predict_proba,
                           convert_to=convert_to,
                           return_predictions=return_predictions)
     assert isinstance(predict, dict)
     expected_keys = ('labels', 'pred_labels', 'songfile_IDs',
                      'onsets_Hz', 'offsets_Hz', 'features')
     for key in expected_keys:
         assert key in predict
示例#5
0
def _newest_output_dir(parent_dir, pattern):
    """Return the most recently modified directory in parent_dir matching pattern."""
    candidates = [path
                  for path in glob(os.path.join(str(parent_dir), pattern))
                  if os.path.isdir(path)]
    assert candidates, (
        'no directory matching {!r} found in {}'.format(pattern, parent_dir)
    )
    candidates.sort(key=os.path.getmtime)
    return candidates[-1]  # [-1] is newest dir, after sort


def run_main_workflow(tmp_output_dir, script_tuple_dict, configs_path):
    """Test the main workflow for hybrid-vocal-classifier:
    extract, then select and predict for each pair of configs.

    Parameters
    ----------
    tmp_output_dir : path-like
        directory passed to ``rewrite_config`` and searched for the
        ``*extract*``, ``*select*`` and ``*predict*`` output directories.
    script_tuple_dict : dict
        with key 'extract', the extract config filename, and key
        'select and predict', an iterable of
        (select config filename, predict config filename) pairs.
    configs_path : str
        directory that the config filenames are joined onto.
    """
    output_dir_replacement = ('replace with tmp_output_dir',
                              str(tmp_output_dir))

    extract_config_filename = os.path.join(configs_path,
                                           script_tuple_dict['extract'])
    # have to put tmp_output_dir into yaml file
    extract_config_rewritten = rewrite_config(
        extract_config_filename,
        tmp_output_dir,
        {'output_dir': output_dir_replacement})
    hvc.extract(extract_config_rewritten)
    extract_output_dir = _newest_output_dir(tmp_output_dir, '*extract*')
    assert check_extract_output(extract_output_dir)

    feature_file = glob(os.path.join(extract_output_dir, 'features_created*'))
    feature_file = feature_file[0]  # because glob returns list

    os.remove(extract_config_rewritten)

    # NOTE(review): this inspects the *extract* config filename, although
    # it guards assertions about predict output -- confirm that the
    # predict config filename was not intended. Behavior kept as found.
    expect_pred_probs = 'predict_proba_True' in extract_config_filename

    for (select_config_filename,
         predict_config_filename) in script_tuple_dict['select and predict']:
        # --- select ---
        select_config_filename = os.path.join(configs_path,
                                              select_config_filename)
        select_config_rewritten = rewrite_config(
            select_config_filename,
            tmp_output_dir,
            replace_dict={
                'feature_file': ('replace with feature_file', feature_file),
                'output_dir': output_dir_replacement,
            })
        hvc.select(select_config_rewritten)
        select_output_dir = _newest_output_dir(tmp_output_dir, '*select*')
        assert check_select_output(select_config_rewritten, select_output_dir)
        os.remove(select_config_rewritten)

        # --- predict, using a model meta file that select just produced ---
        model_meta_files = glob(os.path.join(select_output_dir, '*', '*meta*'))
        predict_replace_dict = {
            'model_meta_file':
            ('replace with model_file', model_meta_files[-1]),
            'output_dir': output_dir_replacement,
        }
        predict_config_filename_with_path = os.path.join(
            configs_path, predict_config_filename)
        predict_config_rewritten = rewrite_config(
            predict_config_filename_with_path,
            tmp_output_dir,
            predict_replace_dict)
        hvc.predict(predict_config_rewritten)
        os.remove(predict_config_rewritten)

        predict_output_dir = _newest_output_dir(tmp_output_dir, '*predict*')
        feature_files = glob(os.path.join(predict_output_dir, 'feature*'))
        for ftr_filename in feature_files:
            ftr_file = joblib.load(ftr_filename)
            assert 'pred_labels' in ftr_file
            if expect_pred_probs:
                assert 'pred_probs' in ftr_file
                assert ftr_file['pred_labels'].shape[0] == ftr_file[
                    'pred_probs'].shape[0]
示例#6
0
from glob import glob
import hvc

# Example of the full autolabel workflow: fetch labeled data, extract
# features, pick hyperparameters, select a model, and predict labels.

# 0. create training data
# In this case, we download already labeled data from an open repository.
# String in quotes matches with the name of one of the folders in the repository.
hvc.utils.fetch('gy6or6.032612')

# 1. pick a model and 2. extract features for that model
# Model and features are defined in extract.config.yml file.
# NOTE(review): this filename follows a different pattern
# ('..._example.knn.extract...') than the select and predict configs below
# ('....example.select.knn...') -- confirm the names against the files on disk.
hvc.extract('gy6or6_autolabel_example.knn.extract.config.yml')

# 3. pick hyperparameters for model
# Load summary feature file to use with helper functions for
# finding best hyperparameters.
# glob returns a list of matching paths, so pick a single path out of it
# before loading.
summary_files = glob('./extract_output*/summary*')
if not summary_files:
    raise FileNotFoundError(
        "no summary file found matching './extract_output*/summary*'"
    )
summary_file = summary_files[0]
summary_data = hvc.load_feature_file(summary_file)
# In this case, we picked a k-nearest neighbors model
# and we want to find what value of k will give us the highest accuracy
cv_scores, best_k = hvc.utils.find_best_k(summary_data['features'],
                                          summary_data['labels'],
                                          k_range=range(1, 11))

# 4. Fit the **model** to the data and 5. Select the **best** model
hvc.select('gy6or6_autolabel.example.select.knn.config.yml')

# 6. **Predict** labels for unlabeled data using the fit model.
hvc.predict('gy6or6_autolabel.example.predict.knn.config.yml')