def test_safe_float_conversion():
    for input_val, expected_val in zip(['1.234', 1.234, '3.0', '3', 3, 'foo'],
                                       [1.234, 1.234, 3.0, 3, 3, 'foo']):
        yield check_safe_float_conversion, safe_float(input_val), expected_val

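# The checker yielded by the generator test above is not defined in this
# excerpt. A minimal sketch of what it might look like is given below; the
# name and signature are assumptions inferred from how it is used, and the
# real checker may differ (e.g. it may use nose assertion helpers instead).
def check_safe_float_conversion(conversion, expected):
    """Hypothetical checker: assert a single safe_float conversion is correct."""
    # the converted value should match the expected value ...
    assert conversion == expected
    # ... and (assumption) keep the expected type, e.g. int vs. float vs. str
    assert type(conversion) is type(expected)
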
def parse_config_file(config_path, log_level=logging.INFO):
    """
    Parses a SKLL experiment configuration file with the given path.
    Messages are logged at the given log level (default: INFO).

    Parameters
    ----------
    config_path : str
        The path to the configuration file.
    log_level : logging level, optional
        The logging level to use.
        Defaults to ``logging.INFO``.

    Returns
    -------
    experiment_name : str
        A string used to identify this particular experiment configuration.
        When generating result summary files, this name helps prevent
        overwriting previous summaries.
    task : str
        The type of experiment we're trying to run (e.g. 'cross_validate').
    sampler : str
        The name of a sampler to perform non-linear transformations of the
        input.
    fixed_sampler_parameters : dict
        A dictionary containing parameters you want to have fixed for the
        sampler.
    feature_hasher : bool
        If True, this enables a high-speed, low-memory vectorizer that uses
        feature hashing for converting feature dictionaries into NumPy arrays
        instead of using a DictVectorizer.
    hasher_features : int
        The number of features used by the FeatureHasher if the
        feature_hasher flag is enabled.
    id_col : str
        The column with IDs.
    label_col : str
        The column with labels.
    train_set_name : str
        The name of the training set.
    test_set_name : str
        The name of the test set.
    suffix : str
        The file format the training/test files are in.
    featuresets : list of str
        A list of lists of prefixes for the files containing the features
        you would like to train/test on.
    do_shuffle : bool
        Whether to shuffle the data.
    model_path : str
        The path to the model file(s).
    do_grid_search : bool
        Whether to perform grid search.
    grid_objectives : list of str
        A list of scoring functions to use for tuning.
    probability : bool
        Whether to output probabilities for each class.
    pipeline : bool
        Whether to include the `pipeline` attribute in the trained model.
        This will increase the size of the model file.
    results_path : str
        Path to store result files in.
    pos_label_str : str
        The string label for the positive class in the binary classification
        setting.
    feature_scaling : str
        How to scale features (e.g. 'with_mean').
    min_feature_count : int
        The minimum number of examples for which the value of a feature must
        be nonzero to be included in the model.
    folds_file : str
        The path to the folds_file, if specified.
    grid_search_jobs : int
        Number of folds to run in parallel when using grid search.
    grid_search_folds : int
        The number of folds to use for grid search.
    cv_folds : dict or int
        The specified folds mapping, or the number of folds.
    save_cv_folds : bool
        Whether to save CV folds to file.
    save_cv_models : bool
        Whether to save CV models.
    use_folds_file_for_grid_search : bool
        Whether to use the folds file for grid search.
    do_stratified_folds : bool
        Whether to use stratified folds for cross-validation.
    fixed_parameter_list : list of dict
        List of dicts containing parameters you want to have fixed for each
        classifier in the learners list.
    param_grid_list : list of dict
        List of parameter grids to search, one dict for each learner.
    featureset_names : list of str
        The names of the featuresets used for each job.
    learners : list of str
        A list of learners to try using.
    prediction_dir : str
        The directory where predictions are saved.
    log_path : str
        The path to the log file.
    train_path : str
        The path to a file containing features to train on.
    test_path : str
        The path to a file containing features to test on.
    ids_to_floats : bool
        Whether to convert IDs to floats.
    class_map : dict
        A class map collapsing several labels into one.
    custom_learner_path : str
        Path to a .py file that defines a custom learner.
    custom_metric_path : str
        Path to a .py file that defines a custom metric.
    learning_curve_cv_folds_list : list of int
        A list of integers specifying the number of folds to use for CV.
    learning_curve_train_sizes : list of float or list of int
        List of floats or integers representing relative or absolute numbers
        of training examples that will be used to generate the learning
        curve respectively.
    output_metrics : list
        A list of output metrics to use.

    Raises
    ------
    IOError
        If the configuration file name is empty.
    ValueError
        If various configuration parameters are incorrectly specified, or
        cause conflicts.
    """

    # check that config_path is not empty
    if config_path == "":
        raise IOError("The name of the configuration file is empty")

    # compute the absolute path for the config file
    config_path = realpath(config_path)
    config_dir = dirname(config_path)

    # set up a config parser with the above default values
    config = _setup_config_parser(config_path)

    # extract parameters from the various sections in the config file

    ######################
    # 1. General section #
    ######################
    if config.has_option("General", "experiment_name"):
        experiment_name = config.get("General", "experiment_name")
    else:
        raise ValueError("Configuration file does not contain experiment_name "
                         "in the [General] section.")

    # next, get the log path before anything else since we need to
    # save all logging messages to a log file in addition to displaying
    # them on the console
    try:
        log_path = locate_file(config.get("Output", "log"), config_dir)
    except IOError as e:
        if e.errno == errno.ENOENT:
            log_path = e.filename
            os.makedirs(log_path)

    # Create a top-level log file under the log path
    main_log_file = join(log_path, '{}.log'.format(experiment_name))

    # Now create a SKLL logger that will log to this file as well
    # as to the console. Use the log level provided - note that
    # we only have to do this the first time we call `get_skll_logger()`
    # with a given name.
    logger = get_skll_logger('experiment',
                             filepath=main_log_file,
                             log_level=log_level)

    if config.has_option("General", "task"):
        task = config.get("General", "task")
    else:
        raise ValueError("Configuration file does not contain task in the "
                         "[General] section.")
    if task not in VALID_TASKS:
        raise ValueError('An invalid task was specified: {}. Valid tasks are:'
                         ' {}'.format(task, ', '.join(VALID_TASKS)))

    ####################
    # 2. Input section #
    ####################
    sampler = config.get("Input", "sampler")
    if sampler not in VALID_SAMPLERS:
        raise ValueError('An invalid sampler was specified: {}. Valid '
                         'samplers are: {}'.format(sampler,
                                                   ', '.join(VALID_SAMPLERS)))

    # raise an error if feature_hasher is set but hasher_features
    # is less than or equal to zero
    feature_hasher = config.getboolean("Input", "feature_hasher")
    hasher_features = config.getint("Input", "hasher_features")
    if feature_hasher:
        if hasher_features <= 0:
            raise ValueError(
                "Configuration file must specify a non-zero value "
                "for the option hasher_features when "
                "feature_hasher is True.")
    # produce warnings if hasher_features is set but feature_hasher
    # is not set correctly
    elif hasher_features > 0:
        logger.warning(
            "Ignoring hasher_features since feature_hasher is either"
            " missing or set to False.")

    if config.has_option("Input", "learners"):
        learners_string = config.get("Input", "learners")
    else:
        raise ValueError(
            "Configuration file does not contain list of learners "
            "in [Input] section.")
    learners = yaml.safe_load(fix_json(learners_string))

    if len(learners) == 0:
        raise ValueError(
            "Configuration file contains an empty list of learners"
            " in the [Input] section.")
    elif len(set(learners)) < len(learners):
        raise ValueError(
            'Configuration file contains the same learner multiple'
            ' times, which is not currently supported. Please use'
            ' param_grids with tuning to find the optimal settings'
            ' for the learner.')

    custom_learner_path = locate_file(
        config.get("Input", "custom_learner_path"), config_dir)

    # get the custom metric path, if specified, and locate it
    custom_metric_path = locate_file(config.get("Input", "custom_metric_path"),
                                     config_dir)

    # get the featuresets
    featuresets_string = config.get("Input", "featuresets")
    featuresets = yaml.safe_load(fix_json(featuresets_string))

    # ensure that featuresets is either a list of features or a list of lists
    # of features
    if not isinstance(featuresets, list) or not all(
            isinstance(fs, list) for fs in featuresets):
        raise ValueError("The featuresets parameter should be a list of "
                         "features or a list of lists of features. You "
                         "specified: {}".format(featuresets))

    featureset_names = yaml.safe_load(
        fix_json(config.get("Input", "featureset_names")))

    # ensure that featureset_names is a list of strings, if specified
    if featureset_names:
        if (not isinstance(featureset_names, list) or
                not all([isinstance(fs, str) for fs in featureset_names])):
            raise ValueError(
                "The featureset_names parameter should be a list "
                "of strings. You specified: {}".format(featureset_names))

    # get the value for learning_curve_cv_folds and ensure
    # that it's a list of the same length as the value of
    # learners. If it's not specified, then we just assume
    # that we are using 10 folds for each learner.
    learning_curve_cv_folds_list_string = config.get(
        "Input", "learning_curve_cv_folds_list")
    learning_curve_cv_folds_list = yaml.safe_load(
        fix_json(learning_curve_cv_folds_list_string))
    if len(learning_curve_cv_folds_list) == 0:
        learning_curve_cv_folds_list = [10] * len(learners)
    else:
        if (not isinstance(learning_curve_cv_folds_list, list) or
                not all([isinstance(fold, int)
                         for fold in learning_curve_cv_folds_list]) or
                not len(learning_curve_cv_folds_list) == len(learners)):
            raise ValueError(
                "The learning_curve_cv_folds parameter should "
                "be a list of integers of the same length as "
                "the number of learners. You specified: {}".format(
                    learning_curve_cv_folds_list))

    # get the value for learning_curve_train_sizes and ensure
    # that it's a list of either integers (sizes) or
    # floats (proportions). If it's not specified, then we just
    # assume that we are using np.linspace(0.1, 1.0, 5).
    learning_curve_train_sizes_string = config.get(
        "Input", "learning_curve_train_sizes")
    learning_curve_train_sizes = yaml.safe_load(
        fix_json(learning_curve_train_sizes_string))
    if len(learning_curve_train_sizes) == 0:
        learning_curve_train_sizes = np.linspace(0.1, 1.0, 5).tolist()
    else:
        if (not isinstance(learning_curve_train_sizes, list) or
                not all([isinstance(size, int) or isinstance(size, float)
                         for size in learning_curve_train_sizes])):
            raise ValueError(
                "The learning_curve_train_sizes parameter should "
                "be a list of integers or floats. You specified: {}".format(
                    learning_curve_train_sizes))

    # do we need to shuffle the training data
    do_shuffle = config.getboolean("Input", "shuffle")

    fixed_parameter_list = yaml.safe_load(
        fix_json(config.get("Input", "fixed_parameters")))
    fixed_sampler_parameters = fix_json(
        config.get("Input", "sampler_parameters"))
    fixed_sampler_parameters = yaml.safe_load(fixed_sampler_parameters)
    param_grid_list = yaml.safe_load(
        fix_json(config.get("Tuning", "param_grids")))

    # read and normalize the value of `pos_label_str`
    pos_label_str = safe_float(config.get("Tuning", "pos_label_str"))
    if pos_label_str == '':
        pos_label_str = None

    # ensure that feature_scaling is specified only as one of the
    # four available choices
    feature_scaling = config.get("Input", "feature_scaling")
    if feature_scaling not in VALID_FEATURE_SCALING_OPTIONS:
        raise ValueError(
            "Invalid value for feature_scaling parameter: {}".format(
                feature_scaling))

    suffix = config.get("Input", "suffix")
    label_col = config.get("Input", "label_col")
    id_col = config.get("Input", "id_col")
    ids_to_floats = config.getboolean("Input", "ids_to_floats")

    # if an external folds file is specified, then read it into a dictionary
    folds_file = locate_file(config.get("Input", "folds_file"), config_dir)
    num_cv_folds = config.getint("Input", "num_cv_folds")
    specified_folds_mapping = None
    specified_num_folds = None
    if folds_file:
        specified_folds_mapping = load_cv_folds(folds_file,
                                                ids_to_floats=ids_to_floats)
    else:
        # if no file is specified, then set the number of folds for
        # cross-validation
        specified_num_folds = num_cv_folds if num_cv_folds else 10

    # whether or not to save the cv fold ids/models
    save_cv_folds = config.getboolean("Output", "save_cv_folds")
    save_cv_models = config.getboolean("Output", "save_cv_models")

    # whether or not to do stratified cross validation
    random_folds = config.getboolean("Input", "random_folds")
    if random_folds:
        if folds_file:
            logger.warning('Specifying "folds_file" overrides "random_folds".')
        do_stratified_folds = False
    else:
        do_stratified_folds = True

    # get all the input paths and directories (without trailing slashes)
    train_path = config.get("Input", "train_directory").rstrip(os.sep)
    test_path = config.get("Input", "test_directory").rstrip(os.sep)
    train_file = config.get("Input", "train_file")
    test_file = config.get("Input", "test_file")

    # make sure that featuresets is not an empty list unless
    # train_file and test_file are specified
    if not train_file and not test_file and (isinstance(featuresets, list) and
                                             len(featuresets) == 0):
        raise ValueError(
            "The 'featuresets' parameters cannot be an empty list.")

    # Either train_file or train_path must be specified.
    if not train_file and not train_path:
        raise ValueError('Invalid [Input] parameters: either "train_file" or '
                         '"train_directory" must be specified in the '
                         'configuration file.')

    # The user must specify either train_file or train_path, but not both.
    if train_file and train_path:
        raise ValueError('Invalid [Input] parameters: only either "train_file"'
                         ' or "train_directory" can be specified in the '
                         'configuration file, not both.')

    # Cannot specify both test_file and test_path
    if test_file and test_path:
        raise ValueError('Invalid [Input] parameters: only either "test_file" '
                         'or "test_directory" can be specified in the '
                         'configuration file, not both.')

    # if train_file is specified, then assign its value to train_path
    # this is a workaround to make this simple use case (a single train and
    # test file) compatible with the existing architecture using
    # featuresets
    if train_file:
        train_path = train_file
        featuresets = [['train_{}'.format(basename(train_file))]]
        suffix = ''

    # if test_file is specified, then assign its value to test_path to
    # enable compatibility with the pre-existing featuresets architecture
    if test_file:
        test_path = test_file
        featuresets[0][0] += '_test_{}'.format(basename(test_file))

    # make sure all the specified paths/files exist
    train_path = locate_file(train_path, config_dir)
    test_path = locate_file(test_path, config_dir)

    # Get class mapping dictionary if specified
    class_map_string = config.get("Input", "class_map")
    original_class_map = yaml.safe_load(fix_json(class_map_string))
    if original_class_map:
        # Change class_map to map from originals to replacements instead of
        # from replacement to list of originals
        class_map = {}
        for replacement, original_list in original_class_map.items():
            for original in original_list:
                class_map[original] = replacement
        del original_class_map
    else:
        class_map = None

    #####################
    # 3. Output section #
    #####################
    probability = config.getboolean("Output", "probability")
    pipeline = config.getboolean("Output", "pipeline")

    # do we want to keep the predictions?
    # make sure the predictions path exists and if not create it
    try:
        prediction_dir = locate_file(config.get("Output", "predictions"),
                                     config_dir)
    except IOError as e:
        if e.errno == errno.ENOENT:
            prediction_dir = e.filename
            os.makedirs(prediction_dir)

    # make sure model path exists and if not, create it
    try:
        model_path = locate_file(config.get("Output", "models"), config_dir)
    except IOError as e:
        if e.errno == errno.ENOENT:
            model_path = e.filename
            os.makedirs(model_path)

    # make sure results path exists
    try:
        results_path = locate_file(config.get("Output", "results"), config_dir)
    except IOError as e:
        if e.errno == errno.ENOENT:
            results_path = e.filename
            os.makedirs(results_path)

    # what are the output metrics?
    output_metrics = config.get("Output", "metrics")
    output_metrics = _parse_and_validate_metrics(output_metrics,
                                                 'metrics',
                                                 logger=logger)

    #####################
    # 4. Tuning section #
    #####################

    # do we need to run a grid search for the hyperparameters or are we just
    # using the defaults?
    do_grid_search = config.getboolean("Tuning", "grid_search")

    # parse any provided grid objective functions
    grid_objectives = config.get("Tuning", "objectives")
    grid_objectives = _parse_and_validate_metrics(grid_objectives,
                                                  'objectives',
                                                  logger=logger)

    # if we are doing learning curves, we don't care about grid search
    if task == 'learning_curve' and do_grid_search:
        do_grid_search = False
        logger.warning("Grid search is not supported during "
                       "learning curve generation. Disabling.")
Disabling.") # Check if `param_grids` is specified, but `do_grid_search` is False if param_grid_list and not do_grid_search: logger.warning('Since "grid_search" is set to False, the specified' ' "param_grids" will be ignored.') # Warn user about potential conflicts between parameter values # specified in `fixed_parameter_list` and values specified in # `param_grid_list` (or values passed in by default) if # `do_grid_search` is True if do_grid_search and fixed_parameter_list: logger.warning('Note that "grid_search" is set to True and ' '"fixed_parameters" is also specified. If there ' 'is a conflict between the grid search parameter' ' space and the fixed parameter values, the ' 'fixed parameter values will take precedence.') # minimum number of examples a feature must be nonzero in to be included min_feature_count = config.getint("Tuning", "min_feature_count") # if an external folds file was specified do we use the same folds file # for the inner grid-search in cross-validate as well? use_folds_file_for_grid_search = config.getboolean( "Tuning", "use_folds_file_for_grid_search") # how many jobs should we run in parallel for grid search grid_search_jobs = config.getint("Tuning", "grid_search_jobs") if not grid_search_jobs: grid_search_jobs = None # how many folds should we run in parallel for grid search grid_search_folds = config.getint("Tuning", "grid_search_folds") # check whether the right things are set for the given task if (task == 'evaluate' or task == 'predict') and not test_path: raise ValueError('The test set must be set when task is evaluate or ' 'predict.') if task in ['cross_validate', 'evaluate', 'train']: if do_grid_search and len(grid_objectives) == 0: raise ValueError( 'Grid search is on. Either specify a list of tuning ' 'objectives or set `grid_search` to `false` in the ' 'Tuning section.') if not do_grid_search and len(grid_objectives) > 0: logger.warning('Since "grid_search" is set to False, any specified' ' "objectives" will be ignored.') grid_objectives = [] if task in ['cross_validate', 'train', 'learning_curve'] and test_path: raise ValueError('The test set should not be set when task is ' '{}.'.format(task)) if task in ['train', 'predict'] and results_path and not do_grid_search: raise ValueError('The results path should not be set when task is ' '{} and "grid_search" is set to False.'.format(task)) if task == 'train' and not model_path: raise ValueError('The model path should be set when task is train.') if task in ['learning_curve', 'train'] and prediction_dir: raise ValueError('The predictions path should not be set when task is ' '{}.'.format(task)) if task == 'learning_curve' and model_path: raise ValueError('The models path should not be set when task is ' 'learning_curve.') if task == 'learning_curve': if len(grid_objectives) > 0: raise ValueError("The \"objectives\" option " "is no longer supported for the " "\"learning_curve\" " "task. 
Please use the \"metrics\" " "option in the [Output] " "section instead.") if len(output_metrics) == 0: raise ValueError('The "metrics" option must be set when ' 'the task is "learning_curve".') # if any of the objectives or metrics require probabilities to be output, # probability must be specified as true specified_probabilistic_metrics = PROBABILISTIC_METRICS.intersection( grid_objectives + output_metrics) if specified_probabilistic_metrics and not probability: raise ValueError("The 'probability' option must be 'true' " " to compute the following: " "{}.".format(list(specified_probabilistic_metrics))) # set the folds appropriately based on the task: # (a) if the task is `train`/`evaluate`/`predict` and if an external # fold mapping is specified then use that mapping for grid search # instead of the value contained in `grid_search_folds`. # (b) if the task is `cross_validate` and an external fold mapping is specified # then use that mapping for the outer CV loop and for the inner grid-search # loop. However, if `use_folds_file_for_grid_search` is `False`, do not # use the fold mapping for the inner loop. cv_folds = None if task in ['train', 'evaluate', 'predict'] and specified_folds_mapping: grid_search_folds = specified_folds_mapping # only print out the warning if the user actually wants to do grid search if do_grid_search: logger.warning("Specifying \"folds_file\" overrides both " "explicit and default \"grid_search_folds\".") if task == 'cross_validate': cv_folds = specified_folds_mapping if specified_folds_mapping else specified_num_folds if specified_folds_mapping: logger.warning("Specifying \"folds_file\" overrides both " "explicit and default \"num_cv_folds\".") if use_folds_file_for_grid_search: grid_search_folds = cv_folds else: # only print out the warning if the user wants to do grid search if do_grid_search: logger.warning("The specified \"folds_file\" will " "not be used for inner grid search.") if save_cv_models is True and not model_path: raise ValueError("Output directory for models must be set if " "\"save_cv_models\" is set to true.") # Create feature set names if unspecified if not featureset_names: featureset_names = [_munge_featureset_name(x) for x in featuresets] if len(featureset_names) != len(featuresets): raise ValueError(('Number of feature set names (%s) does not match ' 'number of feature sets (%s).') % (len(featureset_names), len(featuresets))) # store training/test set names for later use train_set_name = basename(train_path) test_set_name = basename(test_path) if test_path else "cv" return (experiment_name, task, sampler, fixed_sampler_parameters, feature_hasher, hasher_features, id_col, label_col, train_set_name, test_set_name, suffix, featuresets, do_shuffle, model_path, do_grid_search, grid_objectives, probability, pipeline, results_path, pos_label_str, feature_scaling, min_feature_count, folds_file, grid_search_jobs, grid_search_folds, cv_folds, save_cv_folds, save_cv_models, use_folds_file_for_grid_search, do_stratified_folds, fixed_parameter_list, param_grid_list, featureset_names, learners, prediction_dir, log_path, train_path, test_path, ids_to_floats, class_map, custom_learner_path, custom_metric_path, learning_curve_cv_folds_list, learning_curve_train_sizes, output_metrics)
def main(argv=None):
    """
    Handles command line arguments and gets things started.

    Parameters
    ----------
    argv : list of str
        List of arguments, as if specified on the command-line.
        If None, ``sys.argv[1:]`` is used instead.
    """
    # Get command line arguments
    parser = argparse.ArgumentParser(
        description="Takes an input feature file and removes any instances "
                    "or features that do not match the specified patterns.",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('infile',
                        help='input feature file (ends in .arff, .csv, '
                             '.jsonlines, .ndj, or .tsv)')
    parser.add_argument('outfile',
                        help='output feature file (must have same extension '
                             'as input file)')
    parser.add_argument('-f', '--feature',
                        help='A feature in the feature file you would like to '
                             'keep. If unspecified, no features are removed.',
                        nargs='*')
    parser.add_argument('-I', '--id',
                        help='An instance ID in the feature file you would '
                             'like to keep. If unspecified, no instances are '
                             'removed based on their IDs.',
                        nargs='*')
    parser.add_argument('--id_col',
                        help='Name of the column which contains the instance '
                             'IDs in ARFF, CSV, or TSV files.',
                        default='id')
    parser.add_argument('-i', '--inverse',
                        help='Instead of keeping features and/or examples in '
                             'lists, remove them.',
                        action='store_true')
    parser.add_argument('-L', '--label',
                        help='A label in the feature file you would like to '
                             'keep. If unspecified, no instances are removed '
                             'based on their labels.',
                        nargs='*')
    parser.add_argument('-l', '--label_col',
                        help='Name of the column which contains the class '
                             'labels in ARFF, CSV, or TSV files. For ARFF '
                             'files, this must be the final column to count '
                             'as the label.',
                        default='y')
    parser.add_argument('-rb', '--replace_blanks_with',
                        help='Specifies a new value with which to replace '
                             'blank values in all columns in the file. To '
                             'replace blanks differently in each column, use '
                             'the SKLL Reader API directly.',
                        default=None)
    parser.add_argument('-db', '--drop_blanks',
                        action='store_true',
                        help='Drop all lines/rows that have any blank values.',
                        default=False)
    parser.add_argument('-q', '--quiet',
                        help='Suppress printing of "Loading..." messages.',
                        action='store_true')
    parser.add_argument('--version', action='version',
                        version='%(prog)s {0}'.format(__version__))
    args = parser.parse_args(argv)

    # Make warnings from built-in warnings module get formatted more nicely
    logging.captureWarnings(True)
    logging.basicConfig(format=('%(asctime)s - %(name)s - %(levelname)s - '
                                '%(message)s'))
    logger = logging.getLogger(__name__)

    # all extensions except .libsvm can be processed
    valid_extensions = {ext for ext in EXT_TO_READER if ext != '.libsvm'}

    # make sure the input file extension is one we can process
    input_extension = os.path.splitext(args.infile)[1].lower()
    output_extension = os.path.splitext(args.outfile)[1].lower()

    if input_extension == '.libsvm':
        logger.error('Cannot filter LibSVM files. Please use skll_convert to '
                     'convert to a different datatype first.')
        sys.exit(1)

    if input_extension not in valid_extensions:
        logger.error(('Input file must be in either .arff, .csv, .jsonlines, '
                      '.ndj, or .tsv format. You specified: '
                      '{}').format(input_extension))
        sys.exit(1)

    if output_extension != input_extension:
        logger.error(('Output file must be in the same format as the input '
                      'file. You specified: {}').format(output_extension))
        sys.exit(1)

    if input_extension == '.csv' or input_extension == '.tsv':
        replace_blanks_with = args.replace_blanks_with
        drop_blanks = args.drop_blanks
        if drop_blanks and replace_blanks_with is not None:
            raise ValueError(
                "You cannot both drop blanks and replace them. "
                "'replace_blanks_with' can only have a value when "
                "'drop_blanks' is `False`.")
        replace_blanks_with = (None if replace_blanks_with is None
                               else safe_float(replace_blanks_with))
        kwargs = {'replace_blanks_with': replace_blanks_with,
                  'drop_blanks': drop_blanks}
    else:
        kwargs = {}

    # Read input file
    reader = EXT_TO_READER[input_extension](args.infile,
                                            quiet=args.quiet,
                                            label_col=args.label_col,
                                            id_col=args.id_col,
                                            **kwargs)
    feature_set = reader.read()

    # Do the actual filtering
    feature_set.filter(ids=args.id,
                       labels=args.label,
                       features=args.feature,
                       inverse=args.inverse)

    # write out the file in the requested output format
    writer_type = EXT_TO_WRITER[input_extension]
    writer_args = {'quiet': args.quiet}
    if writer_type is CSVWriter or writer_type is TSVWriter:
        writer_args['label_col'] = args.label_col
        writer_args['id_col'] = args.id_col
    elif writer_type is ARFFWriter:
        writer_args['label_col'] = args.label_col
        writer_args['id_col'] = args.id_col
        writer_args['regression'] = reader.regression
        writer_args['relation'] = reader.relation
    writer = writer_type(args.outfile, feature_set, **writer_args)
    writer.write()
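
# If this module is run directly as a script (as a filter_features entry
# point typically would be), a standard guard like the one below would invoke
# main(). It is not shown in the excerpt above and is included here only as
# an assumed convenience.
if __name__ == '__main__':
    main()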