Example #1
def test_safe_float_conversion():
    for input_val, expected_val in zip(['1.234', 1.234, '3.0', '3', 3, 'foo'],
                                       [1.234, 1.234, 3.0, 3, 3, 'foo']):
        yield check_safe_float_conversion, safe_float(input_val), expected_val
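
Note: the yield-based generator above is a nose idiom that modern pytest no longer collects. A minimal pytest equivalent is sketched below; the import path for safe_float is an assumption, since the snippet does not show where it comes from.

import pytest
from skll.data.readers import safe_float  # assumed import path

@pytest.mark.parametrize('input_val, expected_val',
                         [('1.234', 1.234), (1.234, 1.234), ('3.0', 3.0),
                          ('3', 3), (3, 3), ('foo', 'foo')])
def test_safe_float_conversion_parametrized(input_val, expected_val):
    # safe_float converts numeric strings to int/float values and
    # returns non-numeric strings unchanged
    assert safe_float(input_val) == expected_val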
Example #2
def parse_config_file(config_path, log_level=logging.INFO):
    """
    Parses a SKLL experiment configuration file with the given path.
    Log messages with the given log level (default: INFO).

    Parameters
    ----------
    config_path : str
        The path to the configuration file.
    log_level : logging level, optional
        The logging level to use.
        Defaults to ``logging.INFO``.

    Returns
    -------
    experiment_name : str
        A string used to identify this particular experiment configuration.
        When generating result summary files, this name helps prevent
        overwriting previous summaries.
    task : str
        The type of experiment we're trying to run (e.g. 'cross_validate').
    sampler : str
        The name of a sampler to perform non-linear transformations of the input.
    fixed_sampler_parameters : dict
        A dictionary containing parameters you want to have fixed for the sampler.
    feature_hasher : bool
        If True, this enables a high-speed, low-memory vectorizer that uses
        feature hashing for converting feature dictionaries into NumPy arrays
        instead of using a DictVectorizer.
    hasher_features : int
        The number of features used by the FeatureHasher if the feature_hasher
        flag is enabled.
    id_col : str
        The column with IDs.
    label_col : str
        The column with labels.
    train_set_name : str
        The name of the training set.
    test_set_name : str
        The name of the test set.
    suffix : str
        The file format the training/test files are in.
    featuresets : list of list of str
        A list of lists of prefixes for the files containing
        the features you would like to train/test on.
    do_shuffle : bool
        Whether to shuffle the data.
    model_path : str
        The path to the model file(s).
    do_grid_search : bool
        Whether to perform grid search.
    grid_objectives : list of str
        A list of scoring functions to use for tuning.
    probability : bool
        Whether to output probabilities for each class.
    pipeline : bool
        Whether to include the `pipeline` attribute in the
        trained model. This will increase the size of the
        model file.
    results_path : str
        Path to store result files in.
    pos_label_str : str
        The string label for the positive class in the binary
        classification setting.
    feature_scaling : str
        How to scale features (e.g. 'with_mean').
    min_feature_count : int
        The minimum number of examples for which the value of a
        feature must be nonzero to be included in the model.
    folds_file : str
        The path to the folds_file, if specified.
    grid_search_jobs : int
        Number of folds to run in parallel when using grid search.
    grid_search_folds : int
        The number of folds to use for grid search.
    cv_folds : dict or int
        The specified folds mapping, or the number of folds.
    save_cv_folds : bool
        Whether to save the CV folds to a file.
    save_cv_models : bool
        Whether to save CV models.
    use_folds_file_for_grid_search : bool
        Whether to use folds file for grid search.
    do_stratified_folds : bool
        Whether to use stratified folds for cross-validation.
    fixed_parameter_list : list of dict
        List of dicts containing parameters you want to have fixed for
        each classifier in the learners list.
    param_grid_list : list of dict
        List of parameter grids to search, one dict for each learner.
    featureset_names : list of str
        The names of the featuresets used for each job.
    learners : list of str
        A list of learners to try using.
    prediction_dir : str
        The directory where predictions are saved.
    log_path : str
        The path to the log file.
    train_path : str
        The path to a file containing features to train on.
    test_path : str
        The path to a file containing features to test on.
    ids_to_floats : bool
        Whether to convert IDs to floats.
    class_map : dict
        A class map collapsing several labels into one.
    custom_learner_path : str
        Path to a .py file that defines a custom learner.
    custom_metric_path : str
        Path to a .py file that defines a custom metric.
    learning_curve_cv_folds_list : list of int
        A list of integers specifying the number of folds to use for
        cross-validation with each learner.
    learning_curve_train_sizes : list of float or list of int
        Relative (floats) or absolute (integers) numbers of training
        examples that will be used to generate the learning curve.
    output_metrics : list of str
        A list of output metrics to use.

    Raises
    ------
    IOError
        If the configuration file name is empty.
    ValueError
        If various configuration parameters are incorrectly specified,
        or cause conflicts.
    """

    # check that config_path is not empty
    if config_path == "":
        raise IOError("The name of the configuration file is empty")

    # compute the absolute path for the config file
    config_path = realpath(config_path)
    config_dir = dirname(config_path)

    # set up a config parser with the above default values
    config = _setup_config_parser(config_path)

    # extract parameters from the various sections in the config file

    ######################
    # 1. General section #
    ######################
    if config.has_option("General", "experiment_name"):
        experiment_name = config.get("General", "experiment_name")
    else:
        raise ValueError("Configuration file does not contain experiment_name "
                         "in the [General] section.")

    # next, get the log path before anything else since we need to
    # save all logging messages to a log file in addition to displaying
    # them on the console
    try:
        log_path = locate_file(config.get("Output", "log"), config_dir)
    except IOError as e:
        if e.errno == errno.ENOENT:
            log_path = e.filename
            os.makedirs(log_path)

    # Create a top-level log file under the log path
    main_log_file = join(log_path, '{}.log'.format(experiment_name))

    # Now create a SKLL logger that will log to this file as well
    # as to the console. Use the log level provided - note that
    # we only have to do this the first time we call `get_skll_logger()`
    # with a given name.
    logger = get_skll_logger('experiment',
                             filepath=main_log_file,
                             log_level=log_level)

    if config.has_option("General", "task"):
        task = config.get("General", "task")
    else:
        raise ValueError("Configuration file does not contain task in the "
                         "[General] section.")
    if task not in VALID_TASKS:
        raise ValueError('An invalid task was specified: {}.  Valid tasks are:'
                         ' {}'.format(task, ', '.join(VALID_TASKS)))

    ####################
    # 2. Input section #
    ####################
    sampler = config.get("Input", "sampler")
    if sampler not in VALID_SAMPLERS:
        raise ValueError('An invalid sampler was specified: {}.  Valid '
                         'samplers are: {}'.format(sampler,
                                                   ', '.join(VALID_SAMPLERS)))

    # raise an error if feature_hasher is set but hasher_features
    # is not a positive value
    feature_hasher = config.getboolean("Input", "feature_hasher")
    hasher_features = config.getint("Input", "hasher_features")
    if feature_hasher:
        if hasher_features <= 0:
            raise ValueError(
                "Configuration file must specify a non-zero value "
                "for the option hasher_features when "
                "feature_hasher is True.")

    # produce a warning if hasher_features is set but feature_hasher
    # is missing or disabled
    elif hasher_features > 0:
        logger.warning(
            "Ignoring hasher_features since feature_hasher is either"
            " missing or set to False.")

    if config.has_option("Input", "learners"):
        learners_string = config.get("Input", "learners")
    else:
        raise ValueError(
            "Configuration file does not contain list of learners "
            "in [Input] section.")
    learners = yaml.safe_load(fix_json(learners_string))

    if len(learners) == 0:
        raise ValueError(
            "Configuration file contains an empty list of learners"
            " in the [Input] section.")

    elif len(set(learners)) < len(learners):
        raise ValueError(
            'Configuration file contains the same learner multiple'
            ' times, which is not currently supported.  Please use'
            ' param_grids with tuning to find the optimal settings'
            ' for the learner.')
    custom_learner_path = locate_file(
        config.get("Input", "custom_learner_path"), config_dir)

    # get the custom metric path, if specified, and locate it
    custom_metric_path = locate_file(config.get("Input", "custom_metric_path"),
                                     config_dir)

    # get the featuresets
    featuresets_string = config.get("Input", "featuresets")
    featuresets = yaml.safe_load(fix_json(featuresets_string))

    # ensure that featuresets is a list of lists of features
    if not isinstance(featuresets, list) or not all(
            isinstance(fs, list) for fs in featuresets):
        raise ValueError("The featuresets parameter should be a list of "
                         "features or a list of lists of features. You "
                         "specified: {}".format(featuresets))

    featureset_names = yaml.safe_load(
        fix_json(config.get("Input", "featureset_names")))

    # ensure that featureset_names is a list of strings, if specified
    if featureset_names:
        if (not isinstance(featureset_names, list)
                or not all([isinstance(fs, str) for fs in featureset_names])):
            raise ValueError(
                "The featureset_names parameter should be a list "
                "of strings. You specified: {}".format(featureset_names))

    # get the value for learning_curve_cv_folds and ensure
    # that it's a list of the same length as the value of
    # learners. If it's not specified, then we just assume
    # that we are using 10 folds for each learner.
    learning_curve_cv_folds_list_string = config.get(
        "Input", "learning_curve_cv_folds_list")
    learning_curve_cv_folds_list = yaml.safe_load(
        fix_json(learning_curve_cv_folds_list_string))
    if len(learning_curve_cv_folds_list) == 0:
        learning_curve_cv_folds_list = [10] * len(learners)
    else:
        if (not isinstance(learning_curve_cv_folds_list, list) or not all(
            [isinstance(fold, int) for fold in learning_curve_cv_folds_list])
                or not len(learning_curve_cv_folds_list) == len(learners)):
            raise ValueError(
                "The learning_curve_cv_folds parameter should "
                "be a list of integers of the same length as "
                "the number of learners. You specified: {}".format(
                    learning_curve_cv_folds_list))

    # get the value for learning_curve_train_sizes and ensure
    # that it's a list of either integers (sizes) or
    # floats (proportions). If it's not specified, then we just
    # assume that we are using np.linspace(0.1, 1.0, 5).
    learning_curve_train_sizes_string = config.get(
        "Input", "learning_curve_train_sizes")
    learning_curve_train_sizes = yaml.safe_load(
        fix_json(learning_curve_train_sizes_string))
    if len(learning_curve_train_sizes) == 0:
        learning_curve_train_sizes = np.linspace(0.1, 1.0, 5).tolist()
    else:
        if (not isinstance(learning_curve_train_sizes, list) or not all([
                isinstance(size, int) or isinstance(size, float)
                for size in learning_curve_train_sizes
        ])):
            raise ValueError(
                "The learning_curve_train_sizes parameter should "
                "be a list of integers or floats. You specified: {}".format(
                    learning_curve_train_sizes))

    # do we need to shuffle the training data
    do_shuffle = config.getboolean("Input", "shuffle")

    fixed_parameter_list = yaml.safe_load(
        fix_json(config.get("Input", "fixed_parameters")))
    fixed_sampler_parameters = fix_json(
        config.get("Input", "sampler_parameters"))
    fixed_sampler_parameters = yaml.safe_load(fixed_sampler_parameters)
    param_grid_list = yaml.safe_load(
        fix_json(config.get("Tuning", "param_grids")))

    # read and normalize the value of `pos_label_str`
    pos_label_str = safe_float(config.get("Tuning", "pos_label_str"))
    if pos_label_str == '':
        pos_label_str = None

    # ensure that feature_scaling is specified only as one of the
    # four available choices
    feature_scaling = config.get("Input", "feature_scaling")
    if feature_scaling not in VALID_FEATURE_SCALING_OPTIONS:
        raise ValueError(
            "Invalid value for feature_scaling parameter: {}".format(
                feature_scaling))

    suffix = config.get("Input", "suffix")
    label_col = config.get("Input", "label_col")
    id_col = config.get("Input", "id_col")
    ids_to_floats = config.getboolean("Input", "ids_to_floats")

    # if an external folds file is specified, then read it into a dictionary
    folds_file = locate_file(config.get("Input", "folds_file"), config_dir)
    num_cv_folds = config.getint("Input", "num_cv_folds")
    specified_folds_mapping = None
    specified_num_folds = None
    if folds_file:
        specified_folds_mapping = load_cv_folds(folds_file,
                                                ids_to_floats=ids_to_floats)
    else:
        # if no file is specified, then set the number of folds for cross-validation
        specified_num_folds = num_cv_folds if num_cv_folds else 10

    # whether or not to save the cv fold ids/models
    save_cv_folds = config.getboolean("Output", "save_cv_folds")
    save_cv_models = config.getboolean("Output", "save_cv_models")

    # whether or not to do stratified cross validation
    random_folds = config.getboolean("Input", "random_folds")
    if random_folds:
        if folds_file:
            logger.warning('Specifying "folds_file" overrides "random_folds".')
        do_stratified_folds = False
    else:
        do_stratified_folds = True

    # get all the input paths and directories (without trailing slashes)
    train_path = config.get("Input", "train_directory").rstrip(os.sep)
    test_path = config.get("Input", "test_directory").rstrip(os.sep)
    train_file = config.get("Input", "train_file")
    test_file = config.get("Input", "test_file")

    # make sure that featuresets is not an empty list unless
    # train_file and test_file are specified
    if not train_file and not test_file and (isinstance(featuresets, list)
                                             and len(featuresets) == 0):
        raise ValueError(
            "The 'featuresets' parameters cannot be an empty list.")

    # At least one of train_file or train_path must be specified.
    if not train_file and not train_path:
        raise ValueError('Invalid [Input] parameters: either "train_file" or '
                         '"train_directory" must be specified in the '
                         'configuration file.')

    # Cannot specify both train_file and train_path.
    if train_file and train_path:
        raise ValueError('Invalid [Input] parameters: only either "train_file"'
                         ' or "train_directory" can be specified in the '
                         'configuration file, not both.')

    # Cannot specify both test_file and test_path
    if test_file and test_path:
        raise ValueError('Invalid [Input] parameters: only either "test_file" '
                         'or "test_directory" can be specified in the '
                         'configuration file, not both.')

    # if train_file is specified, then assign its value to train_path
    # this is a workaround to make this simple use case (a single train and
    # test file) compatible with the existing architecture using
    # featuresets
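    # e.g. train_file='/data/foo.csv' yields
    # featuresets=[['train_foo.csv']] and an empty suffix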
    if train_file:
        train_path = train_file
        featuresets = [['train_{}'.format(basename(train_file))]]
        suffix = ''

    # if test_file is specified, then assign its value to test_path to
    # enable compatibility with the pre-existing featuresets architecture
    if test_file:
        test_path = test_file
        featuresets[0][0] += '_test_{}'.format(basename(test_file))

    # make sure all the specified paths/files exist
    train_path = locate_file(train_path, config_dir)
    test_path = locate_file(test_path, config_dir)

    # Get class mapping dictionary if specified
    class_map_string = config.get("Input", "class_map")
    original_class_map = yaml.safe_load(fix_json(class_map_string))
    if original_class_map:
        # Change class_map to map from originals to replacements instead of
        # from replacement to list of originals
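        # e.g. {'animal': ['cat', 'dog']} becomes
        # {'cat': 'animal', 'dog': 'animal'}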
        class_map = {}
        for replacement, original_list in original_class_map.items():
            for original in original_list:
                class_map[original] = replacement
        del original_class_map
    else:
        class_map = None

    #####################
    # 3. Output section #
    #####################
    probability = config.getboolean("Output", "probability")
    pipeline = config.getboolean("Output", "pipeline")

    # do we want to keep the predictions?
    # make sure the predictions path exists and if not create it
    try:
        prediction_dir = locate_file(config.get("Output", "predictions"),
                                     config_dir)
    except IOError as e:
        if e.errno == errno.ENOENT:
            prediction_dir = e.filename
            os.makedirs(prediction_dir)

    # make sure model path exists and if not, create it
    try:
        model_path = locate_file(config.get("Output", "models"), config_dir)
    except IOError as e:
        if e.errno == errno.ENOENT:
            model_path = e.filename
            os.makedirs(model_path)

    # make sure results path exists
    try:
        results_path = locate_file(config.get("Output", "results"), config_dir)
    except IOError as e:
        if e.errno == errno.ENOENT:
            results_path = e.filename
            os.makedirs(results_path)

    # what are the output metrics?
    output_metrics = config.get("Output", "metrics")
    output_metrics = _parse_and_validate_metrics(output_metrics,
                                                 'metrics',
                                                 logger=logger)

    #####################
    # 4. Tuning section #
    #####################

    # do we need to run a grid search for the hyperparameters or are we just
    # using the defaults?
    do_grid_search = config.getboolean("Tuning", "grid_search")

    # parse any provided grid objective functions
    grid_objectives = config.get("Tuning", "objectives")
    grid_objectives = _parse_and_validate_metrics(grid_objectives,
                                                  'objectives',
                                                  logger=logger)

    # if we are generating learning curves, grid search is irrelevant
    if task == 'learning_curve' and do_grid_search:
        do_grid_search = False
        logger.warning("Grid search is not supported during "
                       "learning curve generation. Disabling.")

    # Check if `param_grids` is specified, but `do_grid_search` is False
    if param_grid_list and not do_grid_search:
        logger.warning('Since "grid_search" is set to False, the specified'
                       ' "param_grids" will be ignored.')

    # Warn user about potential conflicts between parameter values
    # specified in `fixed_parameter_list` and values specified in
    # `param_grid_list` (or values passed in by default) if
    # `do_grid_search` is True
    if do_grid_search and fixed_parameter_list:
        logger.warning('Note that "grid_search" is set to True and '
                       '"fixed_parameters" is also specified. If there '
                       'is a conflict between the grid search parameter'
                       ' space and the fixed parameter values, the '
                       'fixed parameter values will take precedence.')

    # minimum number of examples a feature must be nonzero in to be included
    min_feature_count = config.getint("Tuning", "min_feature_count")

    # if an external folds file was specified do we use the same folds file
    # for the inner grid-search in cross-validate as well?
    use_folds_file_for_grid_search = config.getboolean(
        "Tuning", "use_folds_file_for_grid_search")

    # how many jobs should we run in parallel for grid search
    grid_search_jobs = config.getint("Tuning", "grid_search_jobs")
    if not grid_search_jobs:
        grid_search_jobs = None

    # how many folds should we run in parallel for grid search
    grid_search_folds = config.getint("Tuning", "grid_search_folds")

    # check whether the right things are set for the given task
    if (task == 'evaluate' or task == 'predict') and not test_path:
        raise ValueError('The test set must be set when task is evaluate or '
                         'predict.')
    if task in ['cross_validate', 'evaluate', 'train']:
        if do_grid_search and len(grid_objectives) == 0:
            raise ValueError(
                'Grid search is on. Either specify a list of tuning '
                'objectives or set `grid_search` to `false` in the '
                'Tuning section.')
        if not do_grid_search and len(grid_objectives) > 0:
            logger.warning('Since "grid_search" is set to False, any specified'
                           ' "objectives" will be ignored.')
            grid_objectives = []
    if task in ['cross_validate', 'train', 'learning_curve'] and test_path:
        raise ValueError('The test set should not be set when task is '
                         '{}.'.format(task))
    if task in ['train', 'predict'] and results_path and not do_grid_search:
        raise ValueError('The results path should not be set when task is '
                         '{} and "grid_search" is set to False.'.format(task))
    if task == 'train' and not model_path:
        raise ValueError('The model path should be set when task is train.')
    if task in ['learning_curve', 'train'] and prediction_dir:
        raise ValueError('The predictions path should not be set when task is '
                         '{}.'.format(task))
    if task == 'learning_curve' and model_path:
        raise ValueError('The models path should not be set when task is '
                         'learning_curve.')
    if task == 'learning_curve':
        if len(grid_objectives) > 0:
            raise ValueError("The \"objectives\" option "
                             "is no longer supported for the "
                             "\"learning_curve\" "
                             "task. Please use the \"metrics\" "
                             "option in the [Output] "
                             "section instead.")
        if len(output_metrics) == 0:
            raise ValueError('The "metrics" option must be set when '
                             'the task is "learning_curve".')

    # if any of the objectives or metrics require probabilities to be output,
    # probability must be specified as true
    specified_probabilistic_metrics = PROBABILISTIC_METRICS.intersection(
        grid_objectives + output_metrics)
    if specified_probabilistic_metrics and not probability:
        raise ValueError("The 'probability' option must be 'true' "
                         " to compute the following: "
                         "{}.".format(list(specified_probabilistic_metrics)))

    # set the folds appropriately based on the task:
    #  (a) if the task is `train`/`evaluate`/`predict` and if an external
    #      fold mapping is specified then use that mapping for grid search
    #      instead of the value contained in `grid_search_folds`.
    #  (b) if the task is `cross_validate` and an external fold mapping is specified
    #      then use that mapping for the outer CV loop and for the inner grid-search
    #      loop. However, if `use_folds_file_for_grid_search` is `False`, do not
    #      use the fold mapping for the inner loop.
    cv_folds = None
    if task in ['train', 'evaluate', 'predict'] and specified_folds_mapping:
        grid_search_folds = specified_folds_mapping
        # only print out the warning if the user actually wants to do grid search
        if do_grid_search:
            logger.warning("Specifying \"folds_file\" overrides both "
                           "explicit and default \"grid_search_folds\".")
    if task == 'cross_validate':
        cv_folds = specified_folds_mapping if specified_folds_mapping else specified_num_folds
        if specified_folds_mapping:
            logger.warning("Specifying \"folds_file\" overrides both "
                           "explicit and default \"num_cv_folds\".")
            if use_folds_file_for_grid_search:
                grid_search_folds = cv_folds
            else:
                # only print out the warning if the user wants to do grid search
                if do_grid_search:
                    logger.warning("The specified \"folds_file\" will "
                                   "not be used for inner grid search.")
        if save_cv_models is True and not model_path:
            raise ValueError("Output directory for models must be set if "
                             "\"save_cv_models\" is set to true.")

    # Create feature set names if unspecified
    if not featureset_names:
        featureset_names = [_munge_featureset_name(x) for x in featuresets]
    if len(featureset_names) != len(featuresets):
        raise ValueError(('Number of feature set names (%s) does not match '
                          'number of feature sets (%s).') %
                         (len(featureset_names), len(featuresets)))

    # store training/test set names for later use
    train_set_name = basename(train_path)
    test_set_name = basename(test_path) if test_path else "cv"

    return (experiment_name, task, sampler, fixed_sampler_parameters,
            feature_hasher, hasher_features, id_col, label_col, train_set_name,
            test_set_name, suffix, featuresets, do_shuffle, model_path,
            do_grid_search, grid_objectives, probability, pipeline,
            results_path, pos_label_str, feature_scaling, min_feature_count,
            folds_file, grid_search_jobs, grid_search_folds, cv_folds,
            save_cv_folds, save_cv_models, use_folds_file_for_grid_search,
            do_stratified_folds, fixed_parameter_list, param_grid_list,
            featureset_names, learners, prediction_dir, log_path, train_path,
            test_path, ids_to_floats, class_map, custom_learner_path,
            custom_metric_path, learning_curve_cv_folds_list,
            learning_curve_train_sizes, output_metrics)
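
For context, here is a hedged sketch of how this parser might be invoked; the configuration path is hypothetical, and the tuple indices follow the return-value ordering documented in the docstring above.

import logging

# parse a hypothetical experiment configuration at DEBUG verbosity;
# the result is the 45-item tuple documented above
config_values = parse_config_file('/path/to/experiment.cfg',
                                  log_level=logging.DEBUG)
experiment_name, task = config_values[0], config_values[1]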
Example #3
def main(argv=None):
    """
    Handles command line arguments and gets things started.

    Parameters
    ----------
    argv : list of str, optional
        List of arguments, as if specified on the command-line.
        If None, ``sys.argv[1:]`` is used instead.
    """

    # Get command line arguments
    parser = argparse.ArgumentParser(
        description="Takes an input feature file and removes any instances or "
        "features that do not match the specified patterns.",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('infile',
                        help='input feature file (ends in .arff, .csv, '
                        '.jsonlines, .ndj, or .tsv)')
    parser.add_argument(
        'outfile',
        help='output feature file (must have same extension as '
        'input file)')
    parser.add_argument('-f',
                        '--feature',
                        help='A feature in the feature file you would like to '
                        'keep.  If unspecified, no features are removed.',
                        nargs='*')
    parser.add_argument('-I',
                        '--id',
                        help='An instance ID in the feature file you would '
                        'like to keep.  If unspecified, no instances are '
                        'removed based on their IDs.',
                        nargs='*')
    parser.add_argument('--id_col',
                        help='Name of the column which contains the instance '
                        'IDs in ARFF, CSV, or TSV files.',
                        default='id')
    parser.add_argument('-i',
                        '--inverse',
                        help='Instead of keeping features and/or examples in '
                        'lists, remove them.',
                        action='store_true')
    parser.add_argument('-L',
                        '--label',
                        help='A label in the feature file you would like to '
                        'keep.  If unspecified, no instances are removed '
                        'based on their labels.',
                        nargs='*')
    parser.add_argument('-l',
                        '--label_col',
                        help='Name of the column which contains the class '
                        'labels in ARFF, CSV, or TSV files. For ARFF '
                        'files, this must be the final column to count as '
                        'the label.',
                        default='y')
    parser.add_argument(
        '-rb',
        '--replace_blanks_with',
        help='Specifies a new value with which to replace blank values '
        'in all columns in the file. To replace blanks differently '
        'in each column, use the SKLL Reader API directly.',
        default=None)
    parser.add_argument('-db',
                        '--drop_blanks',
                        action='store_true',
                        help='Drop all lines/rows that have any blank values.',
                        default=False)
    parser.add_argument('-q',
                        '--quiet',
                        help='Suppress printing of "Loading..." messages.',
                        action='store_true')
    parser.add_argument('--version',
                        action='version',
                        version='%(prog)s {0}'.format(__version__))
    args = parser.parse_args(argv)

    # Make warnings from built-in warnings module get formatted more nicely
    logging.captureWarnings(True)
    logging.basicConfig(format=('%(asctime)s - %(name)s - %(levelname)s - '
                                '%(message)s'))
    logger = logging.getLogger(__name__)

    # all extensions except .libsvm can be processed
    valid_extensions = {ext for ext in EXT_TO_READER if ext != '.libsvm'}

    # make sure the input file extension is one we can process
    input_extension = os.path.splitext(args.infile)[1].lower()
    output_extension = os.path.splitext(args.outfile)[1].lower()

    if input_extension == '.libsvm':
        logger.error('Cannot filter LibSVM files.  Please use skll_convert to '
                     'convert to a different datatype first.')
        sys.exit(1)

    if input_extension not in valid_extensions:
        logger.error(('Input file must be in either .arff, .csv, .jsonlines, '
                      '.ndj, or .tsv format. You specified: '
                      '{}').format(input_extension))
        sys.exit(1)

    if output_extension != input_extension:
        logger.error(('Output file must be in the same format as the input '
                      'file.  You specified: {}').format(output_extension))
        sys.exit(1)

    if input_extension == '.csv' or input_extension == '.tsv':
        replace_blanks_with = args.replace_blanks_with
        drop_blanks = args.drop_blanks
        if drop_blanks and replace_blanks_with is not None:
            raise ValueError(
                "You cannot both drop blanks and replace them. "
                "'replace_blanks_with' can only have a value when "
                "'drop_blanks' is `False`.")
        replace_blanks_with = (None if replace_blanks_with is None else
                               safe_float(replace_blanks_with))
        kwargs = {
            'replace_blanks_with': replace_blanks_with,
            'drop_blanks': drop_blanks
        }
    else:
        kwargs = {}

    # Read input file
    reader = EXT_TO_READER[input_extension](args.infile,
                                            quiet=args.quiet,
                                            label_col=args.label_col,
                                            id_col=args.id_col,
                                            **kwargs)
    feature_set = reader.read()

    # Do the actual filtering
    feature_set.filter(ids=args.id,
                       labels=args.label,
                       features=args.feature,
                       inverse=args.inverse)

    # write out the file in the requested output format
    writer_type = EXT_TO_WRITER[input_extension]
    writer_args = {'quiet': args.quiet}
    if writer_type is CSVWriter or writer_type is TSVWriter:
        writer_args['label_col'] = args.label_col
        writer_args['id_col'] = args.id_col
    elif writer_type is ARFFWriter:
        writer_args['label_col'] = args.label_col
        writer_args['id_col'] = args.id_col
        writer_args['regression'] = reader.regression
        writer_args['relation'] = reader.relation
    writer = writer_type(args.outfile, feature_set, **writer_args)
    writer.write()
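
A hedged usage sketch for the filtering script; the file names and labels here are hypothetical.

# keep only the instances labeled 'cat' or 'dog' in a CSV feature
# file, writing the filtered set to a new file of the same format
main(['input.csv', 'filtered.csv', '-L', 'cat', 'dog'])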