def _check_job_results(job_results):
    """
    See if we have a complete results dictionary for every job.

    Parameters
    ----------
    job_results : list of dicts
        A list of job result dictionaries.
    """
    logger = get_skll_logger('experiment')
    logger.info('Checking job results')
    for result_dicts in job_results:
        if not result_dicts or 'task' not in result_dicts[0]:
            logger.error('There was an error running the experiment:\n%s',
                         result_dicts)
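
# NOTE: The function below is an illustrative sketch, not part of the SKLL
# API. It shows the shape of the ``job_results`` argument that
# ``_check_job_results()`` expects: one entry per submitted job, where each
# entry is the list of result dictionaries returned by
# ``_classify_featureset()``, and a well-formed dictionary carries a 'task'
# key. The literal values are hypothetical.
def _example_check_job_results():
    job_results = [
        # a successful job: a non-empty list of result dictionaries
        [{'task': 'evaluate', 'learner_name': 'LogisticRegression'}],
        # a failed job: an empty list, which triggers the error message
        [],
    ]
    _check_job_results(job_results)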
def _write_summary_file(result_json_paths, output_file, ablation=0):
    """
    Take a list of paths to individual result JSON files and write a single
    TSV file that summarizes all of them.

    Parameters
    ----------
    result_json_paths : list of str
        A list of paths to the individual result JSON files.
    output_file : file
        The open file handle to write the summary to (TSV format).
    ablation : int, optional
        The number of features to remove when doing an ablation experiment.
        Defaults to 0.
    """
    learner_result_dicts = []

    # Map from feature set names to all features in them
    all_features = defaultdict(set)
    logger = get_skll_logger('experiment')
    for json_path in result_json_paths:
        if not exists(json_path):
            logger.error(('JSON results file %s not found. Skipping summary '
                          'creation. You can manually create the summary file'
                          ' after the fact by using the summarize_results '
                          'script.'), json_path)
            return
        else:
            with open(json_path, 'r') as json_file:
                obj = json.load(json_file)
                featureset_name = obj[0]['featureset_name']
                if ablation != 0 and '_minus_' in featureset_name:
                    parent_set = featureset_name.split('_minus_', 1)[0]
                    all_features[parent_set].update(
                        yaml.safe_load(obj[0]['featureset']))
                learner_result_dicts.extend(obj)

    # Build and write header
    header = set(learner_result_dicts[0].keys()) - {'result_table',
                                                    'descriptive'}
    if ablation != 0:
        header.add('ablated_features')
    header = sorted(header)
    writer = csv.DictWriter(output_file, header, extrasaction='ignore',
                            dialect=csv.excel_tab)
    writer.writeheader()

    # Build "ablated_features" list and fix some backward compatible things
    for lrd in learner_result_dicts:
        featureset_name = lrd['featureset_name']
        if ablation != 0:
            parent_set = featureset_name.split('_minus_', 1)[0]
            ablated_features = all_features[parent_set].difference(
                yaml.safe_load(lrd['featureset']))
            lrd['ablated_features'] = ''
            if ablated_features:
                lrd['ablated_features'] = json.dumps(sorted(ablated_features))

        # write out the new learner dict with the readable fields
        writer.writerow(lrd)

    output_file.flush()
def _write_learning_curve_file(result_json_paths, output_file):
    """
    Take a list of paths to individual learning curve results JSON files and
    write a single TSV file with the learning curve data.

    Parameters
    ----------
    result_json_paths : list of str
        A list of paths to the individual result JSON files.
    output_file : file
        The open file handle to write the output to (TSV format).
    """
    learner_result_dicts = []

    logger = get_skll_logger('experiment')
    for json_path in result_json_paths:
        if not exists(json_path):
            logger.error(('JSON results file %s not found. Skipping summary '
                          'creation. You can manually create the summary file'
                          ' after the fact by using the summarize_results '
                          'script.'), json_path)
            return
        else:
            with open(json_path, 'r') as json_file:
                obj = json.load(json_file)
                learner_result_dicts.extend(obj)

    # Build and write header
    header = ['featureset_name', 'learner_name', 'metric', 'train_set_name',
              'training_set_size', 'train_score_mean', 'test_score_mean',
              'train_score_std', 'test_score_std', 'scikit_learn_version',
              'version']
    writer = csv.DictWriter(output_file, header, extrasaction='ignore',
                            dialect=csv.excel_tab)
    writer.writeheader()

    # write out the fields we need for the learning curve file;
    # specifically, we need to separate out the curve sizes
    # and scores into individual entries.
    for lrd in learner_result_dicts:
        training_set_sizes = lrd['computed_curve_train_sizes']
        train_scores_means_by_size = lrd['learning_curve_train_scores_means']
        test_scores_means_by_size = lrd['learning_curve_test_scores_means']
        train_scores_stds_by_size = lrd['learning_curve_train_scores_stds']
        test_scores_stds_by_size = lrd['learning_curve_test_scores_stds']

        # rename `grid_objective` to `metric` since the former name can be
        # confusing in the learning curve context
        lrd['metric'] = lrd['grid_objective']

        for (size,
             train_score_mean,
             test_score_mean,
             train_score_std,
             test_score_std) in zip(training_set_sizes,
                                    train_scores_means_by_size,
                                    test_scores_means_by_size,
                                    train_scores_stds_by_size,
                                    test_scores_stds_by_size):
            lrd['training_set_size'] = size
            lrd['train_score_mean'] = train_score_mean
            lrd['test_score_mean'] = test_score_mean
            lrd['train_score_std'] = train_score_std
            lrd['test_score_std'] = test_score_std
            writer.writerow(lrd)

    output_file.flush()
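
# NOTE: Illustrative sketch, not part of the SKLL API. It shows how the two
# writers above are typically driven: collect the per-job ``.results.json``
# paths and hand them, together with an open TSV file handle, to
# ``_write_summary_file()`` (or ``_write_learning_curve_file()`` for the
# 'learning_curve' task). The file names below are hypothetical.
def _example_write_summary():
    result_json_paths = [
        'output/my_experiment_fs1_LogisticRegression.results.json',
        'output/my_experiment_fs1_SVC.results.json',
    ]
    with open('output/my_experiment_summary.tsv', 'w', newline='') as fh:
        _write_summary_file(result_json_paths, fh, ablation=0)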
def _classify_featureset(args):
    """
    Classification job to be submitted to the grid.

    Parameters
    ----------
    args : dict
        A dictionary with arguments for classifying the ``FeatureSet``
        instance.

    Returns
    -------
    res : list of dicts
        The results of the classification, in the format of a list of
        dictionaries.

    Raises
    ------
    ValueError
        If extra unknown arguments are passed to the function.
    """

    # Extract all the arguments.
    # (There doesn't seem to be a better way to do this since one can't
    # specify required keyword arguments.)
    experiment_name = args.pop("experiment_name")
    task = args.pop("task")
    sampler = args.pop("sampler")
    feature_hasher = args.pop("feature_hasher")
    hasher_features = args.pop("hasher_features")
    job_name = args.pop("job_name")
    featureset = args.pop("featureset")
    featureset_name = args.pop("featureset_name")
    learner_name = args.pop("learner_name")
    train_path = args.pop("train_path")
    test_path = args.pop("test_path")
    train_set_name = args.pop("train_set_name")
    test_set_name = args.pop("test_set_name")
    shuffle = args.pop('shuffle')
    model_path = args.pop("model_path")
    prediction_prefix = args.pop("prediction_prefix")
    grid_search = args.pop("grid_search")
    grid_objective = args.pop("grid_objective")
    output_metrics = args.pop("output_metrics")
    suffix = args.pop("suffix")
    job_log_file = args.pop("log_file")
    job_log_level = args.pop("log_level")
    probability = args.pop("probability")
    pipeline = args.pop("pipeline")
    results_path = args.pop("results_path")
    fixed_parameters = args.pop("fixed_parameters")
    sampler_parameters = args.pop("sampler_parameters")
    param_grid = args.pop("param_grid")
    pos_label_str = args.pop("pos_label_str")
    overwrite = args.pop("overwrite")
    feature_scaling = args.pop("feature_scaling")
    min_feature_count = args.pop("min_feature_count")
    folds_file = args.pop("folds_file")
    grid_search_jobs = args.pop("grid_search_jobs")
    grid_search_folds = args.pop("grid_search_folds")
    cv_folds = args.pop("cv_folds")
    save_cv_folds = args.pop("save_cv_folds")
    save_cv_models = args.pop("save_cv_models")
    use_folds_file_for_grid_search = args.pop("use_folds_file_for_grid_search")
    stratified_folds = args.pop("do_stratified_folds")
    label_col = args.pop("label_col")
    id_col = args.pop("id_col")
    ids_to_floats = args.pop("ids_to_floats")
    class_map = args.pop("class_map")
    custom_learner_path = args.pop("custom_learner_path")
    custom_metric_path = args.pop("custom_metric_path")
    quiet = args.pop('quiet', False)
    learning_curve_cv_folds = args.pop("learning_curve_cv_folds")
    learning_curve_train_sizes = args.pop("learning_curve_train_sizes")

    if args:
        raise ValueError(("Extra arguments passed to _classify_featureset: "
                          "{}").format(args.keys()))

    start_timestamp = datetime.datetime.now()

    # create a new SKLL logger for this specific job and
    # use the given log level
    logger = get_skll_logger(job_name, job_log_file, log_level=job_log_level)

    try:
        # log messages
        logger.info("Task: {}".format(task))

        # check if we have any possible custom metrics
        possible_custom_metric_names = []
        for metric_name in output_metrics + [grid_objective]:
            # metrics that are not in `SCORERS` or `None` are candidates
            # (the `None` is a by-product of how jobs with single tuning
            # objectives are created)
            if metric_name not in SCORERS and metric_name is not None:
                possible_custom_metric_names.append(metric_name)
            # if the metric is already in `SCORERS`, is it a custom one
            # that we already registered? if so, log that
            elif metric_name in _CUSTOM_METRICS:
                logger.info(
                    f"custom metric '{metric_name}' is already registered")

        # initialize list that will hold any invalid metrics
        # that we could not register as custom metrics
        invalid_metric_names = []

        # if we have possible custom metrics
        if possible_custom_metric_names:

            # check that we have a file to load them from
            if not custom_metric_path:
                raise ValueError(
                    f"invalid metrics specified: {possible_custom_metric_names}")
            else:
                # try to register each possible custom metric;
                # raise an exception if we fail, and if we don't,
                # add the custom metric function to `globals()` so
                # that it serializes properly for gridmap
                for custom_metric_name in possible_custom_metric_names:
                    try:
                        custom_metric_func = register_custom_metric(
                            custom_metric_path, custom_metric_name)
                    except (AttributeError, NameError, ValueError):
                        invalid_metric_names.append(custom_metric_name)
                    else:
                        logger.info(f"registered '{custom_metric_name}' as a "
                                    f"custom metric")
                        globals()[custom_metric_name] = custom_metric_func

        # raise an error if we have any invalid metrics
        if invalid_metric_names:
            raise ValueError(
                f"invalid metrics specified: {invalid_metric_names}. "
                f"If these are custom metrics, check the function names.")

        if task == 'cross_validate':
            if isinstance(cv_folds, int):
                num_folds = cv_folds
            else:
                # folds_file was used, so count the unique fold ids.
                num_folds = len(set(cv_folds.values()))
            logger.info("Cross-validating ({} folds) on {}, feature "
                        "set {} ...".format(num_folds,
                                            train_set_name,
                                            featureset))
        elif task == 'evaluate':
            logger.info("Training on {}, Test on {}, "
                        "feature set {} ...".format(train_set_name,
                                                    test_set_name,
                                                    featureset))
        elif task == 'train':
            logger.info("Training on {}, feature set {} ...".format(
                train_set_name, featureset))
        elif task == 'learning_curve':
            logger.info("Generating learning curve "
                        "({} 80/20 folds, sizes={}, objective={}) on {}, "
                        "feature set {} ...".format(
                            learning_curve_cv_folds,
                            learning_curve_train_sizes,
                            grid_objective,
                            train_set_name,
                            featureset))
        else:  # predict
            logger.info("Training on {}, Making predictions on {}, "
                        "feature set {} ...".format(train_set_name,
                                                    test_set_name,
                                                    featureset))

        # check whether a trained model on the same data with the same
        # featureset already exists; if so, load it and then use it on
        # the test data
        modelfile = join(model_path, '{}.model'.format(job_name))
        if (task in ['cross_validate', 'learning_curve'] or
                not exists(modelfile) or overwrite):
            train_examples = load_featureset(train_path,
                                             featureset,
                                             suffix,
                                             label_col=label_col,
                                             id_col=id_col,
                                             ids_to_floats=ids_to_floats,
                                             quiet=quiet,
                                             class_map=class_map,
                                             feature_hasher=feature_hasher,
                                             num_features=hasher_features,
                                             logger=logger)

            train_set_size = len(train_examples.ids)
            if not train_examples.has_labels:
                raise ValueError('Training examples do not have labels')

            # initialize a classifier object
            learner = Learner(learner_name,
                              probability=probability,
                              pipeline=pipeline,
                              feature_scaling=feature_scaling,
                              model_kwargs=fixed_parameters,
                              pos_label_str=pos_label_str,
                              min_feature_count=min_feature_count,
                              sampler=sampler,
                              sampler_kwargs=sampler_parameters,
                              custom_learner_path=custom_learner_path,
                              logger=logger)
        # load the model if it already exists
        else:
            # import custom learner into global namespace if we are reusing
            # a saved model
            if custom_learner_path:
                globals()[learner_name] = load_custom_learner(
                    custom_learner_path, learner_name)
            train_set_size = 'unknown'
            if exists(modelfile) and not overwrite:
                logger.info("Loading pre-existing {} model: {}".format(
                    learner_name, modelfile))
            learner = Learner.from_file(modelfile)

            # attach the job logger to this learner
            learner.logger = logger

        # Load test set if there is one
        if task == 'evaluate' or task == 'predict':
            test_examples = load_featureset(test_path,
                                            featureset,
                                            suffix,
                                            label_col=label_col,
                                            id_col=id_col,
                                            ids_to_floats=ids_to_floats,
                                            quiet=quiet,
                                            class_map=class_map,
                                            feature_hasher=feature_hasher,
                                            num_features=hasher_features)
            test_set_size = len(test_examples.ids)
        else:
            test_set_size = 'n/a'

        # compute information about xval and grid folds that can be put in
        # the results in readable form
        if isinstance(cv_folds, dict):
            cv_folds_to_print = '{} via folds file'.format(
                len(set(cv_folds.values())))
        else:
            cv_folds_to_print = str(cv_folds)

        if isinstance(grid_search_folds, dict):
            grid_search_folds_to_print = '{} via folds file'.format(
                len(set(grid_search_folds.values())))
        else:
            grid_search_folds_to_print = str(grid_search_folds)

        # create a list of dictionaries of the results information
        learner_result_dict_base = {
            'experiment_name': experiment_name,
            'train_set_name': train_set_name,
            'train_set_size': train_set_size,
            'test_set_name': test_set_name,
            'test_set_size': test_set_size,
            'featureset': json.dumps(featureset),
            'featureset_name': featureset_name,
            'shuffle': shuffle,
            'learner_name': learner_name,
            'task': task,
            'start_timestamp': start_timestamp.strftime('%d %b %Y %H:%M:'
                                                        '%S.%f'),
            'version': __version__,
            'feature_scaling': feature_scaling,
            'folds_file': folds_file,
            'grid_search': grid_search,
            'grid_objective': grid_objective,
            'grid_search_folds': grid_search_folds_to_print,
            'min_feature_count': min_feature_count,
            'cv_folds': cv_folds_to_print,
            'using_folds_file': (isinstance(cv_folds, dict) or
                                 isinstance(grid_search_folds, dict)),
            'save_cv_folds': save_cv_folds,
            'save_cv_models': save_cv_models,
            'use_folds_file_for_grid_search': use_folds_file_for_grid_search,
            'stratified_folds': stratified_folds,
            'scikit_learn_version': SCIKIT_VERSION
        }

        # check if we're doing cross-validation, because we only load/save
        # models when we're not.
        task_results = None
        if task == 'cross_validate':
            logger.info('Cross-validating')
            (task_results,
             grid_scores,
             grid_search_cv_results_dicts,
             skll_fold_ids,
             models) = learner.cross_validate(
                train_examples,
                shuffle=shuffle,
                stratified=stratified_folds,
                prediction_prefix=prediction_prefix,
                grid_search=grid_search,
                grid_search_folds=grid_search_folds,
                cv_folds=cv_folds,
                grid_objective=grid_objective,
                output_metrics=output_metrics,
                param_grid=param_grid,
                grid_jobs=grid_search_jobs,
                save_cv_folds=save_cv_folds,
                save_cv_models=save_cv_models,
                use_custom_folds_for_grid_search=use_folds_file_for_grid_search)
            if models:
                for index, m in enumerate(models, start=1):
                    modelfile = join(model_path,
                                     '{}_fold{}.model'.format(job_name, index))
                    m.save(modelfile)
        elif task == 'learning_curve':
            logger.info("Generating learning curve(s)")
            (curve_train_scores,
             curve_test_scores,
             computed_curve_train_sizes) = learner.learning_curve(
                train_examples,
                grid_objective,
                cv_folds=learning_curve_cv_folds,
                train_sizes=learning_curve_train_sizes)
        else:
            # if we do not have a saved model, we need to train one.
            if not exists(modelfile) or overwrite:
                logger.info("Featurizing and training new {} model".format(
                    learner_name))

                (best_score,
                 grid_search_cv_results) = learner.train(
                    train_examples,
                    shuffle=shuffle,
                    grid_search=grid_search,
                    grid_search_folds=grid_search_folds,
                    grid_objective=grid_objective,
                    param_grid=param_grid,
                    grid_jobs=grid_search_jobs)
                grid_scores = [best_score]
                grid_search_cv_results_dicts = [grid_search_cv_results]

                # save model
                if model_path:
                    learner.save(modelfile)

                if grid_search:
                    logger.info("Best {} grid search score: {}".format(
                        grid_objective, round(best_score, 3)))
            else:
                grid_scores = [None]
                grid_search_cv_results_dicts = [None]

            # print out the parameters
            param_out = ('{}: {}'.format(param_name, param_value)
                         for param_name, param_value in
                         learner.model.get_params().items())
            logger.info("Hyperparameters: {}".format(', '.join(param_out)))

            # run on test set or cross-validate on training data,
            # depending on what was asked for
            if task == 'evaluate':
                logger.info("Evaluating predictions")
                task_results = [
                    learner.evaluate(test_examples,
                                     prediction_prefix=prediction_prefix,
                                     grid_objective=grid_objective,
                                     output_metrics=output_metrics)
                ]
            elif task == 'predict':
                logger.info("Writing predictions")
                # we set `class_labels` to `False` so that if the learner is
                # probabilistic, probabilities are written instead of labels
                learner.predict(test_examples,
                                prediction_prefix=prediction_prefix,
                                class_labels=False)
            # do nothing here for train

        end_timestamp = datetime.datetime.now()
        learner_result_dict_base['end_timestamp'] = end_timestamp.strftime(
            '%d %b %Y %H:%M:%S.%f')
        total_time = end_timestamp - start_timestamp
        learner_result_dict_base['total_time'] = str(total_time)

        if task == 'cross_validate' or task == 'evaluate':
            results_json_path = join(results_path,
                                     '{}.results.json'.format(job_name))

            res = _create_learner_result_dicts(task_results,
                                               grid_scores,
                                               grid_search_cv_results_dicts,
                                               learner_result_dict_base)

            # write out the result dictionary to a json file
            with open(results_json_path, 'w') as json_file:
                json.dump(res, json_file, cls=NumpyTypeEncoder)

            with open(join(results_path, '{}.results'.format(job_name)),
                      'w') as output_file:
                _print_fancy_output(res, output_file)

        elif task == 'learning_curve':
            results_json_path = join(results_path,
                                     '{}.results.json'.format(job_name))

            res = {}
            res.update(learner_result_dict_base)
            res.update({
                'learning_curve_cv_folds': learning_curve_cv_folds,
                'given_curve_train_sizes': learning_curve_train_sizes,
                'learning_curve_train_scores_means': np.mean(
                    curve_train_scores, axis=1),
                'learning_curve_test_scores_means': np.mean(
                    curve_test_scores, axis=1),
                'learning_curve_train_scores_stds': np.std(
                    curve_train_scores, axis=1, ddof=1),
                'learning_curve_test_scores_stds': np.std(
                    curve_test_scores, axis=1, ddof=1),
                'computed_curve_train_sizes': computed_curve_train_sizes
            })

            # we need to return and write out a list of dictionaries
            res = [res]

            # write out the result dictionary to a json file
            with open(results_json_path, 'w') as json_file:
                json.dump(res, json_file, cls=NumpyTypeEncoder)

        # For all other tasks, i.e. train or predict
        else:
            if results_path:
                results_json_path = join(results_path,
                                         '{}.results.json'.format(job_name))

                assert len(grid_scores) == 1
                assert len(grid_search_cv_results_dicts) == 1
                grid_search_cv_results_dict = {"grid_score": grid_scores[0]}
                grid_search_cv_results_dict["grid_search_cv_results"] = \
                    grid_search_cv_results_dicts[0]
                grid_search_cv_results_dict.update(learner_result_dict_base)
                # write out the result dictionary to a json file
                with open(results_json_path, 'w') as json_file:
                    json.dump(grid_search_cv_results_dict, json_file,
                              cls=NumpyTypeEncoder)
            res = [learner_result_dict_base]

        # write out the cv folds if required
        if task == 'cross_validate' and save_cv_folds:
            skll_fold_ids_file = experiment_name + '_skll_fold_ids.csv'
            with open(join(results_path, skll_fold_ids_file),
                      'w') as output_file:
                _write_skll_folds(skll_fold_ids, output_file)

    finally:
        close_and_remove_logger_handlers(logger)

    return res
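
# NOTE: Illustrative sketch, not part of the SKLL API. Each job above writes
# a ``<job_name>.results.json`` file containing a JSON *list* of result
# dictionaries (serialized with ``NumpyTypeEncoder``), whose keys include the
# fields assembled in ``learner_result_dict_base`` such as 'task',
# 'learner_name', 'train_set_name', and 'grid_objective'. The path below is
# hypothetical.
def _example_read_job_results(results_json_path='output/my_job.results.json'):
    with open(results_json_path, 'r') as json_file:
        result_dicts = json.load(json_file)
    for result_dict in result_dicts:
        print(result_dict['task'], result_dict['learner_name'])
    return result_dicts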
def run_configuration(config_file, local=False, overwrite=True, queue='all.q',
                      hosts=None, write_summary=True, quiet=False,
                      ablation=0, resume=False, log_level=logging.INFO):
    """
    Take a configuration file and run the specified jobs on the grid.

    Parameters
    ----------
    config_file : str
        Path to the configuration file we would like to use.
    local : bool, optional
        Should this be run locally instead of on the cluster?
        Defaults to ``False``.
    overwrite : bool, optional
        If the model files already exist, should we overwrite them instead
        of re-using them?
        Defaults to ``True``.
    queue : str, optional
        The DRMAA queue to use if we're running on the cluster.
        Defaults to ``'all.q'``.
    hosts : list of str, optional
        If running on the cluster, these are the machines we should use.
        Defaults to ``None``.
    write_summary : bool, optional
        Write a TSV file with a summary of the results.
        Defaults to ``True``.
    quiet : bool, optional
        Suppress printing of "Loading..." messages.
        Defaults to ``False``.
    ablation : int, optional
        Number of features to remove when doing an ablation experiment.
        If positive, we will perform repeated ablation runs for all
        combinations of features removing the specified number at a time.
        If ``None``, we will use all combinations of all lengths.
        If 0, the default, no ablation is performed.
        If negative, a ``ValueError`` is raised.
        Defaults to 0.
    resume : bool, optional
        If result files already exist for an experiment, do not overwrite
        them. This is very useful when doing a large ablation experiment and
        part of it crashes.
        Defaults to ``False``.
    log_level : logging level, optional
        The level for logging messages.
        Defaults to ``logging.INFO``.

    Returns
    -------
    result_json_paths : list of str
        A list of paths to .json results files for each variation in the
        experiment.

    Raises
    ------
    ValueError
        If the value for ``"ablation"`` is not a positive int or ``None``.
    OSError
        If the length of the ``FeatureSet`` name > 210.
    """
    try:
        # Read configuration
        (experiment_name, task, sampler, fixed_sampler_parameters,
         feature_hasher, hasher_features, id_col, label_col, train_set_name,
         test_set_name, suffix, featuresets, do_shuffle, model_path,
         do_grid_search, grid_objectives, probability, pipeline, results_path,
         pos_label_str, feature_scaling, min_feature_count, folds_file,
         grid_search_jobs, grid_search_folds, cv_folds, save_cv_folds,
         save_cv_models, use_folds_file_for_grid_search, do_stratified_folds,
         fixed_parameter_list, param_grid_list, featureset_names, learners,
         prediction_dir, log_path, train_path, test_path, ids_to_floats,
         class_map, custom_learner_path, custom_metric_path,
         learning_curve_cv_folds_list, learning_curve_train_sizes,
         output_metrics) = parse_config_file(config_file,
                                             log_level=log_level)

        # get the main experiment logger that will already have been
        # created by the configuration parser so we don't need anything
        # except the name `experiment`.
        logger = get_skll_logger('experiment')

        # Check if we have gridmap
        if not local and not _HAVE_GRIDMAP:
            local = True
            logger.warning('gridmap 0.10.1+ not available. Forcing local '
                           'mode. To run things on a DRMAA-compatible '
                           'cluster, install gridmap>=0.10.1 via pip.')

        # No grid search or ablation for learning curve generation
        if task == 'learning_curve':
            if ablation is None or ablation > 0:
                ablation = 0
                logger.warning("Ablating features is not supported during "
                               "learning curve generation. Ignoring.")

        # if we just had a train file and a test file, there are no real
        # featuresets, in which case there are no features to ablate
        if len(featuresets) == 1 and len(featuresets[0]) == 1:
            if ablation is None or ablation > 0:
                ablation = 0
                logger.warning("Not enough featuresets for ablation. "
                               "Ignoring.")

        # if performing ablation, expand featuresets to include combinations
        # of features within those sets
        if ablation is None or ablation > 0:
            # Make new feature set lists so that we can iterate without issue
            expanded_fs = []
            expanded_fs_names = []
            for features, featureset_name in zip(featuresets,
                                                 featureset_names):
                features = sorted(features)
                featureset = set(features)
                # Expand to all feature combinations if ablation is None
                if ablation is None:
                    for i in range(1, len(features)):
                        for excluded_features in combinations(features, i):
                            expanded_fs.append(
                                sorted(featureset - set(excluded_features)))
                            expanded_fs_names.append(
                                featureset_name + '_minus_' +
                                _munge_featureset_name(excluded_features))
                # Otherwise, just expand removing the specified number at a
                # time
                else:
                    for excluded_features in combinations(features, ablation):
                        expanded_fs.append(
                            sorted(featureset - set(excluded_features)))
                        expanded_fs_names.append(
                            featureset_name + '_minus_' +
                            _munge_featureset_name(excluded_features))
                # Also add version with nothing removed as baseline
                expanded_fs.append(features)
                expanded_fs_names.append(featureset_name + '_all')

            # Replace original feature set lists
            featuresets = expanded_fs
            featureset_names = expanded_fs_names
        elif ablation < 0:
            raise ValueError('Value for "ablation" argument must be either '
                             'positive integer or None.')

        # the list of jobs submitted (if running on grid)
        if not local:
            jobs = []

        # the list to hold the paths to all the result json files
        result_json_paths = []

        # check if the length of the featureset_name exceeds the maximum
        # length allowed
        for featureset_name in featureset_names:
            if len(featureset_name) > 210:
                raise OSError(
                    'System generated file length "{}" exceeds the '
                    'maximum length supported. Please specify names of '
                    'your datasets with "featureset_names". If you are '
                    'running an ablation experiment, please reduce the '
                    'length of the features in "featuresets" because the'
                    ' auto-generated name would be longer than the file '
                    'system can handle'.format(featureset_name))

        # if the task is learning curve, and ``metrics`` was specified, then
        # assign the value of ``metrics`` to ``grid_objectives`` - this lets
        # us piggyback on the parallelization of the objectives that is
        # already set up for us to use
        if task == 'learning_curve' and len(output_metrics) > 0:
            grid_objectives = output_metrics

        # if there were no grid objectives provided, just set it to
        # a list containing a single None so as to allow the parallelization
        # to proceed and to pass the correct default value of grid_objective
        # down to _classify_featureset().
        if not grid_objectives:
            grid_objectives = [None]

        # Run each featureset-learner-objective combination
        for featureset, featureset_name in zip(featuresets, featureset_names):
            for learner_num, learner_name in enumerate(learners):
                for grid_objective in grid_objectives:

                    # for the individual job name, we need to add the
                    # feature set name and the learner name
                    if grid_objective is None or len(grid_objectives) == 1:
                        job_name_components = [experiment_name,
                                               featureset_name,
                                               learner_name]
                    else:
                        job_name_components = [experiment_name,
                                               featureset_name,
                                               learner_name,
                                               grid_objective]
                    job_name = '_'.join(job_name_components)

                    # change the prediction prefix to include the feature set
                    prediction_prefix = join(prediction_dir, job_name)

                    # the log file that stores the actual output of this
                    # script (e.g., the tuned parameters, what kind of
                    # experiment was run, etc.)
                    logfile = join(log_path, '{}.log'.format(job_name))

                    # Figure out result json file path
                    result_json_path = join(
                        results_path, '{}.results.json'.format(job_name))

                    # save the path to the results json file that will be
                    # written
                    result_json_paths.append(result_json_path)

                    # If result file already exists and we're resuming,
                    # move on
                    if resume and (exists(result_json_path) and
                                   getsize(result_json_path)):
                        logger.info('Running in resume mode and %s exists, '
                                    'so skipping job.', result_json_path)
                        continue

                    # create job if we're doing things on the grid
                    job_args = {}
                    job_args["experiment_name"] = experiment_name
                    job_args["task"] = task
                    job_args["sampler"] = sampler
                    job_args["feature_hasher"] = feature_hasher
                    job_args["hasher_features"] = hasher_features
                    job_args["job_name"] = job_name
                    job_args["featureset"] = featureset
                    job_args["featureset_name"] = featureset_name
                    job_args["learner_name"] = learner_name
                    job_args["train_path"] = train_path
                    job_args["test_path"] = test_path
                    job_args["train_set_name"] = train_set_name
                    job_args["test_set_name"] = test_set_name
                    job_args["shuffle"] = do_shuffle
                    job_args["model_path"] = model_path
                    job_args["prediction_prefix"] = prediction_prefix
                    job_args["grid_search"] = do_grid_search
                    job_args["grid_objective"] = grid_objective
                    job_args['output_metrics'] = output_metrics
                    job_args["suffix"] = suffix
                    job_args["log_file"] = logfile
                    job_args["log_level"] = log_level
                    job_args["probability"] = probability
                    job_args["pipeline"] = pipeline
                    job_args["results_path"] = results_path
                    job_args["sampler_parameters"] = (
                        fixed_sampler_parameters
                        if fixed_sampler_parameters else dict())
                    job_args["fixed_parameters"] = (
                        fixed_parameter_list[learner_num]
                        if fixed_parameter_list else dict())
                    job_args["param_grid"] = (param_grid_list[learner_num]
                                              if param_grid_list else None)
                    job_args["pos_label_str"] = pos_label_str
                    job_args["overwrite"] = overwrite
                    job_args["feature_scaling"] = feature_scaling
                    job_args["min_feature_count"] = min_feature_count
                    job_args["grid_search_jobs"] = grid_search_jobs
                    job_args["grid_search_folds"] = grid_search_folds
                    job_args["folds_file"] = folds_file
                    job_args["cv_folds"] = cv_folds
                    job_args["save_cv_folds"] = save_cv_folds
                    job_args["save_cv_models"] = save_cv_models
                    job_args["use_folds_file_for_grid_search"] = \
                        use_folds_file_for_grid_search
                    job_args["do_stratified_folds"] = do_stratified_folds
                    job_args["label_col"] = label_col
                    job_args["id_col"] = id_col
                    job_args["ids_to_floats"] = ids_to_floats
                    job_args["quiet"] = quiet
                    job_args["class_map"] = class_map
                    job_args["custom_learner_path"] = custom_learner_path
                    job_args["custom_metric_path"] = custom_metric_path
                    job_args["learning_curve_cv_folds"] = \
                        learning_curve_cv_folds_list[learner_num]
                    job_args["learning_curve_train_sizes"] = \
                        learning_curve_train_sizes

                    if not local:
                        jobs.append(
                            Job(_classify_featureset,
                                [job_args],
                                num_slots=(MAX_CONCURRENT_PROCESSES
                                           if (do_grid_search or
                                               task == 'learning_curve')
                                           else 1),
                                name=job_name,
                                queue=queue))
                    else:
                        _classify_featureset(job_args)

        # Call get_skll_logger again after _classify_featureset
        # calls are finished so that any warnings that may
        # happen after this point get correctly logged to the
        # main logger
        logger = get_skll_logger('experiment')

        # submit the jobs (if running on grid)
        if not local and _HAVE_GRIDMAP:
            if log_path:
                job_results = process_jobs(jobs, white_list=hosts,
                                           temp_dir=log_path)
            else:
                job_results = process_jobs(jobs, white_list=hosts)
            _check_job_results(job_results)

        # write out the summary results file
        if (task == 'cross_validate' or task == 'evaluate') and write_summary:
            summary_file_name = experiment_name + '_summary.tsv'
            with open(join(results_path, summary_file_name), 'w',
                      newline='') as output_file:
                _write_summary_file(result_json_paths,
                                    output_file,
                                    ablation=ablation)
        elif task == 'learning_curve':
            output_file_name = experiment_name + '_summary.tsv'
            output_file_path = join(results_path, output_file_name)
            with open(output_file_path, 'w', newline='') as output_file:
                _write_learning_curve_file(result_json_paths, output_file)

            # generate the actual plot if we have the requirements installed
            generate_learning_curve_plots(experiment_name,
                                          results_path,
                                          output_file_path)

    finally:
        # Close/remove any logger handlers
        close_and_remove_logger_handlers(get_skll_logger('experiment'))

    return result_json_paths
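
# NOTE: Illustrative sketch, not part of the SKLL API beyond
# ``run_configuration()`` itself. It shows the typical entry point: run all
# jobs described by a configuration file locally (``local=True``) and get
# back the list of per-job ``.results.json`` paths. The configuration file
# path below is hypothetical.
def _example_run_configuration_locally(config_path='my_experiment.cfg'):
    result_json_paths = run_configuration(config_path,
                                          local=True,
                                          overwrite=True,
                                          write_summary=True)
    return result_json_paths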
def parse_config_file(config_path, log_level=logging.INFO):
    """
    Parse a SKLL experiment configuration file at the given path.
    Log messages with the given log level (default: INFO).

    Parameters
    ----------
    config_path : str
        The path to the configuration file.
    log_level : logging level, optional
        The logging level to use.
        Defaults to ``logging.INFO``.

    Returns
    -------
    experiment_name : str
        A string used to identify this particular experiment configuration.
        When generating result summary files, this name helps prevent
        overwriting previous summaries.
    task : str
        The type of experiment we're trying to run (e.g. 'cross_validate').
    sampler : str
        The name of a sampler to perform non-linear transformations of the
        input.
    fixed_sampler_parameters : dict
        A dictionary containing parameters you want to have fixed for the
        sampler.
    feature_hasher : bool
        If True, this enables a high-speed, low-memory vectorizer that uses
        feature hashing for converting feature dictionaries into NumPy
        arrays instead of using a DictVectorizer.
    hasher_features : int
        The number of features used by the FeatureHasher if the
        feature_hasher flag is enabled.
    id_col : str
        The column with IDs.
    label_col : str
        The column with labels.
    train_set_name : str
        The name of the training set.
    test_set_name : str
        The name of the test set.
    suffix : str
        The file format the training/test files are in.
    featuresets : list of lists of str
        A list of lists of prefixes for the files containing the features
        you would like to train/test on.
    do_shuffle : bool
        Whether to shuffle the data.
    model_path : str
        The path to the model file(s).
    do_grid_search : bool
        Whether to perform grid search.
    grid_objectives : list of str
        A list of scoring functions to use for tuning.
    probability : bool
        Whether to output probabilities for each class.
    pipeline : bool
        Whether to include the `pipeline` attribute in the trained model.
        This will increase the size of the model file.
    results_path : str
        Path to store result files in.
    pos_label_str : str
        The string label for the positive class in the binary classification
        setting.
    feature_scaling : str
        How to scale features (e.g. 'with_mean').
    min_feature_count : int
        The minimum number of examples for which the value of a feature must
        be nonzero to be included in the model.
    folds_file : str
        The path to the folds_file, if specified.
    grid_search_jobs : int
        Number of folds to run in parallel when using grid search.
    grid_search_folds : int
        The number of folds to use for grid search.
    cv_folds : dict or int
        The specified folds mapping, or the number of folds.
    save_cv_folds : bool
        Whether to save CV folds to file.
    save_cv_models : bool
        Whether to save CV models.
    use_folds_file_for_grid_search : bool
        Whether to use the folds file for grid search.
    do_stratified_folds : bool
        Whether to use stratified folds during cross-validation.
    fixed_parameter_list : list of dict
        List of dicts containing parameters you want to have fixed for each
        classifier in the learners list.
    param_grid_list : list of dict
        List of parameter grids to search, one dict for each learner.
    featureset_names : list of str
        The names of the featuresets used for each job.
    learners : list of str
        A list of learners to try using.
    prediction_dir : str
        The directory where predictions are saved.
    log_path : str
        The path to the log file.
    train_path : str
        The path to a file containing features to train on.
    test_path : str
        The path to a file containing features to test on.
    ids_to_floats : bool
        Whether to convert IDs to floats.
    class_map : dict
        A class map collapsing several labels into one.
    custom_learner_path : str
        Path to a .py file that defines a custom learner.
    custom_metric_path : str
        Path to a .py file that defines a custom metric.
    learning_curve_cv_folds_list : list of int
        A list of integers specifying the number of folds to use for
        learning-curve cross-validation, one per learner.
    learning_curve_train_sizes : list of float or list of int
        List of floats or integers representing relative or absolute numbers
        of training examples that will be used to generate the learning
        curve, respectively.
    output_metrics : list
        A list of output metrics to use.

    Raises
    ------
    IOError
        If the configuration file name is empty.
    ValueError
        If various configuration parameters are incorrectly specified, or
        cause conflicts.
    """

    # check that config_path is not empty
    if config_path == "":
        raise IOError("The name of the configuration file is empty")

    # compute the absolute path for the config file
    config_path = realpath(config_path)
    config_dir = dirname(config_path)

    # set up a config parser with the above default values
    config = _setup_config_parser(config_path)

    # extract parameters from the various sections in the config file

    ######################
    # 1. General section #
    ######################
    if config.has_option("General", "experiment_name"):
        experiment_name = config.get("General", "experiment_name")
    else:
        raise ValueError("Configuration file does not contain "
                         "experiment_name in the [General] section.")

    # next, get the log path before anything else since we need to
    # save all logging messages to a log file in addition to displaying
    # them on the console
    try:
        log_path = locate_file(config.get("Output", "log"), config_dir)
    except IOError as e:
        if e.errno == errno.ENOENT:
            log_path = e.filename
            os.makedirs(log_path)

    # Create a top-level log file under the log path
    main_log_file = join(log_path, '{}.log'.format(experiment_name))

    # Now create a SKLL logger that will log to this file as well
    # as to the console. Use the log level provided - note that
    # we only have to do this the first time we call `get_skll_logger()`
    # with a given name.
    logger = get_skll_logger('experiment',
                             filepath=main_log_file,
                             log_level=log_level)

    if config.has_option("General", "task"):
        task = config.get("General", "task")
    else:
        raise ValueError("Configuration file does not contain task in the "
                         "[General] section.")
    if task not in VALID_TASKS:
        raise ValueError('An invalid task was specified: {}. Valid tasks are:'
                         ' {}'.format(task, ', '.join(VALID_TASKS)))

    ####################
    # 2. Input section #
    ####################
    sampler = config.get("Input", "sampler")
    if sampler not in VALID_SAMPLERS:
        raise ValueError('An invalid sampler was specified: {}. Valid '
                         'samplers are: {}'.format(sampler,
                                                   ', '.join(VALID_SAMPLERS)))

    # raise an error if feature_hasher is set but hasher_features
    # is less than or equal to zero.
    feature_hasher = config.getboolean("Input", "feature_hasher")
    hasher_features = config.getint("Input", "hasher_features")
    if feature_hasher:
        if hasher_features <= 0:
            raise ValueError("Configuration file must specify a non-zero "
                             "value for the option hasher_features when "
                             "feature_hasher is True.")
    # produce a warning if hasher_features is set but feature_hasher
    # is not set correctly
    elif hasher_features > 0:
        logger.warning("Ignoring hasher_features since feature_hasher is "
                       "either missing or set to False.")

    if config.has_option("Input", "learners"):
        learners_string = config.get("Input", "learners")
    else:
        raise ValueError("Configuration file does not contain list of "
                         "learners in [Input] section.")
    learners = yaml.safe_load(fix_json(learners_string))
    if len(learners) == 0:
        raise ValueError("Configuration file contains an empty list of "
                         "learners in the [Input] section.")
    elif len(set(learners)) < len(learners):
        raise ValueError('Configuration file contains the same learner '
                         'multiple times, which is not currently supported. '
                         'Please use param_grids with tuning to find the '
                         'optimal settings for the learner.')
    custom_learner_path = locate_file(
        config.get("Input", "custom_learner_path"), config_dir)

    # get the custom metric path, if specified, and locate it
    custom_metric_path = locate_file(
        config.get("Input", "custom_metric_path"), config_dir)

    # get the featuresets
    featuresets_string = config.get("Input", "featuresets")
    featuresets = yaml.safe_load(fix_json(featuresets_string))

    # ensure that featuresets is either a list of features or a list of
    # lists of features
    if not isinstance(featuresets, list) or not all(
            isinstance(fs, list) for fs in featuresets):
        raise ValueError("The featuresets parameter should be a list of "
                         "features or a list of lists of features. You "
                         "specified: {}".format(featuresets))

    featureset_names = yaml.safe_load(
        fix_json(config.get("Input", "featureset_names")))

    # ensure that featureset_names is a list of strings, if specified
    if featureset_names:
        if (not isinstance(featureset_names, list) or
                not all([isinstance(fs, str) for fs in featureset_names])):
            raise ValueError("The featureset_names parameter should be a "
                             "list of strings. You specified: "
                             "{}".format(featureset_names))

    # get the value for learning_curve_cv_folds and ensure
    # that it's a list of the same length as the value of
    # learners. If it's not specified, then we just assume
    # that we are using 10 folds for each learner.
    learning_curve_cv_folds_list_string = config.get(
        "Input", "learning_curve_cv_folds_list")
    learning_curve_cv_folds_list = yaml.safe_load(
        fix_json(learning_curve_cv_folds_list_string))
    if len(learning_curve_cv_folds_list) == 0:
        learning_curve_cv_folds_list = [10] * len(learners)
    else:
        if (not isinstance(learning_curve_cv_folds_list, list) or
                not all([isinstance(fold, int)
                         for fold in learning_curve_cv_folds_list]) or
                not len(learning_curve_cv_folds_list) == len(learners)):
            raise ValueError("The learning_curve_cv_folds parameter should "
                             "be a list of integers of the same length as "
                             "the number of learners. You specified: "
                             "{}".format(learning_curve_cv_folds_list))

    # get the value for learning_curve_train_sizes and ensure
    # that it's a list of either integers (sizes) or
    # floats (proportions). If it's not specified, then we just
    # assume that we are using np.linspace(0.1, 1.0, 5).
    learning_curve_train_sizes_string = config.get(
        "Input", "learning_curve_train_sizes")
    learning_curve_train_sizes = yaml.safe_load(
        fix_json(learning_curve_train_sizes_string))
    if len(learning_curve_train_sizes) == 0:
        learning_curve_train_sizes = np.linspace(0.1, 1.0, 5).tolist()
    else:
        if (not isinstance(learning_curve_train_sizes, list) or
                not all([isinstance(size, int) or isinstance(size, float)
                         for size in learning_curve_train_sizes])):
            raise ValueError("The learning_curve_train_sizes parameter "
                             "should be a list of integers or floats. You "
                             "specified: "
                             "{}".format(learning_curve_train_sizes))

    # do we need to shuffle the training data
    do_shuffle = config.getboolean("Input", "shuffle")

    fixed_parameter_list = yaml.safe_load(
        fix_json(config.get("Input", "fixed_parameters")))
    fixed_sampler_parameters = fix_json(
        config.get("Input", "sampler_parameters"))
    fixed_sampler_parameters = yaml.safe_load(fixed_sampler_parameters)
    param_grid_list = yaml.safe_load(
        fix_json(config.get("Tuning", "param_grids")))

    # read and normalize the value of `pos_label_str`
    pos_label_str = safe_float(config.get("Tuning", "pos_label_str"))
    if pos_label_str == '':
        pos_label_str = None

    # ensure that feature_scaling is specified only as one of the
    # four available choices
    feature_scaling = config.get("Input", "feature_scaling")
    if feature_scaling not in VALID_FEATURE_SCALING_OPTIONS:
        raise ValueError("Invalid value for feature_scaling parameter: "
                         "{}".format(feature_scaling))

    suffix = config.get("Input", "suffix")
    label_col = config.get("Input", "label_col")
    id_col = config.get("Input", "id_col")
    ids_to_floats = config.getboolean("Input", "ids_to_floats")

    # if an external folds file is specified, then read it into a dictionary
    folds_file = locate_file(config.get("Input", "folds_file"), config_dir)
    num_cv_folds = config.getint("Input", "num_cv_folds")
    specified_folds_mapping = None
    specified_num_folds = None
    if folds_file:
        specified_folds_mapping = load_cv_folds(folds_file,
                                                ids_to_floats=ids_to_floats)
    else:
        # if no file is specified, then set the number of folds for
        # cross-validation
        specified_num_folds = num_cv_folds if num_cv_folds else 10

    # whether or not to save the cv fold ids/models
    save_cv_folds = config.getboolean("Output", "save_cv_folds")
    save_cv_models = config.getboolean("Output", "save_cv_models")

    # whether or not to do stratified cross validation
    random_folds = config.getboolean("Input", "random_folds")
    if random_folds:
        if folds_file:
            logger.warning('Specifying "folds_file" overrides '
                           '"random_folds".')
        do_stratified_folds = False
    else:
        do_stratified_folds = True

    # get all the input paths and directories (without trailing slashes)
    train_path = config.get("Input", "train_directory").rstrip(os.sep)
    test_path = config.get("Input", "test_directory").rstrip(os.sep)
    train_file = config.get("Input", "train_file")
    test_file = config.get("Input", "test_file")

    # make sure that featuresets is not an empty list unless
    # train_file and test_file are specified
    if not train_file and not test_file and (isinstance(featuresets, list) and
                                             len(featuresets) == 0):
        raise ValueError("The 'featuresets' parameter cannot be an empty "
                         "list.")

    # Either train_file or train_path must be specified ...
    if not train_file and not train_path:
        raise ValueError('Invalid [Input] parameters: either "train_file" or '
                         '"train_directory" must be specified in the '
                         'configuration file.')

    # ... but not both.
    if train_file and train_path:
        raise ValueError('Invalid [Input] parameters: only either '
                         '"train_file" or "train_directory" can be specified '
                         'in the configuration file, not both.')

    # Cannot specify both test_file and test_path
    if test_file and test_path:
        raise ValueError('Invalid [Input] parameters: only either "test_file"'
                         ' or "test_directory" can be specified in the '
                         'configuration file, not both.')

    # if train_file is specified, then assign its value to train_path;
    # this is a workaround to make this simple use case (a single train and
    # test file) compatible with the existing architecture using featuresets
    if train_file:
        train_path = train_file
        featuresets = [['train_{}'.format(basename(train_file))]]
        suffix = ''

    # if test_file is specified, then assign its value to test_path to
    # enable compatibility with the pre-existing featuresets architecture
    if test_file:
        test_path = test_file
        featuresets[0][0] += '_test_{}'.format(basename(test_file))

    # make sure all the specified paths/files exist
    train_path = locate_file(train_path, config_dir)
    test_path = locate_file(test_path, config_dir)

    # Get class mapping dictionary if specified
    class_map_string = config.get("Input", "class_map")
    original_class_map = yaml.safe_load(fix_json(class_map_string))
    if original_class_map:
        # Change class_map to map from originals to replacements instead of
        # from replacement to list of originals
        class_map = {}
        for replacement, original_list in original_class_map.items():
            for original in original_list:
                class_map[original] = replacement
        del original_class_map
    else:
        class_map = None

    #####################
    # 3. Output section #
    #####################
    probability = config.getboolean("Output", "probability")
    pipeline = config.getboolean("Output", "pipeline")

    # do we want to keep the predictions?
    # make sure the predictions path exists and if not create it
    try:
        prediction_dir = locate_file(config.get("Output", "predictions"),
                                     config_dir)
    except IOError as e:
        if e.errno == errno.ENOENT:
            prediction_dir = e.filename
            os.makedirs(prediction_dir)

    # make sure model path exists and if not, create it
    try:
        model_path = locate_file(config.get("Output", "models"), config_dir)
    except IOError as e:
        if e.errno == errno.ENOENT:
            model_path = e.filename
            os.makedirs(model_path)

    # make sure results path exists
    try:
        results_path = locate_file(config.get("Output", "results"),
                                   config_dir)
    except IOError as e:
        if e.errno == errno.ENOENT:
            results_path = e.filename
            os.makedirs(results_path)

    # what are the output metrics?
    output_metrics = config.get("Output", "metrics")
    output_metrics = _parse_and_validate_metrics(output_metrics,
                                                 'metrics',
                                                 logger=logger)

    #####################
    # 4. Tuning section #
    #####################

    # do we need to run a grid search for the hyperparameters or are we just
    # using the defaults?
    do_grid_search = config.getboolean("Tuning", "grid_search")

    # parse any provided grid objective functions
    grid_objectives = config.get("Tuning", "objectives")
    grid_objectives = _parse_and_validate_metrics(grid_objectives,
                                                  'objectives',
                                                  logger=logger)

    # if we are doing learning curves, we don't care about grid search
    if task == 'learning_curve' and do_grid_search:
        do_grid_search = False
        logger.warning("Grid search is not supported during learning curve "
                       "generation. Disabling.")

    # Check if `param_grids` is specified, but `do_grid_search` is False
    if param_grid_list and not do_grid_search:
        logger.warning('Since "grid_search" is set to False, the specified'
                       ' "param_grids" will be ignored.')

    # Warn user about potential conflicts between parameter values
    # specified in `fixed_parameter_list` and values specified in
    # `param_grid_list` (or values passed in by default) if
    # `do_grid_search` is True
    if do_grid_search and fixed_parameter_list:
        logger.warning('Note that "grid_search" is set to True and '
                       '"fixed_parameters" is also specified. If there '
                       'is a conflict between the grid search parameter '
                       'space and the fixed parameter values, the '
                       'fixed parameter values will take precedence.')

    # minimum number of examples a feature must be nonzero in to be included
    min_feature_count = config.getint("Tuning", "min_feature_count")

    # if an external folds file was specified, do we use the same folds file
    # for the inner grid-search in cross-validate as well?
    use_folds_file_for_grid_search = config.getboolean(
        "Tuning", "use_folds_file_for_grid_search")

    # how many jobs should we run in parallel for grid search
    grid_search_jobs = config.getint("Tuning", "grid_search_jobs")
    if not grid_search_jobs:
        grid_search_jobs = None

    # how many folds should we use for grid search
    grid_search_folds = config.getint("Tuning", "grid_search_folds")

    # check whether the right things are set for the given task
    if (task == 'evaluate' or task == 'predict') and not test_path:
        raise ValueError('The test set must be set when task is evaluate or '
                         'predict.')
    if task in ['cross_validate', 'evaluate', 'train']:
        if do_grid_search and len(grid_objectives) == 0:
            raise ValueError('Grid search is on. Either specify a list of '
                             'tuning objectives or set `grid_search` to '
                             '`false` in the Tuning section.')
        if not do_grid_search and len(grid_objectives) > 0:
            logger.warning('Since "grid_search" is set to False, any '
                           'specified "objectives" will be ignored.')
            grid_objectives = []
    if task in ['cross_validate', 'train', 'learning_curve'] and test_path:
        raise ValueError('The test set should not be set when task is '
                         '{}.'.format(task))
    if task in ['train', 'predict'] and results_path and not do_grid_search:
        raise ValueError('The results path should not be set when task is '
                         '{} and "grid_search" is set to '
                         'False.'.format(task))
    if task == 'train' and not model_path:
        raise ValueError('The model path should be set when task is train.')
    if task in ['learning_curve', 'train'] and prediction_dir:
        raise ValueError('The predictions path should not be set when task '
                         'is {}.'.format(task))
    if task == 'learning_curve' and model_path:
        raise ValueError('The models path should not be set when task is '
                         'learning_curve.')
    if task == 'learning_curve':
        if len(grid_objectives) > 0:
            raise ValueError('The "objectives" option is no longer supported '
                             'for the "learning_curve" task. Please use the '
                             '"metrics" option in the [Output] section '
                             'instead.')
        if len(output_metrics) == 0:
            raise ValueError('The "metrics" option must be set when '
                             'the task is "learning_curve".')

    # if any of the objectives or metrics require probabilities to be output,
    # probability must be specified as true
    specified_probabilistic_metrics = PROBABILISTIC_METRICS.intersection(
        grid_objectives + output_metrics)
    if specified_probabilistic_metrics and not probability:
        raise ValueError("The 'probability' option must be 'true' to compute "
                         "the following: "
                         "{}.".format(list(specified_probabilistic_metrics)))

    # set the folds appropriately based on the task:
    # (a) if the task is `train`/`evaluate`/`predict` and an external
    #     fold mapping is specified, then use that mapping for grid search
    #     instead of the value contained in `grid_search_folds`.
    # (b) if the task is `cross_validate` and an external fold mapping is
    #     specified, then use that mapping for the outer CV loop and for the
    #     inner grid-search loop. However, if
    #     `use_folds_file_for_grid_search` is `False`, do not use the fold
    #     mapping for the inner loop.
    cv_folds = None
    if task in ['train', 'evaluate', 'predict'] and specified_folds_mapping:
        grid_search_folds = specified_folds_mapping
        # only print out the warning if the user actually wants to do grid
        # search
        if do_grid_search:
            logger.warning('Specifying "folds_file" overrides both explicit '
                           'and default "grid_search_folds".')
    if task == 'cross_validate':
        cv_folds = (specified_folds_mapping if specified_folds_mapping
                    else specified_num_folds)
        if specified_folds_mapping:
            logger.warning('Specifying "folds_file" overrides both explicit '
                           'and default "num_cv_folds".')
            if use_folds_file_for_grid_search:
                grid_search_folds = cv_folds
            else:
                # only print out the warning if the user wants to do grid
                # search
                if do_grid_search:
                    logger.warning('The specified "folds_file" will not be '
                                   'used for inner grid search.')

    if save_cv_models is True and not model_path:
        raise ValueError('Output directory for models must be set if '
                         '"save_cv_models" is set to true.')

    # Create feature set names if unspecified
    if not featureset_names:
        featureset_names = [_munge_featureset_name(x) for x in featuresets]
    if len(featureset_names) != len(featuresets):
        raise ValueError(('Number of feature set names (%s) does not match '
                          'number of feature sets (%s).') %
                         (len(featureset_names), len(featuresets)))

    # store training/test set names for later use
    train_set_name = basename(train_path)
    test_set_name = basename(test_path) if test_path else "cv"

    return (experiment_name, task, sampler, fixed_sampler_parameters,
            feature_hasher, hasher_features, id_col, label_col,
            train_set_name, test_set_name, suffix, featuresets, do_shuffle,
            model_path, do_grid_search, grid_objectives, probability,
            pipeline, results_path, pos_label_str, feature_scaling,
            min_feature_count, folds_file, grid_search_jobs,
            grid_search_folds, cv_folds, save_cv_folds, save_cv_models,
            use_folds_file_for_grid_search, do_stratified_folds,
            fixed_parameter_list, param_grid_list, featureset_names, learners,
            prediction_dir, log_path, train_path, test_path, ids_to_floats,
            class_map, custom_learner_path, custom_metric_path,
            learning_curve_cv_folds_list, learning_curve_train_sizes,
            output_metrics)