# Example 1
    def main_cli(self):
        """
        Main cli, implementing comparison between and analysis of Configuration-results.

        Parses ``sys.argv[1:]``, expands the "all"/"none" shortcuts for the
        parameter-importance and feature-analysis selections, translates all
        flags into the ``analyzing_options``-mapping, constructs a ``CAVE``
        instance and runs the analysis.
        """
        # Reset logging module (needs to happen before logger initalization)
        logging.shutdown()
        reload(logging)

        # Some choice-blocks, that can be reused throughout the CLI
        p_choices = [
            "all", "ablation", "forward_selection", "fanova", "lpi", "none"
        ]
        # "average" plus the concrete methods (everything between "all" and "none")
        p_sort_by_choices = ["average"] + p_choices[1:-1]
        f_choices = [
            "all", "box_violin", "correlation", "clustering", "importance",
            "none"
        ]

        parser = ArgumentParser(
            formatter_class=SmartArgsDefHelpFormatter,
            add_help=False,
            description=
            'CAVE: Configuration Assessment Vizualisation and Evaluation')

        # Folders may be given either positionally or via --folders, never both
        req_opts = parser.add_mutually_exclusive_group(required=True)
        req_opts.add_argument(
            "folders",
            nargs='*',
            # strings prefixed with raw| can be manually split with \n
            help="raw|path(s) to Configurator output-directory/ies",
            default=SUPPRESS)

        req_opts.add_argument("--folders",
                              nargs='*',
                              dest='folders',
                              default=SUPPRESS,
                              help=SUPPRESS)

        cave_opts = parser.add_argument_group(
            "CAVE global options",
            "Options that configure the analysis in general and define behaviour."
        )
        cave_opts.add_argument(
            "--verbose_level",
            default="INFO",
            choices=["INFO", "DEBUG", "DEV_DEBUG", "WARNING", "OFF"],
            help=
            "verbose level. use DEV_DEBUG for development to filter boilerplate-logs from "
            "imported modules, use DEBUG for full logging. full debug-log always in "
            "'output/debug/debug.log' ")
        cave_opts.add_argument(
            "--jupyter",
            default='off',
            choices=['on', 'off'],
            help="output everything to jupyter, if available.")
        cave_opts.add_argument(
            "--validation",
            default="epm",
            # fixed: "epm" used to carry a trailing space, making the default
            # value impossible to select explicitly on the command line
            choices=["validation", "epm"],
            help=
            "how to complete missing runs for config/inst-pairs. epm trains random forest with "
            "available data to estimate missing runs, validation requires target algorithm. ",
            type=str.lower)
        cave_opts.add_argument(
            "--output",
            # timestamped default so repeated runs don't overwrite each other
            default="CAVE_output_%s" % (datetime.fromtimestamp(
                time.time()).strftime('%Y-%m-%d_%H:%M:%S_%f')),
            help="path to folder in which to save the HTML-report. ")
        cave_opts.add_argument("--seed",
                               default=42,
                               type=int,
                               help="random seed used throughout analysis. ")
        cave_opts.add_argument(
            "--file_format",
            default='auto',
            help="specify the format of the configurator-files. ",
            choices=['auto', 'SMAC2', 'SMAC3', 'CSV', 'BOHB'],
            type=str.upper)
        cave_opts.add_argument("--validation_format",
                               default='NONE',
                               help="what format the validation-files are in",
                               choices=['SMAC2', 'SMAC3', 'CSV', 'NONE'],
                               type=str.upper)
        cave_opts.add_argument(
            "--ta_exec_dir",
            default='.',
            help=
            "path to the execution-directory of the configurator run. this is the path from "
            "which the scenario is loaded, so the instance-/pcs-files specified in the "
            "scenario, so they are relative to this path "
            "(e.g. 'ta_exec_dir/path_to_train_inst_specified_in_scenario.txt'). ",
            nargs='+')
        # PIMP-configs
        pimp_opts = parser.add_argument_group(
            "Parameter Importance",
            "Define the behaviour of the ParameterImportance-module (pimp)")
        pimp_opts.add_argument(
            "--pimp_max_samples",
            default=-1,
            type=int,
            help="How many datapoints to use with PIMP. -1 -> use all. ")
        pimp_opts.add_argument("--pimp_no_fanova_pairs",
                               action="store_false",
                               dest="fanova_pairwise",
                               help="fANOVA won't compute pairwise marginals")
        pimp_opts.add_argument(
            "--pimp_sort_table_by",
            default="average",
            choices=p_sort_by_choices,
            help="raw|what kind of parameter importance method to "
            "use to sort the overview-table. ")

        cfp_opts = parser.add_argument_group(
            "Configurator Footprint", "Finetune the configurator footprint")
        cfp_opts.add_argument(
            "--cfp_time_slider",
            help="whether or not to have a time_slider-widget on cfp-plot"
            "INCREASES FILE-SIZE (and loading) DRAMATICALLY. ",
            choices=["on", "off"],
            default="off")
        cfp_opts.add_argument(
            "--cfp_number_quantiles",
            help=
            "number of quantiles that configurator footprint should plot over time. ",
            default=3,
            type=int)
        cfp_opts.add_argument(
            "--cfp_max_configurations_to_plot",
            help=
            "maximum number of configurations to be plotted in configurator footprint (in case "
            "you run into a MemoryError). -1 -> plot all. ",
            default=-1,
            type=int)

        pc_opts = parser.add_argument_group(
            "Parallel Coordinates",
            "Finetune the parameter parallel coordinates")
        pc_opts.add_argument(
            "--pc_sort_by",
            help=
            "parameter-importance method to determine the order (and selection) of parameters "
            "for parallel coordinates. all: aggregate over all available methods. uses random "
            "method if none is given. ",
            default="all",
            type=str.lower,
            choices=p_choices)

        cot_opts = parser.add_argument_group(
            "Cost Over Time", "Finetune the cost over time plot")
        cot_opts.add_argument(
            "--cot_inc_traj",
            help=
            "if the optimizer belongs to HpBandSter (e.g. bohb), you can choose how the "
            "incumbent-trajectory will be interpreted with regards to the budget. You can "
            "choose from 'racing', which will only accept a configuration of a higher budget "
            "than the current incumbent's if the current incumbent has been evaluated on "
            "the higher budget; 'minimum', which will only look at the current performance "
            "no matter the budget; and 'prefer_higher_budget', which will always choose "
            "a configuration on a higher budget as incumbent as soon as it is available "
            "(this will likely lead to peaks, whenever a new budget is evaluated)",
            default="racing",
            type=str.lower,
            choices=["racing", "minimum", "prefer_higher_budget"])

        # General analysis to be carried out
        act_opts = parser.add_argument_group(
            "Analysis", "Which analysis methods should be carried out")
        act_opts.add_argument(
            "--parameter_importance",
            default="all",
            nargs='+',
            help="raw|what kind of parameter importance method to "
            "use. Choose any combination of\n[" + ', '.join(p_choices[1:-1]) +
            "] or set it to "
            "all/none",
            choices=p_choices,
            type=str.lower)
        act_opts.add_argument(
            "--feature_analysis",
            default="all",
            nargs='+',
            help="raw|what kind of feature analysis methods to use. "
            "Choose any combination of\n[" + ', '.join(f_choices[1:-1]) +
            "] or set it to "
            "all/none",
            choices=f_choices,
            type=str.lower)
        act_opts.add_argument("--no_performance_table",
                              action='store_false',
                              help="don't create performance table.",
                              dest='performance_table')
        act_opts.add_argument("--no_ecdf",
                              action='store_false',
                              help="don't plot ecdf.",
                              dest='ecdf')
        act_opts.add_argument("--no_scatter_plots",
                              action='store_false',
                              help="don't plot scatter plots.",
                              dest='scatter')
        act_opts.add_argument("--no_cost_over_time",
                              action='store_false',
                              help="don't plot cost over time.",
                              dest='cost_over_time')
        act_opts.add_argument("--no_configurator_footprint",
                              action='store_false',
                              help="don't plot configurator footprint.",
                              dest='configurator_footprint')
        act_opts.add_argument("--no_parallel_coordinates",
                              action='store_false',
                              help="don't plot parallel coordinates.",
                              dest='parallel_coordinates')
        act_opts.add_argument("--no_algorithm_footprints",
                              action='store_false',
                              help="don't plot algorithm footprints.",
                              dest='algorithm_footprints')
        act_opts.add_argument("--no_budget_correlation",
                              action='store_false',
                              help="don't plot budget correlation.",
                              dest='budget_correlation')
        # NOTE(review): unlike its siblings this flag lacks the "no_"-prefix
        # although it DISABLES the plot (store_false). Renaming would break
        # existing command lines, so it is kept as-is.
        act_opts.add_argument("--bohb_learning_curves",
                              action='store_false',
                              help="don't plot bohb learning curves.",
                              dest='bohb_learning_curves')
        act_opts.add_argument("--no_incumbents_over_budgets",
                              action='store_false',
                              help="don't plot incumbents over budgets.",
                              dest='incumbents_over_budgets')

        spe_opts = parser.add_argument_group("Meta arguments")
        spe_opts.add_argument('-v',
                              '--version',
                              action='version',
                              version='%(prog)s ' + str(v),
                              help="show program's version number and exit.")
        spe_opts.add_argument('-h',
                              '--help',
                              action="help",
                              help="show this help message and exit")

        args_ = parser.parse_args(sys.argv[1:])

        # Expand configs ("all" wins over "none" if both are given)
        if "all" in args_.parameter_importance:
            param_imp = ["ablation", "forward_selection", "fanova", "lpi"]
        elif "none" in args_.parameter_importance:
            param_imp = []
        else:
            param_imp = args_.parameter_importance

        # fANOVA is an optional dependency -- fail early with install hint
        if "fanova" in param_imp:
            try:
                import fanova  # noqa
            except ImportError:
                raise ImportError(
                    'fANOVA is not installed! To install it please run '
                    '"git+http://github.com/automl/fanova.git@master"')

        # The table can only be sorted by a method that will actually run
        if not (args_.pimp_sort_table_by == "average"
                or args_.pimp_sort_table_by in param_imp):
            raise ValueError("Pimp comparison sorting key is {}, but this "
                             "method is deactivated or non-existent.".format(
                                 args_.pimp_sort_table_by))

        if "all" in args_.feature_analysis:
            feature_analysis = [
                "box_violin", "correlation", "importance", "clustering"
            ]
        elif "none" in args_.feature_analysis:
            feature_analysis = []
        else:
            feature_analysis = args_.feature_analysis

        output_dir = args_.output

        # Configuration results to be analyzed (expand glob-patterns)
        folders = []
        for f in args_.folders:
            if '*' in f:
                folders.extend(list(glob.glob(f, recursive=True)))
            else:
                folders.append(f)
        # Default ta_exec_dir is cwd
        ta_exec_dir = []
        for t in args_.ta_exec_dir:
            if '*' in t:
                ta_exec_dir.extend(list(glob.glob(t, recursive=True)))
            else:
                ta_exec_dir.append(t)

        file_format = args_.file_format
        validation_format = args_.validation_format
        validation = args_.validation
        seed = args_.seed
        verbose_level = args_.verbose_level
        show_jupyter = args_.jupyter == 'on'

        # Load the format-specific defaults ("auto" triggers detection)
        analyzing_options = load_default_options(
            file_format=detect_fileformat(folders) if file_format.upper() ==
            "AUTO" else file_format)

        # Translate CLI flags into the (string-valued) analyzing_options
        analyzing_options["Ablation"]["run"] = str('ablation' in param_imp)
        analyzing_options["Algorithm Footprint"]["run"] = str(
            args_.algorithm_footprints)
        analyzing_options["Budget Correlation"]["run"] = str(
            args_.budget_correlation)
        analyzing_options["BOHB Learning Curves"]["run"] = str(
            args_.bohb_learning_curves)
        analyzing_options["Configurator Footprint"]["run"] = str(
            args_.configurator_footprint)
        analyzing_options["Configurator Footprint"]["time_slider"] = str(
            args_.cfp_time_slider)
        analyzing_options["Configurator Footprint"]["number_quantiles"] = str(
            args_.cfp_number_quantiles)
        analyzing_options["Configurator Footprint"][
            "max_configurations_to_plot"] = str(
                args_.cfp_max_configurations_to_plot)
        analyzing_options["Cost Over Time"]["run"] = str(args_.cost_over_time)
        analyzing_options["Cost Over Time"]["incumbent_trajectory"] = str(
            args_.cot_inc_traj)
        analyzing_options["empirical Cumulative Distribution Function (eCDF)"][
            "run"] = str(args_.ecdf)
        analyzing_options["fANOVA"]["run"] = str('fanova' in param_imp)
        analyzing_options["fANOVA"]["fanova_pairwise"] = str(
            args_.fanova_pairwise)
        analyzing_options["fANOVA"]["pimp_max_samples"] = str(
            args_.pimp_max_samples)
        analyzing_options["Feature Clustering"]["run"] = str(
            'clustering' in feature_analysis)
        analyzing_options["Feature Correlation"]["run"] = str(
            'correlation' in feature_analysis)
        analyzing_options["Feature Importance"]["run"] = str(
            'importance' in feature_analysis)
        analyzing_options["Forward Selection"]["run"] = str(
            'forward_selection' in param_imp)
        analyzing_options["Importance Table"]["sort_table_by"] = str(
            args_.pimp_sort_table_by)
        analyzing_options["Incumbents Over Budgets"]["run"] = str(
            args_.incumbents_over_budgets)
        analyzing_options["Local Parameter Importance (LPI)"]["run"] = str(
            'lpi' in param_imp)
        analyzing_options["Parallel Coordinates"]["run"] = str(
            args_.parallel_coordinates)
        analyzing_options["Parallel Coordinates"]["pc_sort_by"] = str(
            args_.pc_sort_by)
        analyzing_options["Performance Table"]["run"] = str(
            args_.performance_table)

        cave = CAVE(
            folders,
            output_dir,
            ta_exec_dir,
            file_format=file_format,
            validation_format=validation_format,
            validation_method=validation,
            show_jupyter=show_jupyter,
            seed=seed,
            verbose_level=verbose_level,
            analyzing_options=analyzing_options,
        )

        # Check if CAVE was successfully initialized (logger only exists then)
        try:
            cave.logger.debug("CAVE is called with arguments: " + str(args_))
        except AttributeError:
            logging.getLogger().warning(
                "Something went wrong with CAVE-initialization... (it's fine for running nosetests)"
            )
            logging.getLogger().debug("CAVE is called with arguments: " +
                                      str(args_))

        # Analyze
        cave.analyze()
# Example 2
    def main_cli(self):
        """
        Main cli, implementing comparison between and analysis of Configurator-results.

        Parses ``sys.argv[1:]``, interprets the mutually exclusive
        ``--only``/``--skip`` selections via ``map_options``, translates all
        flags into the ``analyzing_options``-mapping, constructs a ``CAVE``
        instance and runs the analysis.
        """
        # Reset logging module (needs to happen before logger initalization)
        logging.shutdown()
        reload(logging)

        # Those are the options for the --only / --skip flags
        # (cli-keyword -> analyzing_options section name)
        map_options = {
            'performance_table': 'Performance Table',
            'ecdf': 'empirical Cumulative Distribution Function (eCDF)',
            'scatter_plot': 'Scatter Plot',
            'cost_over_time': 'Cost Over Time',
            'configurator_footprint': 'Configurator Footprint',
            'parallel_coordinates': 'Parallel Coordinates',
            'algorithm_footprints': 'Algorithm Footprint',
            'budget_correlation': 'Budget Correlation',
            'bohb_learning_curves': 'BOHB Learning Curves',
            'incumbents_over_budgets': 'Incumbents Over Budgets',
            # Parameter Importance:
            'fanova': 'fANOVA',
            'ablation': 'Ablation',
            'lpi': 'Local Parameter Importance (LPI)',
            'local_parameter_importance': 'Local Parameter Importance (LPI)',
            'forward_selection': 'Forward Selection',
            # Feature Importance
            'clustering': "Feature Clustering",
            'correlation': "Feature Correlation",
            'importance': "Feature Importance",
            'box_violin': "Violin and Box Plots",
        }

        parser = ArgumentParser(formatter_class=SmartArgsDefHelpFormatter,
                                add_help=False,
                                description='CAVE: Configuration Assessment Vizualisation and Evaluation')

        req_opts = parser.add_mutually_exclusive_group(required=True)  # Either positional or keyword folders option
        req_opts.add_argument("folders",
                              nargs='*',
                              # strings prefixed with raw| can be manually split with \n
                              help="raw|path(s) to Configurator output-directory/ies",
                              default=SUPPRESS)

        req_opts.add_argument("--folders",
                              nargs='*',
                              dest='folders',
                              default=SUPPRESS,
                              help=SUPPRESS)

        cave_opts = parser.add_argument_group("CAVE global options",
                                              "Options that configure the analysis in general and define behaviour.")
        cave_opts.add_argument("--verbose_level",
                              default="INFO",
                              choices=[
                                  "INFO",
                                  "DEBUG",
                                  "DEV_DEBUG",
                                  "WARNING",
                                  "OFF"
                              ],
                              help="verbose level. use DEV_DEBUG for development to filter boilerplate-logs from "
                                   "imported modules, use DEBUG for full logging. full debug-log always in "
                                   "'output/debug/debug.log' ")
        cave_opts.add_argument("--jupyter",
                               default='off',
                               choices=['on', 'off'],
                               help="output everything to jupyter, if available."
                               )
        cave_opts.add_argument("--validation",
                               default="epm",
                               # fixed: "epm" used to carry a trailing space, making
                               # the default value impossible to select explicitly
                               choices=[
                                   "validation",
                                   "epm"
                               ],
                               help="how to complete missing runs for config/inst-pairs. epm trains random forest with "
                                    "available data to estimate missing runs, validation requires target algorithm. ",
                               type=str.lower)
        cave_opts.add_argument("--output",
                               # timestamped default so repeated runs don't overwrite each other
                               default="CAVE_output_%s" % (
                                        datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d_%H:%M:%S_%f')),
                               help="path to folder in which to save the HTML-report. ")
        cave_opts.add_argument("--seed",
                               default=42,
                               type=int,
                               help="random seed used throughout analysis. ")
        cave_opts.add_argument("--file_format",
                               default='auto',
                               help="specify the format of the configurator-files. ",
                               choices=['auto', 'SMAC2', 'SMAC3', 'CSV', 'BOHB', 'APT'],
                               type=str.upper)
        cave_opts.add_argument("--validation_format",
                               default='NONE',
                               help="what format the validation-files are in",
                               choices=['SMAC2', 'SMAC3', 'CSV', 'NONE'],
                               type=str.upper)
        cave_opts.add_argument("--ta_exec_dir",
                               default='.',
                               help="path to the execution-directory of the configurator run. this is the path from "
                                    "which the scenario is loaded, so the instance-/pcs-files specified in the "
                                    "scenario, so they are relative to this path "
                                    "(e.g. 'ta_exec_dir/path_to_train_inst_specified_in_scenario.txt'). ",
                               nargs='+')

        # PIMP-configs
        pimp_opts = parser.add_argument_group("Parameter Importance",
                                              "Define the behaviour of the ParameterImportance-module (pimp)")

        pimp_opts.add_argument("--pimp_interactive",
                               choices=["on", "off"],
                               default="on",
                               help="Whether or not to plot interactive bokeh plots for parameter importance analysis")
        pimp_opts.add_argument("--pimp_whiskers",
                               choices=["on", "off"],
                               default="on",
                               help="Whether or not to plot interactive whisker plot for parameter importance analysis")
        pimp_opts.add_argument("--pimp_max_samples",
                               default=-1,
                               type=int,
                               help="How many datapoints to use with PIMP. -1 -> use all. ")
        pimp_opts.add_argument("--pimp_no_fanova_pairs",
                               action="store_false",
                               dest="fanova_pairwise",
                               help="fANOVA won't compute pairwise marginals")

        cfp_opts = parser.add_argument_group("Configurator Footprint", "Fine-tune the configurator footprint")
        cfp_opts.add_argument("--cfp_time_slider",
                              help="whether or not to have a time_slider-widget on cfp-plot"
                                   "INCREASES FILE-SIZE (and loading) DRAMATICALLY. ",
                              choices=["on", "off"],
                              default="off")
        cfp_opts.add_argument("--cfp_number_quantiles",
                              help="number of quantiles that configurator footprint should plot over time. ",
                              default=3, type=int)
        cfp_opts.add_argument("--cfp_max_configurations_to_plot",
                              help="maximum number of configurations to be plotted in configurator footprint (in case "
                                   "you run into a MemoryError). -1 -> plot all. ",
                              default=-1, type=int)

        pc_opts = parser.add_argument_group("Parallel Coordinates", "Fine-tune the parameter parallel coordinates")
        # TODO: this choice should be integrated into the bokeh plot
        pc_opts.add_argument("--pc_sort_by",
                             help="parameter-importance method to determine the order (and selection) of parameters "
                                  "for parallel coordinates. all: aggregate over all available methods. uses random "
                                  "method if none is given. ",
                             default="all", type=str.lower,
                             choices=['fanova', 'lpi', 'ablation', 'forward_selection', 'all'])

        cot_opts = parser.add_argument_group("Cost Over Time", "Fine-tune the cost over time plot")
        cot_opts.add_argument("--cot_inc_traj",
                              help="if the optimizer belongs to HpBandSter (e.g. bohb), you can choose how the "
                                   "incumbent-trajectory will be interpreted with regards to the budget. You can "
                                   "choose from 'racing', which will only accept a configuration of a higher budget "
                                   "than the current incumbent's if the current incumbent has been evaluated on "
                                   "the higher budget; 'minimum', which will only look at the current performance "
                                   "no matter the budget; and 'prefer_higher_budget', which will always choose "
                                   "a configuration on a higher budget as incumbent as soon as it is available "
                                   "(this will likely lead to peaks, whenever a new budget is evaluated)",
                              default="racing", type=str.lower,
                              choices=["racing", "minimum", "prefer_higher_budget"])

        # General analysis to be carried out (--only and --skip are mutually exclusive)
        default_opts = parser.add_mutually_exclusive_group()
        default_opts.add_argument("--only",
                                  nargs='*',
                                  help='perform only these analysis methods. choose from: {}'.format(
                                      ", ".join(sorted(map_options.keys()))
                                  ),
                                  default=[],
                                  )
        default_opts.add_argument("--skip",
                                  nargs='*',
                                  help='perform all but these analysis methods. choose from: {}'.format(
                                      ", ".join(sorted(map_options.keys()))
                                  ),
                                  default=[]
                                  )

        spe_opts = parser.add_argument_group("Meta arguments")
        spe_opts.add_argument('-v', '--version', action='version',
                              version='%(prog)s ' + str(v), help="show program's version number and exit.")
        spe_opts.add_argument('-h', '--help', action="help", help="show this help message and exit")

        # Parse arguments and save to args_
        args_ = parser.parse_args(sys.argv[1:])

        # Configuration results to be analyzed (expand glob-patterns)
        folders = []
        for f in args_.folders:
            if '*' in f:
                folders.extend(list(glob.glob(f, recursive=True)))
            else:
                folders.append(f)
        # Default ta_exec_dir is cwd
        ta_exec_dir = []
        for t in args_.ta_exec_dir:
            if '*' in t:
                ta_exec_dir.extend(list(glob.glob(t, recursive=True)))
            else:
                ta_exec_dir.append(t)

        output_dir = args_.output
        file_format = args_.file_format
        validation_format = args_.validation_format
        validation = args_.validation
        seed = args_.seed
        verbose_level = args_.verbose_level
        show_jupyter = args_.jupyter == 'on'

        # Load default options for this file_format ("auto" triggers detection)
        analyzing_options = load_default_options(file_format=detect_fileformat(folders)
                                                 if file_format.upper() == "AUTO" else file_format)

        # Interpret the --skip and --only flags
        if len(args_.only) > 0:
            # --only: start from "run nothing", then re-enable the selected ones below
            for o in map_options.values():
                analyzing_options[o]["run"] = str(False)
        for o in args_.only if len(args_.only) > 0 else args_.skip:
            if o.lower() not in map_options:
                raise ValueError("Failed to interpret `--[only|skip] {}`.\n"
                                 "Please choose from:\n  {}".format(o, '\n  '.join(sorted(map_options.keys()))))
            # Set True if flag is --only and False if flag is --skip
            analyzing_options[map_options[o.lower()]]["run"] = str(len(args_.only) > 0)

        # Fine-tuning individual analyzer options
        analyzing_options["Configurator Footprint"]["time_slider"] = str(args_.cfp_time_slider)
        analyzing_options["Configurator Footprint"]["number_quantiles"] = str(args_.cfp_number_quantiles)
        analyzing_options["Configurator Footprint"]["max_configurations_to_plot"] = str(args_.cfp_max_configurations_to_plot)
        analyzing_options["Cost Over Time"]["incumbent_trajectory"] = str(args_.cot_inc_traj)
        analyzing_options["fANOVA"]["fanova_pairwise"] = str(args_.fanova_pairwise)
        analyzing_options["fANOVA"]["pimp_max_samples"] = str(args_.pimp_max_samples)
        analyzing_options["Parallel Coordinates"]["pc_sort_by"] = str(args_.pc_sort_by)
        analyzing_options["Parameter Importance"]["whisker_quantiles_plot"] = str(args_.pimp_whiskers)
        analyzing_options["Parameter Importance"]["interactive_bokeh_plots"] = str(args_.pimp_interactive)

        # Initialize CAVE
        cave = CAVE(folders,
                    output_dir,
                    ta_exec_dir,
                    file_format=file_format,
                    validation_format=validation_format,
                    validation_method=validation,
                    show_jupyter=show_jupyter,
                    seed=seed,
                    verbose_level=verbose_level,
                    analyzing_options=analyzing_options,
                    )

        # Check if CAVE was successfully initialized (logger only exists then)
        try:
            cave.logger.debug("CAVE is called with arguments: " + str(args_))
        except AttributeError:
            logging.getLogger().warning("Error in CAVE-initialization... (it's fine for running nosetests)")
            logging.getLogger().debug("CAVE is called with arguments: " + str(args_))

        # Analyze (with options defined in initialization via the analyzing_options
        cave.analyze()
# Example 3
    def __init__(self,
                 folders,
                 ta_exec_dirs=None,
                 output_dir=None,
                 file_format=None,
                 validation_format=None,
                 analyzing_options=None,
                 ):
        """
        Reads in optimizer runs. Converts data if necessary.
        There will be `(n_budgets +1) * (m_parallel_execution + 1)` ConfiguratorRuns in CAVE, each representing the data
        of a specific budget-parallel-execution combination or an aggregated version.

        Aggregated entries can be accessed via a None-key.

        pr: parallel run, b: budget, agg: aggregated

        +----------+------+-----+------+-----------+
        |          | pr_1 | ... | pr_m | agg (None)|
        +==========================================+
        |b_1       |      |     |      |           +
        +----------+------+-----+------+-----------+
        |...       |      |     |      |           +
        +----------+------+-----+------+-----------+
        |b_2       |      |     |      |           +
        +----------+------+-----+------+-----------+
        |agg (None)|      |     |      |           +
        +----------+------+-----+------+-----------+

        The data is organized in folder2budgets as {pr : {b : path}} and in pRun2budget as {pr : {b : ConfiguratorRun}}.

        In the internal data-management there are three types of runhistories: *original*, *validated* and *epm*.

        * *original_rh* contain only runs that have been gathered during the optimization-process.
        * *validated_rh* may contain original runs, but also data that was not gathered iteratively during the
          optimization, but systematically through external validation of interesting configurations.
          Important: NO ESTIMATED RUNS IN `validated` RUNHISTORIES!
        * *epm_rh* contain runs that are gathered through empirical performance models.

        Runhistories are organized as follows:

        * each ConfiguratorRun has an *original_runhistory*- and a *combined_runhistory*-attribute
        * if available, each ConfiguratorRun's *validated_runhistory* contains
          a runhistory with validation-data gathered after the optimization
        * *combined_runhistory* always contains as many real runs as possible


        Parameters
        ----------
        folders: List[str]
            list of folders to read in
        ta_exec_dirs: List[str]
            optional, list of execution directories for target-algorithms (to find filepaths, etc.). If you're not sure,
            just set to current working directory (which it is by default). A single entry is expanded to all folders.
        output_dir: str
            directory for output (temporary directory if not set)
        file_format: str
            optional, from [AUTO, SMAC2, SMAC3, BOHB, CSV] defines what file-format the optimizer result is in.
            AUTO or None triggers automatic detection.
        validation_format: str
            from [SMAC2, SMAC3, BOHB, CSV] defines what file-format validation data is in.
        analyzing_options: dict / ConfigParser
            optional, global configuration of the analysis (merged with defaults via `load_default_options`)

        Raises
        ------
        ValueError
            if more ta_exec_dirs than folders are passed, or if none of the specified folders could be loaded.
        """
        ##########################################################################################
        #  Initialize and find suitable parameters                                               #
        ##########################################################################################
        self.logger = logging.getLogger(self.__module__ + '.' + self.__class__.__name__)

        # Copy the caller's list: a None-sentinel (for aggregation) is appended below and
        # must not leak into the caller's argument as a side effect.
        self.folders = list(folders)
        self.ta_exec_dirs = list(ta_exec_dirs) if ta_exec_dirs else ['.' for _ in self.folders]
        # Fix wrong input to ta_exec_dir
        if len(self.folders) < len(self.ta_exec_dirs):
            raise ValueError("Too many ta_exec_dirs ({}) compared to the number of folders ({})".format(
                                len(self.ta_exec_dirs), len(self.folders)))
        elif len(self.folders) > len(self.ta_exec_dirs):
            # Fewer ta_exec_dirs than folders: reuse the first one for all remaining folders
            self.logger.info("Assuming ta_exec_dir is valid for all folders, expanding list")
            self.ta_exec_dirs.extend([self.ta_exec_dirs[0]
                                      for _ in range(len(self.folders) - len(self.ta_exec_dirs))])

        self.output_dir = output_dir if output_dir else tempfile.mkdtemp()

        # Check for None BEFORE calling .upper() -- file_format defaults to None, which
        # previously raised AttributeError here instead of triggering auto-detection.
        if file_format is None or file_format.upper() == "AUTO":
            file_format = detect_fileformat(folders=self.folders)
        self.file_format = file_format
        self.validation_format = validation_format
        self.use_budgets = self.file_format == "BOHB"

        self.analyzing_options = load_default_options(analyzing_options, file_format)

        # Main focus on this mapping pRun2budget2data:
        self.pRun2budget = {None : {}}  # mapping parallel runs to their budgets
        self.runs_list = []  # Just put all ConfiguratorRun-objects here

        ##########################################################################################
        #  Convert if necessary, determine what folders and what budgets                         #
        ##########################################################################################
        # Both budgets and folders have "None" in the key-list for the aggregation over all available budgets/folders
        self.budgets = [None]
        if self.file_format == 'BOHB':
            self.logger.debug("Converting %d BOHB folders to SMAC-format", len(self.folders))
            hpbandster2smac = HpBandSter2SMAC()
            # Convert m BOHB-folders to m + n SMAC-folders
            # TODO make compatible with hpbandster
            self.folder2result, self.folder2budgets = hpbandster2smac.convert(self.folders, self.output_dir)
            self.budgets.extend(list(self.folder2result.values())[0].HB_config['budgets'])
        else:
            self.folder2budgets = {f : {None : f} for f in self.folders}

        ##########################################################################################
        #  Read in folders, where folders are parallel runs and for each parallel-run/budget     #
        #  combination there is one ConfiguratorRun-object (they can be easily aggregated)       #
        ##########################################################################################
        self.logger.debug("Reading in folders: %s with ta_exec_dirs: %s", str(self.folders), str(self.ta_exec_dirs))
        for f, ta_exec_dir in zip(self.folders, self.ta_exec_dirs):  # Iterating over parallel runs
            self.logger.debug("--Processing folder \"{}\" (and ta_exec_dir \"{}\")".format(f, ta_exec_dir))
            self.pRun2budget[f] = {}
            for b, path in self.folder2budgets[f].items():
                self.logger.debug("----Processing budget \"{}\" (and path: \"{}\")".format(b, path))
                # Using folder of (converted) data here
                try:
                    cr = ConfiguratorRun.from_folder(path,
                                                     ta_exec_dir,
                                                     self.analyzing_options,
                                                     self.file_format,
                                                     self.validation_format,
                                                     b,
                                                     self.output_dir)
                except Exception as err:
                    # Best-effort: log and skip unreadable folders instead of aborting the whole run
                    self.logger.warning("Folder %s with ta_exec_dir %s could not be loaded, failed with error message: %s",
                                        f, ta_exec_dir, err)
                    self.logger.exception(err)
                    continue
                self.pRun2budget[f][b] = cr
                self.runs_list.append(cr)

        # None-key aggregates over all parallel runs
        self.folders.append(None)
        self.logger.debug("folder2budgets: " + str(self.folder2budgets))
        self.logger.debug("pRun2budget: " + str(self.pRun2budget))

        # Guard BEFORE indexing runs_list: previously an empty result raised an
        # unhelpful IndexError instead of this descriptive ValueError.
        if not self.runs_list or not self.get_all_runs():
            raise ValueError("None of the specified folders could be loaded.")

        self.scenario = self.runs_list[0].scenario
Exemplo n.º 4
0
    def __init__(
        self,
        folders,
        ta_exec_dirs=None,
        output_dir=None,
        file_format=None,
        validation_format=None,
        analyzing_options=None,
    ):
        """
        Reads in optimizer runs. Converts data if necessary.

        SMAC3's RunHistory supports budgets from 0.12.0, so this container will by default keep one ConfiguratorRun per
        folder (assuming folders are parallel runs). Budgets are integrated in RunHistories per conversion.
        The RunHistory object provides an easy way to aggregate over parallel runs or budgets.

        The data is organized in self.data as {folder_name : ConfiguratorRun}.
        Aggregated or reduced ConfiguratorRuns are cached by their identifier (needs to be unique from context!)
          in self.cache as {identifier : ConfiguratorRun},-

        In the internal data-management there are three types of runhistories: *original*, *validated* and *epm*.
        They are saved in and provided by the ConfiguratorRuns

        * *original_rh* contain only runs that have been gathered during the optimization-process.
        * *validated_rh* may contain original runs, but also data that was not gathered iteratively during the
          optimization, but systematically through external validation of interesting configurations.
          Important: NO ESTIMATED RUNS IN `validated` RUNHISTORIES!
        * *epm_rh* contain runs that are gathered through empirical performance models.

        Runhistories are organized as follows:

        * each ConfiguratorRun has an *original_runhistory*- and a *combined_runhistory*-attribute
        * if available, each ConfiguratorRun's *validated_runhistory* contains
          a runhistory with validation-data gathered after the optimization
        * *combined_runhistory* always contains as many real runs as possible


        Parameters
        ----------
        folders: List[str]
            list of folders to read in
        ta_exec_dirs: List[str]
            optional, list of execution directories for target-algorithms (to find filepaths, etc.). If you're not sure,
            just set to current working directory (which it is by default).
        output_dir: str
            directory for output (temporary directory if not set)
        file_format: str
            optional, from [AUTO, SMAC2, SMAC3, BOHB, CSV] defines what file-format the optimizer result is in.
            AUTO or None will lead to attempted automatic detection
        validation_format: str
            from [SMAC2, SMAC3, BOHB, CSV] defines what file-format validation data is in.
        analyzing_options: dict / ConfigParser
            contains important global configurations on how to run CAVE, see
            `options <https://github.com/automl/CAVE/blob/master/cave/utils/options/default_analysis_options.ini>`_
        """
        ################################################################################################################
        #  Initialize and find suitable parameters                                                                     #
        ################################################################################################################
        self.logger = logging.getLogger(self.__module__ + '.' +
                                        self.__class__.__name__)

        self.folders = folders
        ta_exec_dirs = ta_exec_dirs if ta_exec_dirs else ['.']
        self.ta_exec_dirs = [ta_exec_dirs[0] for _ in range(len(folders))
                             ] if len(ta_exec_dirs) == 1 else ta_exec_dirs
        # Fix wrong input to ta_exec_dir
        if len(self.folders) < len(self.ta_exec_dirs):
            raise ValueError(
                "ta_exec_dirs (# {}) compared to the number of folders ({})".
                format(len(self.ta_exec_dirs), len(self.folders)))

        self.output_dir = output_dir if output_dir else tempfile.mkdtemp()

        if file_format.upper() == "AUTO" or file_format is None:
            file_format = detect_fileformat(folders=self.folders)
            self.logger.info("Format of input detected automatically: %s",
                             file_format)
        self.file_format = file_format
        self.validation_format = validation_format

        self.analyzing_options = load_default_options(analyzing_options,
                                                      file_format)

        # Main focus on this mapping pRun2budget2data:
        self.data = OrderedDict()  # mapping parallel runs to their budgets
        self.cache = OrderedDict()  # Reuse already generated ConfiguratorRuns

        ################################################################################################################
        #  Convert if necessary, determine what folders and what budgets                                               #
        ################################################################################################################
        input_data = {f: {} for f in self.folders}
        converters = {
            'BOHB': HpBandSter2SMAC,
            'CSV': CSV2SMAC,
            'APT': APT2SMAC,
        }
        if self.file_format in converters:
            self.logger.debug("Converting %d %s folders to SMAC-format",
                              len(folders), self.file_format)
            converter = converters[self.file_format]()
            input_data = converter.convert(
                self.folders,
                ta_exec_dirs=self.ta_exec_dirs,
                output_dir=self.output_dir,
            )
            # Also setting ta_exec_dirs to cwd, since we are now using the converted paths...
            self.ta_exec_dirs = ['.' for _ in range(len(self.folders))]

        ################################################################################################################
        #  Read in folders, where folders are parallel runs and for each parallel-run                                  #
        #  there is one ConfiguratorRun-object (they can be easily aggregated)                                         #
        ################################################################################################################
        self.logger.debug("Reading in folders: %s with ta_exec_dirs: %s",
                          str(self.folders), str(self.ta_exec_dirs))
        for f, ta_exec_dir in zip(
                self.folders,
                self.ta_exec_dirs):  # Iterating over parallel runs
            self.logger.debug(
                "--Processing folder \"{}\" (and ta_exec_dir \"{}\")".format(
                    f, ta_exec_dir))

            if all([
                    x in input_data[f] for x in [
                        'new_path', 'config_space', 'runhistory', 'scenario',
                        'trajectory'
                    ]
            ]):
                # Data has been converted and should therefore be available here
                self.logger.debug('Input data already read in for folder %s',
                                  f)
                self.logger.debug(
                    list(input_data[f]['runhistory'].data.items())[:10])
                cr = ConfiguratorRun(
                    scenario=input_data[f].pop('scenario'),
                    original_runhistory=input_data[f].pop('runhistory'),
                    validated_runhistory=input_data[f].pop(
                        'validated_runhistory', None),
                    trajectory=input_data[f].pop('trajectory'),
                    options=self.analyzing_options,
                    path_to_folder=input_data[f].pop('new_path'),
                    ta_exec_dir=ta_exec_dir,
                    file_format=file_format,
                    validation_format=validation_format,
                    output_dir=self.output_dir)
                # Any format-specific information
                for k, v in input_data[f].items():
                    cr.share_information[k] = v
            else:
                # Data is in good readable SMAC3-format
                cr = ConfiguratorRun.from_folder(
                    f,
                    ta_exec_dir,
                    self.analyzing_options,
                    file_format=self.file_format,
                    validation_format=self.validation_format,
                    output_dir=self.output_dir)
            self.data[f] = cr
        self.scenario = list(self.data.values())[0].scenario