Example #1
File: service.py Project: kubeflow/katib
    def GetSuggestions(self, request, context):
        if self.is_first_run:
            nas_config = request.experiment.spec.nas_config
            num_layers = str(nas_config.graph_config.num_layers)

            search_space = get_search_space(nas_config.operations)

            settings_raw = request.experiment.spec.algorithm.algorithm_settings
            algorithm_settings = get_algorithm_settings(settings_raw)

            search_space_json = json.dumps(search_space)
            algorithm_settings_json = json.dumps(algorithm_settings)

            search_space_str = str(search_space_json).replace('\"', '\'')
            algorithm_settings_str = str(algorithm_settings_json).replace(
                '\"', '\'')

            self.is_first_run = False

        parameter_assignments = []
        for i in range(request.current_request_number):

            self.logger.info(">>> Generate new Darts Trial Job")

            self.logger.info(">>> Number of layers {}\n".format(num_layers))

            self.logger.info(">>> Search Space")
            self.logger.info("{}\n".format(search_space_str))

            self.logger.info(">>> Algorithm Settings")
            self.logger.info("{}\n\n".format(algorithm_settings_str))

            parameter_assignments.append(
                api_pb2.GetSuggestionsReply.ParameterAssignments(assignments=[
                    api_pb2.ParameterAssignment(name="algorithm-settings",
                                                value=algorithm_settings_str),
                    api_pb2.ParameterAssignment(name="search-space",
                                                value=search_space_str),
                    api_pb2.ParameterAssignment(name="num-layers",
                                                value=num_layers)
                ]))

        return api_pb2.GetSuggestionsReply(
            parameter_assignments=parameter_assignments)
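
Note on the parameter encoding above: the DARTS suggestion serializes the search space and the algorithm settings to JSON and then swaps double quotes for single quotes before placing them into ParameterAssignment values, so a consumer has to reverse that substitution before parsing. The helper below is a minimal, hypothetical sketch of that step (it is not Katib's training-container code, the function name and sample keys are invented for illustration, and it assumes no embedded string value contains an apostrophe).

import json

# Hypothetical helper: undo the '"' -> "'" substitution applied by the
# suggestion above, then parse the result back into Python objects.
def decode_assignment_value(value):
    return json.loads(value.replace("'", '"'))

# Usage with a value shaped like the algorithm-settings string produced above
# (keys and values are illustrative only).
settings = decode_assignment_value("{'num_epochs': '50', 'init_channels': '4'}")
print(settings["num_epochs"])  # -> '50'
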
Example #2
File: trial.py Project: mesosphere/katib
def generate(list_of_assignments):
    res = []
    for assignments in list_of_assignments:
        buf = []
        for assignment in assignments:
            buf.append(
                api.ParameterAssignment(name=assignment.name, value=str(assignment.value)))
        rt = api.GetSuggestionsReply.ParameterAssignments(
            assignments=buf)
        res.append(rt)
    return res
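
For context, generate() only relies on each assignment exposing .name and .value attributes, and it wraps every inner list into one GetSuggestionsReply.ParameterAssignments message, stringifying the values along the way. The usage sketch below is hedged: the Assignment namedtuple is a stand-in invented here, and generate is called as a plain function even though in trial.py it sits inside a class.

from collections import namedtuple

# Stand-in for whatever assignment objects the caller holds; generate() only
# accesses .name and .value.
Assignment = namedtuple("Assignment", ["name", "value"])

suggestions = [
    [Assignment("lr", 0.01), Assignment("batch_size", 64)],
    [Assignment("lr", 0.1), Assignment("batch_size", 128)],
]

# Two ParameterAssignments messages come back, one per inner list, with all
# values converted to strings.
replies = generate(suggestions)
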
Example #3
    def test_get_suggestion(self):
        trials = [
            api_pb2.Trial(
                name="test-asfjh",
                spec=api_pb2.TrialSpec(objective=api_pb2.ObjectiveSpec(
                    type=api_pb2.MAXIMIZE,
                    objective_metric_name="metric-2",
                    goal=0.9),
                                       parameter_assignments=api_pb2.TrialSpec.
                                       ParameterAssignments(assignments=[
                                           api_pb2.ParameterAssignment(
                                               name="param-1",
                                               value="2",
                                           ),
                                           api_pb2.ParameterAssignment(
                                               name="param-2",
                                               value="cat1",
                                           ),
                                           api_pb2.ParameterAssignment(
                                               name="param-3",
                                               value="2",
                                           ),
                                           api_pb2.ParameterAssignment(
                                               name="param-4",
                                               value="3.44",
                                           )
                                       ])),
                status=api_pb2.TrialStatus(observation=api_pb2.Observation(
                    metrics=[
                        api_pb2.Metric(name="metric=1", value="435"),
                        api_pb2.Metric(name="metric=2", value="5643"),
                    ]))),
            api_pb2.Trial(
                name="test-234hs",
                spec=api_pb2.TrialSpec(objective=api_pb2.ObjectiveSpec(
                    type=api_pb2.MAXIMIZE,
                    objective_metric_name="metric-2",
                    goal=0.9),
                                       parameter_assignments=api_pb2.TrialSpec.
                                       ParameterAssignments(assignments=[
                                           api_pb2.ParameterAssignment(
                                               name="param-1",
                                               value="3",
                                           ),
                                           api_pb2.ParameterAssignment(
                                               name="param-2",
                                               value="cat2",
                                           ),
                                           api_pb2.ParameterAssignment(
                                               name="param-3",
                                               value="6",
                                           ),
                                           api_pb2.ParameterAssignment(
                                               name="param-4",
                                               value="4.44",
                                           )
                                       ])),
                status=api_pb2.TrialStatus(observation=api_pb2.Observation(
                    metrics=[
                        api_pb2.Metric(name="metric=1", value="123"),
                        api_pb2.Metric(name="metric=2", value="3028"),
                    ])))
        ]
        experiment = api_pb2.Experiment(
            name="test",
            spec=api_pb2.ExperimentSpec(
                algorithm=api_pb2.AlgorithmSpec(
                    algorithm_name="tpe",
                    algorithm_settings=[
                        api_pb2.AlgorithmSetting(name="random_state",
                                                 value="10"),
                        api_pb2.AlgorithmSetting(name="gamma", value="0.25"),
                        api_pb2.AlgorithmSetting(name="prior_weight",
                                                 value="1.0"),
                        api_pb2.AlgorithmSetting(name="n_EI_candidates",
                                                 value="24"),
                    ],
                ),
                objective=api_pb2.ObjectiveSpec(type=api_pb2.MAXIMIZE,
                                                goal=0.9),
                parameter_specs=api_pb2.ExperimentSpec.
                ParameterSpecs(parameters=[
                    api_pb2.ParameterSpec(
                        name="param-1",
                        parameter_type=api_pb2.INT,
                        feasible_space=api_pb2.FeasibleSpace(
                            max="5", min="1", list=[]),
                    ),
                    api_pb2.ParameterSpec(name="param-2",
                                          parameter_type=api_pb2.CATEGORICAL,
                                          feasible_space=api_pb2.FeasibleSpace(
                                              max=None,
                                              min=None,
                                              list=["cat1", "cat2", "cat3"])),
                    api_pb2.ParameterSpec(
                        name="param-3",
                        parameter_type=api_pb2.DISCRETE,
                        feasible_space=api_pb2.FeasibleSpace(
                            max=None, min=None, list=["3", "2", "6"])),
                    api_pb2.ParameterSpec(name="param-4",
                                          parameter_type=api_pb2.DOUBLE,
                                          feasible_space=api_pb2.FeasibleSpace(
                                              max="5", min="1", list=[]))
                ])))

        request = api_pb2.GetSuggestionsRequest(
            experiment=experiment,
            trials=trials,
            request_number=2,
        )

        get_suggestion = self.test_server.invoke_unary_unary(
            method_descriptor=(
                api_pb2.DESCRIPTOR.services_by_name['Suggestion'].
                methods_by_name['GetSuggestions']),
            invocation_metadata={},
            request=request,
            timeout=1)

        response, metadata, code, details = get_suggestion.termination()
        print(response.parameter_assignments)
        self.assertEqual(code, grpc.StatusCode.OK)
        self.assertEqual(2, len(response.parameter_assignments))
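
This test drives the servicer through an in-process grpcio-testing server rather than a real gRPC channel, which is where self.test_server and invoke_unary_unary come from; termination() then yields the response, trailing metadata, status code, and details that the assertions unpack. The setUp below is a rough sketch of how such a fixture is typically built with grpc_testing; the servicer class name and import path are assumptions for illustration, not taken from this page.

import unittest

import grpc_testing

# from pkg.suggestion.v1beta1.hyperopt.service import HyperoptService  # assumed path

class TestSuggestionService(unittest.TestCase):
    def setUp(self):
        # Map the Suggestion service descriptor to the servicer under test and
        # wrap it in an in-process test server.
        servicers = {
            api_pb2.DESCRIPTOR.services_by_name['Suggestion']: HyperoptService()
        }
        self.test_server = grpc_testing.server_from_dictionary(
            servicers, grpc_testing.strict_real_time())
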
Example #4
File: service.py Project: kubeflow/katib
    def GetSuggestions(self, request, context):
        if self.is_first_run:
            self.experiment = EnasExperiment(request, self.logger)
        experiment = self.experiment
        if request.current_request_number > 0:
            experiment.num_trials = request.current_request_number
        self.logger.info(
            "-" * 100 + "\nSuggestion Step {} for Experiment {}\n".format(
                experiment.suggestion_step, experiment.experiment_name) +
            "-" * 100)

        self.logger.info("")
        self.logger.info(">>> Current Request Number:\t\t{}".format(
            experiment.num_trials))
        self.logger.info("")

        with experiment.tf_graph.as_default():
            saver = tf.compat.v1.train.Saver()
            ctrl = experiment.controller

            controller_ops = {
                "loss": ctrl.loss,
                "entropy": ctrl.sample_entropy,
                "grad_norm": ctrl.grad_norm,
                "baseline": ctrl.baseline,
                "skip_rate": ctrl.skip_rate,
                "train_op": ctrl.train_op,
                "train_step": ctrl.train_step,
                "sample_arc": ctrl.sample_arc,
                "child_val_accuracy": ctrl.child_val_accuracy,
            }

            if self.is_first_run:
                self.logger.info(
                    ">>> First time running suggestion for {}. Random architecture will be given."
                    .format(experiment.experiment_name))
                with tf.compat.v1.Session() as sess:
                    sess.run(tf.compat.v1.global_variables_initializer())
                    candidates = list()
                    for _ in range(experiment.num_trials):
                        candidates.append(
                            sess.run(controller_ops["sample_arc"]))

                    # TODO: will use PVC to store the checkpoint to protect against unexpected suggestion pod restart
                    saver.save(sess, experiment.ctrl_cache_file)

                self.is_first_run = False

            else:
                with tf.compat.v1.Session() as sess:
                    saver.restore(sess, experiment.ctrl_cache_file)

                    result = self.GetEvaluationResult(request.trials)

                    # TODO: (andreyvelich) I deleted this part, should it be handled by the controller?
                    # Sometimes training container may fail and GetEvaluationResult() will return None
                    # In this case, the Suggestion will:
                    # 1. Firstly try to respawn the previous trials after waiting for RESPAWN_SLEEP seconds
                    # 2. If respawning the trials for RESPAWN_LIMIT times still cannot collect valid results,
                    #    then fail the task because it may indicate that the training container has errors.
                    if result is None:
                        self.logger.warning(
                            ">>> Suggestion has spawned trials, but they all failed."
                        )
                        self.logger.warning(
                            ">>> Please check whether the training container is correctly implemented"
                        )
                        self.logger.info(">>> Experiment {} failed".format(
                            experiment.experiment_name))
                        return []

                    # This LSTM network is designed to maximize the metrics
                    # However, if the user wants to minimize the metrics, we can take the negative of the result

                    if experiment.opt_direction == api_pb2.MINIMIZE:
                        result = -result

                    self.logger.info(
                        ">>> Suggestion updated. LSTM Controller Training\n")
                    log_every = experiment.algorithm_settings[
                        "controller_log_every_steps"]
                    for ctrl_step in range(
                            1, experiment.
                            algorithm_settings["controller_train_steps"] + 1):
                        run_ops = [
                            controller_ops["loss"], controller_ops["entropy"],
                            controller_ops["grad_norm"],
                            controller_ops["baseline"],
                            controller_ops["skip_rate"],
                            controller_ops["train_op"]
                        ]

                        loss, entropy, grad_norm, baseline, skip_rate, _ = sess.run(
                            fetches=run_ops,
                            feed_dict={
                                controller_ops["child_val_accuracy"]: result
                            })

                        controller_step = sess.run(
                            controller_ops["train_step"])
                        if ctrl_step % log_every == 0:
                            log_string = ""
                            log_string += "Controller Step: {} - ".format(
                                controller_step)
                            log_string += "Loss: {:.4f} - ".format(loss)
                            log_string += "Entropy: {:.9} - ".format(entropy)
                            log_string += "Gradient Norm: {:.7f} - ".format(
                                grad_norm)
                            log_string += "Baseline={:.4f} - ".format(baseline)
                            log_string += "Skip Rate={:.4f}".format(skip_rate)
                            self.logger.info(log_string)

                    candidates = list()
                    for _ in range(experiment.num_trials):
                        candidates.append(
                            sess.run(controller_ops["sample_arc"]))

                    saver.save(sess, experiment.ctrl_cache_file)

        organized_candidates = list()
        parameter_assignments = list()

        for i in range(experiment.num_trials):
            arc = candidates[i].tolist()
            organized_arc = [0 for _ in range(experiment.num_layers)]
            record = 0
            for layer in range(experiment.num_layers):
                organized_arc[layer] = arc[record:record + layer + 1]
                record += layer + 1
            organized_candidates.append(organized_arc)

            nn_config = dict()
            nn_config['num_layers'] = experiment.num_layers
            nn_config['input_sizes'] = experiment.input_sizes
            nn_config['output_sizes'] = experiment.output_sizes
            nn_config['embedding'] = dict()
            for layer in range(experiment.num_layers):
                opt = organized_arc[layer][0]
                nn_config['embedding'][opt] = experiment.search_space[
                    opt].get_dict()

            organized_arc_json = json.dumps(organized_arc)
            nn_config_json = json.dumps(nn_config)

            organized_arc_str = str(organized_arc_json).replace('\"', '\'')
            nn_config_str = str(nn_config_json).replace('\"', '\'')

            self.logger.info(
                "\n>>> New Neural Network Architecture Candidate #{} (internal representation):"
                .format(i))
            self.logger.info(organized_arc_json)
            self.logger.info("\n>>> Corresponding Seach Space Description:")
            self.logger.info(nn_config_str)

            parameter_assignments.append(
                api_pb2.GetSuggestionsReply.ParameterAssignments(assignments=[
                    api_pb2.ParameterAssignment(name="architecture",
                                                value=organized_arc_str),
                    api_pb2.ParameterAssignment(name="nn_config",
                                                value=nn_config_str)
                ]))

        self.logger.info("")
        self.logger.info(">>> {} Trials were created for Experiment {}".format(
            experiment.num_trials, experiment.experiment_name))
        self.logger.info("")

        experiment.suggestion_step += 1

        return api_pb2.GetSuggestionsReply(
            parameter_assignments=parameter_assignments)
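
The unpacking loop in the middle of this method regroups the controller's flat sample_arc: layer L owns the next L + 1 consecutive entries, and the first entry of each group is the index of the chosen operation (that is what organized_arc[layer][0] looks up in the search space). The standalone snippet below replays that slicing with made-up values; the result happens to match the "architecture" string seen in Example #5 below.

# Flat architecture vector with 1 + 2 + 3 + 4 = 10 entries for 4 layers
# (values are made up for demonstration).
num_layers = 4
flat_arc = [3, 0, 1, 0, 0, 1, 2, 1, 0, 0]

organized_arc = []
record = 0
for layer in range(num_layers):
    # Layer `layer` owns the next layer + 1 entries of the flat vector.
    organized_arc.append(flat_arc[record:record + layer + 1])
    record += layer + 1

print(organized_arc)  # [[3], [0, 1], [0, 0, 1], [2, 1, 0, 0]]
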
Example #5
    def test_get_suggestion(self):
        trials = [
            api_pb2.Trial(
                name="first-trial",
                spec=api_pb2.TrialSpec(
                    objective=api_pb2.ObjectiveSpec(
                        type=api_pb2.MAXIMIZE,
                        objective_metric_name="Validation-Accuracy",
                        goal=0.99),
                    parameter_assignments=api_pb2.TrialSpec.
                    ParameterAssignments(assignments=[
                        api_pb2.ParameterAssignment(
                            name="architecture",
                            value="[[3], [0, 1], [0, 0, 1], [2, 1, 0, 0]]",
                        ),
                        api_pb2.ParameterAssignment(
                            name="nn_config",
                            value="{'num_layers': 4}",
                        ),
                    ])),
                status=api_pb2.TrialStatus(
                    observation=api_pb2.Observation(metrics=[
                        api_pb2.Metric(name="Validation-Accuracy",
                                       value="0.88"),
                    ]),
                    condition=api_pb2.TrialStatus.TrialConditionType.SUCCEEDED,
                )),
            api_pb2.Trial(
                name="second-trial",
                spec=api_pb2.TrialSpec(
                    objective=api_pb2.ObjectiveSpec(
                        type=api_pb2.MAXIMIZE,
                        objective_metric_name="Validation-Accuracy",
                        goal=0.99),
                    parameter_assignments=api_pb2.TrialSpec.
                    ParameterAssignments(assignments=[
                        api_pb2.ParameterAssignment(
                            name="architecture",
                            value="[[1], [0, 1], [2, 1, 1], [2, 1, 1, 0]]",
                        ),
                        api_pb2.ParameterAssignment(
                            name="nn_config",
                            value="{'num_layers': 4}",
                        ),
                    ], )),
                status=api_pb2.TrialStatus(
                    observation=api_pb2.Observation(metrics=[
                        api_pb2.Metric(name="Validation-Accuracy",
                                       value="0.84"),
                    ]),
                    condition=api_pb2.TrialStatus.TrialConditionType.SUCCEEDED,
                ))
        ]
        experiment = api_pb2.Experiment(
            name="enas-experiment",
            spec=api_pb2.ExperimentSpec(
                algorithm=api_pb2.AlgorithmSpec(algorithm_name="enas", ),
                objective=api_pb2.ObjectiveSpec(
                    type=api_pb2.MAXIMIZE,
                    goal=0.9,
                    objective_metric_name="Validation-Accuracy"),
                parallel_trial_count=2,
                max_trial_count=10,
                nas_config=api_pb2.NasConfig(
                    graph_config=api_pb2.GraphConfig(num_layers=4,
                                                     input_sizes=[32, 32, 8],
                                                     output_sizes=[10]),
                    operations=api_pb2.NasConfig.Operations(operation=[
                        api_pb2.Operation(
                            operation_type="convolution",
                            parameter_specs=api_pb2.Operation.
                            ParameterSpecs(parameters=[
                                api_pb2.ParameterSpec(
                                    name="filter_size",
                                    parameter_type=api_pb2.CATEGORICAL,
                                    feasible_space=api_pb2.FeasibleSpace(
                                        max=None, min=None, list=["5"])),
                                api_pb2.ParameterSpec(
                                    name="num_filter",
                                    parameter_type=api_pb2.CATEGORICAL,
                                    feasible_space=api_pb2.FeasibleSpace(
                                        max=None, min=None, list=["128"])),
                                api_pb2.ParameterSpec(
                                    name="stride",
                                    parameter_type=api_pb2.CATEGORICAL,
                                    feasible_space=api_pb2.FeasibleSpace(
                                        max=None, min=None, list=["1", "2"])),
                            ])),
                        api_pb2.Operation(
                            operation_type="reduction",
                            parameter_specs=api_pb2.Operation.
                            ParameterSpecs(parameters=[
                                api_pb2.ParameterSpec(
                                    name="reduction_type",
                                    parameter_type=api_pb2.CATEGORICAL,
                                    feasible_space=api_pb2.FeasibleSpace(
                                        max=None,
                                        min=None,
                                        list=["max_pooling"])),
                                api_pb2.ParameterSpec(
                                    name="pool_size",
                                    parameter_type=api_pb2.INT,
                                    feasible_space=api_pb2.FeasibleSpace(
                                        min="2", max="3", step="1", list=[])),
                            ])),
                    ], ))))

        request = api_pb2.GetSuggestionsRequest(
            experiment=experiment,
            trials=trials,
            request_number=2,
        )

        get_suggestion = self.test_server.invoke_unary_unary(
            method_descriptor=(
                api_pb2.DESCRIPTOR.services_by_name['Suggestion'].
                methods_by_name['GetSuggestions']),
            invocation_metadata={},
            request=request,
            timeout=100)

        response, metadata, code, details = get_suggestion.termination()
        print(response.parameter_assignments)
        self.assertEqual(code, grpc.StatusCode.OK)
        self.assertEqual(2, len(response.parameter_assignments))
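
A short follow-up to the assertions, continuing from the response obtained above: each entry of response.parameter_assignments is one proposed trial, and the ENAS values are single-quoted list/dict strings, so ast.literal_eval (rather than json.loads) can decode them. This is only an illustration of how a caller might inspect the reply, not code from the test.

import ast

for trial_idx, proposal in enumerate(response.parameter_assignments):
    for assignment in proposal.assignments:
        # "architecture" and "nn_config" carry Python-literal strings such as
        # "{'num_layers': 4}"; other values are plain strings.
        if assignment.name in ("architecture", "nn_config"):
            decoded = ast.literal_eval(assignment.value)
        else:
            decoded = assignment.value
        print(trial_idx, assignment.name, decoded)
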