Пример #1
0
    def test_parallelism_arg(self):
        """Check how ``SparkTrials._decide_parallelism`` resolves a request.

        Two cases are covered: a request of ``None`` or ``-1`` must come back
        as the Spark default parallelism, and a request one above
        ``SparkTrials.MAX_CONCURRENT_JOBS_ALLOWED`` must come back as that cap.
        Each case also checks the text captured by
        ``patch_logger("hyperopt-spark")`` for the expected message.
        """
        spark_default = 2

        # None / negative request: expect the Spark default and a log entry saying so.
        for requested in [None, -1]:
            with patch_logger("hyperopt-spark") as captured:
                chosen = SparkTrials._decide_parallelism(
                    requested_parallelism=requested,
                    spark_default_parallelism=spark_default,
                )
                wrong_value_msg = (
                    "Failed to set parallelism to be default parallelism ({p})"
                    " ({e})"
                ).format(p=chosen, e=spark_default)
                self.assertEqual(chosen, spark_default, wrong_value_msg)

                logged = captured.getvalue().strip()
                expected_fragment = (
                    "Because the requested parallelism was None or a non-positive value, "
                    "parallelism will be set to ({d})"
                ).format(d=spark_default)
                missing_log_msg = (
                    "set to default parallelism missing from log: {log_output}"
                ).format(log_output=logged)
                self.assertIn(expected_fragment, logged, missing_log_msg)

        # Request one above the hard cap: expect the cap and a log entry naming it.
        hard_cap = SparkTrials.MAX_CONCURRENT_JOBS_ALLOWED
        with patch_logger("hyperopt-spark") as captured:
            chosen = SparkTrials._decide_parallelism(
                requested_parallelism=hard_cap + 1,
                spark_default_parallelism=spark_default,
            )
            not_capped_msg = (
                "Failed to limit parallelism ({p}) to MAX_CONCURRENT_JOBS_ALLOWED ({e})"
            ).format(p=chosen, e=hard_cap)
            self.assertEqual(chosen, hard_cap, not_capped_msg)

            logged = captured.getvalue().strip()
            expected_fragment = "SparkTrials.MAX_CONCURRENT_JOBS_ALLOWED ({c})".format(
                c=hard_cap
            )
            missing_log_msg = (
                "MAX_CONCURRENT_JOBS_ALLOWED value missing from log: {log_output}"
            ).format(log_output=logged)
            self.assertIn(expected_fragment, logged, missing_log_msg)
Пример #2
0
    def test_parallelism_arg(self):
        """Check ``SparkTrials._decide_parallelism`` across several settings.

        First confirms that the Spark context reports
        ``BaseSparkContext.NUM_SPARK_EXECUTORS`` concurrent tasks. Then, for
        each (Spark default parallelism, max concurrent tasks) pair, checks:

        * a request of ``None`` or ``-1`` returns the larger of the two values;
        * a request one above the max concurrent tasks is returned unchanged
          and the log contains a "Parallelism (N) is greater" message;
        * a request one above ``SparkTrials.MAX_CONCURRENT_JOBS_ALLOWED`` is
          reduced to that cap and the log names the cap.
        """
        # Sanity-check the value the Spark context itself reports.
        reported_tasks = self.sc._jsc.sc().maxNumConcurrentTasks()
        executors_msg = (
            "max_num_concurrent_tasks ({c}) did not equal "
            "BaseSparkContext.NUM_SPARK_EXECUTORS ({e})"
        ).format(c=reported_tasks, e=BaseSparkContext.NUM_SPARK_EXECUTORS)
        self.assertEqual(
            reported_tasks, BaseSparkContext.NUM_SPARK_EXECUTORS, executors_msg
        )

        for spark_default, task_limit in [(2, 4), (2, 0)]:
            fallback = max(spark_default, task_limit)

            # None / negative request: expect the fallback value and a log entry saying so.
            for requested in [None, -1]:
                with patch_logger("hyperopt-spark") as captured:
                    chosen = SparkTrials._decide_parallelism(
                        requested_parallelism=requested,
                        spark_default_parallelism=spark_default,
                        max_num_concurrent_tasks=task_limit,
                    )
                    wrong_value_msg = (
                        "Failed to set parallelism to be default parallelism ({p})"
                        " ({e})"
                    ).format(p=chosen, e=fallback)
                    self.assertEqual(chosen, fallback, wrong_value_msg)

                    logged = captured.getvalue().strip()
                    expected_fragment = (
                        "Because the requested parallelism was None or a non-positive value, "
                        "parallelism will be set to ({d})"
                    ).format(d=fallback)
                    missing_log_msg = (
                        "set to default parallelism missing from log: {log_output}"
                    ).format(log_output=logged)
                    self.assertIn(expected_fragment, logged, missing_log_msg)

            # Request one above the task limit: expect it back unchanged, with a
            # "is greater" entry in the log.
            over_limit = task_limit + 1
            with patch_logger("hyperopt-spark") as captured:
                chosen = SparkTrials._decide_parallelism(
                    requested_parallelism=over_limit,
                    spark_default_parallelism=spark_default,
                    max_num_concurrent_tasks=task_limit,
                )
                unexpected_msg = "Expect parallelism to be ({e}) but get ({p})".format(
                    p=chosen, e=over_limit
                )
                self.assertEqual(chosen, over_limit, unexpected_msg)

                logged = captured.getvalue().strip()
                expected_fragment = "Parallelism ({p}) is greater".format(p=over_limit)
                missing_log_msg = (
                    "Parallelism ({p}) missing from log: {log_output}"
                ).format(p=over_limit, log_output=logged)
                self.assertIn(expected_fragment, logged, missing_log_msg)

            # Request one above the hard cap: expect the cap and a log entry naming it.
            hard_cap = SparkTrials.MAX_CONCURRENT_JOBS_ALLOWED
            with patch_logger("hyperopt-spark") as captured:
                chosen = SparkTrials._decide_parallelism(
                    requested_parallelism=hard_cap + 1,
                    spark_default_parallelism=spark_default,
                    max_num_concurrent_tasks=task_limit,
                )
                not_capped_msg = (
                    "Failed to limit parallelism ({p}) to MAX_CONCURRENT_JOBS_ALLOWED ({e})"
                ).format(p=chosen, e=hard_cap)
                self.assertEqual(chosen, hard_cap, not_capped_msg)

                logged = captured.getvalue().strip()
                expected_fragment = (
                    "SparkTrials.MAX_CONCURRENT_JOBS_ALLOWED ({c})".format(c=hard_cap)
                )
                missing_log_msg = (
                    "MAX_CONCURRENT_JOBS_ALLOWED value missing from log: {log_output}"
                ).format(log_output=logged)
                self.assertIn(expected_fragment, logged, missing_log_msg)
Пример #3
0
    def test_parallelism_arg(self):
        """Check ``SparkTrials._decide_parallelism(max_num_concurrent_tasks, parallelism)``.

        First confirms that the Spark context reports
        ``BaseSparkContext.NUM_SPARK_EXECUTORS`` concurrent tasks. Then, with a
        fixed task limit of 4, checks that:

        * ``parallelism=-1`` returns the task limit and logs the invalid value;
        * ``parallelism=limit + 1`` returns the task limit and logs the excess;
        * ``parallelism=None`` returns the task limit;
        * a task limit above ``SparkTrials.MAX_CONCURRENT_JOBS_ALLOWED`` with
          ``parallelism=None`` returns the cap and the log names the cap.
        """
        # Sanity-check the value the Spark context itself reports.
        reported_tasks = self.sc._jsc.sc().maxNumConcurrentTasks()
        executors_msg = (
            "max_num_concurrent_tasks ({c}) did not equal "
            "BaseSparkContext.NUM_SPARK_EXECUTORS ({e})"
        ).format(c=reported_tasks, e=BaseSparkContext.NUM_SPARK_EXECUTORS)
        self.assertEqual(
            reported_tasks, BaseSparkContext.NUM_SPARK_EXECUTORS, executors_msg
        )

        # From here on the checks use a fixed limit rather than the reported one.
        task_limit = 4

        # Negative parallelism: expect the task limit plus log entries for both values.
        with patch_logger("hyperopt-spark") as captured:
            chosen = SparkTrials._decide_parallelism(task_limit, -1)
            not_defaulted_msg = (
                "Failed to default parallelism ({p}) to max_num_concurrent_tasks"
                " ({e})"
            ).format(p=chosen, e=task_limit)
            self.assertEqual(chosen, task_limit, not_defaulted_msg)

            logged = captured.getvalue().strip()
            invalid_missing_msg = (
                "Invalid parallelism value -1 missing from log: {log_output}"
            ).format(log_output=logged)
            self.assertIn("invalid value (-1)", logged, invalid_missing_msg)

            limit_fragment = "max_num_concurrent_tasks ({c})".format(c=task_limit)
            limit_missing_msg = (
                "max_num_concurrent_tasks value missing from log: {log_output}"
            ).format(log_output=logged)
            self.assertIn(limit_fragment, logged, limit_missing_msg)

        # Parallelism one above the limit: expect the task limit plus log entries
        # for the requested value and the limit.
        over_limit = task_limit + 1
        with patch_logger("hyperopt-spark") as captured:
            chosen = SparkTrials._decide_parallelism(task_limit, over_limit)
            not_limited_msg = (
                "Failed to limit parallelism ({p}) to max_num_concurrent_tasks"
                " ({e})"
            ).format(p=chosen, e=task_limit)
            self.assertEqual(chosen, task_limit, not_limited_msg)

            logged = captured.getvalue().strip()
            excess_fragment = "parallelism ({p}) is greater".format(p=over_limit)
            excess_missing_msg = (
                "User-specified parallelism ({p}) missing from log: {log_output}"
            ).format(p=over_limit, log_output=logged)
            self.assertIn(excess_fragment, logged, excess_missing_msg)

            limit_fragment = "max_num_concurrent_tasks ({c})".format(c=task_limit)
            limit_missing_msg = (
                "max_num_concurrent_tasks value missing from log: {log_output}"
            ).format(log_output=logged)
            self.assertIn(limit_fragment, logged, limit_missing_msg)

        # parallelism=None: expect the task limit to be used.
        chosen = SparkTrials._decide_parallelism(task_limit, None)
        default_mismatch_msg = (
            "The default parallelism ({p}) did not equal max_num_concurrent_tasks"
            " ({e})"
        ).format(p=chosen, e=task_limit)
        self.assertEqual(chosen, task_limit, default_mismatch_msg)

        # Task limit above the hard cap with parallelism=None: expect the cap
        # and a log entry naming it.
        hard_cap = SparkTrials.MAX_CONCURRENT_JOBS_ALLOWED
        with patch_logger("hyperopt-spark") as captured:
            chosen = SparkTrials._decide_parallelism(
                max_num_concurrent_tasks=hard_cap + 1,
                parallelism=None,
            )
            not_capped_msg = (
                "Failed to limit parallelism ({p}) to MAX_CONCURRENT_JOBS_ALLOWED ({e})"
            ).format(p=chosen, e=hard_cap)
            self.assertEqual(chosen, hard_cap, not_capped_msg)

            logged = captured.getvalue().strip()
            cap_fragment = "SparkTrials.MAX_CONCURRENT_JOBS_ALLOWED ({c})".format(
                c=hard_cap
            )
            cap_missing_msg = (
                "MAX_CONCURRENT_JOBS_ALLOWED value missing from log: {log_output}"
            ).format(log_output=logged)
            self.assertIn(cap_fragment, logged, cap_missing_msg)