Example #1
def _check_dependencies(python_exec, modules_to_test):
    # If we should test 'pyspark-sql', check whether PyArrow and Pandas are installed
    # and explicitly log the result. See SPARK-23300.
    if pyspark_sql in modules_to_test:
        # TODO(HyukjinKwon): Relocate and deduplicate these version specifications.
        minimum_pyarrow_version = '0.8.0'
        minimum_pandas_version = '0.19.2'

        try:
            pyarrow_version = subprocess_check_output(
                [python_exec, "-c", "import pyarrow; print(pyarrow.__version__)"],
                universal_newlines=True,
                stderr=open(os.devnull, 'w')).strip()
            if LooseVersion(pyarrow_version) >= LooseVersion(minimum_pyarrow_version):
                LOGGER.info("Will test PyArrow related features against Python executable "
                            "'%s' in '%s' module." % (python_exec, pyspark_sql.name))
            else:
                LOGGER.warning(
                    "Will skip PyArrow related features against Python executable "
                    "'%s' in '%s' module. PyArrow >= %s is required; however, PyArrow "
                    "%s was found." % (
                        python_exec, pyspark_sql.name, minimum_pyarrow_version, pyarrow_version))
        except:
            LOGGER.warning(
                "Will skip PyArrow related features against Python executable "
                "'%s' in '%s' module. PyArrow >= %s is required; however, PyArrow "
                "was not found." % (python_exec, pyspark_sql.name, minimum_pyarrow_version))

        try:
            pandas_version = subprocess_check_output(
                [python_exec, "-c", "import pandas; print(pandas.__version__)"],
                universal_newlines=True,
                stderr=open(os.devnull, 'w')).strip()
            if LooseVersion(pandas_version) >= LooseVersion(minimum_pandas_version):
                LOGGER.info("Will test Pandas related features against Python executable "
                            "'%s' in '%s' module." % (python_exec, pyspark_sql.name))
            else:
                LOGGER.warning(
                    "Will skip Pandas related features against Python executable "
                    "'%s' in '%s' module. Pandas >= %s is required; however, Pandas "
                    "%s was found." % (
                        python_exec, pyspark_sql.name, minimum_pandas_version, pandas_version))
        except:
            LOGGER.warning(
                "Will skip Pandas related features against Python executable "
                "'%s' in '%s' module. Pandas >= %s is required; however, Pandas "
                "was not found." % (python_exec, pyspark_sql.name, minimum_pandas_version))
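All of these examples call a subprocess_check_output helper rather than the standard library directly. In Spark's sparktestsupport.shellutils it is essentially subprocess.check_output under another name (the alias exists because of an old Python 2.6 backport); a minimal sketch of the assumed helper:

import subprocess

def subprocess_check_output(*args, **kwargs):
    # Minimal stand-in for sparktestsupport.shellutils.subprocess_check_output:
    # run the command, raise CalledProcessError on a non-zero exit, and
    # return whatever the command wrote to stdout.
    return subprocess.check_output(*args, **kwargs)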
Example #2
File: run-tests.py Project: LY3918/spark
def _check_dependencies(python_exec, modules_to_test):
    if "COVERAGE_PROCESS_START" in os.environ:
        # Make sure coverage is installed.
        try:
            subprocess_check_output(
                [python_exec, "-c", "import coverage"],
                stderr=open(os.devnull, 'w'))
        except:
            print_red("Coverage is not installed in Python executable '%s' "
                      "but 'COVERAGE_PROCESS_START' environment variable is set, "
                      "exiting." % python_exec)
            sys.exit(-1)

    # If we should test 'pyspark-sql', check whether PyArrow and Pandas are installed
    # and explicitly log the result. See SPARK-23300.
    if pyspark_sql in modules_to_test:
        # TODO(HyukjinKwon): Relocate and deduplicate these version specifications.
        minimum_pyarrow_version = '0.8.0'
        minimum_pandas_version = '0.19.2'

        try:
            pyarrow_version = subprocess_check_output(
                [python_exec, "-c", "import pyarrow; print(pyarrow.__version__)"],
                universal_newlines=True,
                stderr=open(os.devnull, 'w')).strip()
            if LooseVersion(pyarrow_version) >= LooseVersion(minimum_pyarrow_version):
                LOGGER.info("Will test PyArrow related features against Python executable "
                            "'%s' in '%s' module." % (python_exec, pyspark_sql.name))
            else:
                LOGGER.warning(
                    "Will skip PyArrow related features against Python executable "
                    "'%s' in '%s' module. PyArrow >= %s is required; however, PyArrow "
                    "%s was found." % (
                        python_exec, pyspark_sql.name, minimum_pyarrow_version, pyarrow_version))
        except:
            LOGGER.warning(
                "Will skip PyArrow related features against Python executable "
                "'%s' in '%s' module. PyArrow >= %s is required; however, PyArrow "
                "was not found." % (python_exec, pyspark_sql.name, minimum_pyarrow_version))

        try:
            pandas_version = subprocess_check_output(
                [python_exec, "-c", "import pandas; print(pandas.__version__)"],
                universal_newlines=True,
                stderr=open(os.devnull, 'w')).strip()
            if LooseVersion(pandas_version) >= LooseVersion(minimum_pandas_version):
                LOGGER.info("Will test Pandas related features against Python executable "
                            "'%s' in '%s' module." % (python_exec, pyspark_sql.name))
            else:
                LOGGER.warning(
                    "Will skip Pandas related features against Python executable "
                    "'%s' in '%s' module. Pandas >= %s is required; however, Pandas "
                    "%s was found." % (
                        python_exec, pyspark_sql.name, minimum_pandas_version, pandas_version))
        except:
            LOGGER.warning(
                "Will skip Pandas related features against Python executable "
                "'%s' in '%s' module. Pandas >= %s is required; however, Pandas "
                "was not found." % (python_exec, pyspark_sql.name, minimum_pandas_version))
Example #3
File: run-tests.py Project: 0xqq/spark
def main():
    opts = parse_opts()
    if opts.verbose:
        log_level = logging.DEBUG
    else:
        log_level = logging.INFO
    logging.basicConfig(stream=sys.stdout, level=log_level, format="%(message)s")
    LOGGER.info("Running PySpark tests. Output is in %s", LOG_FILE)
    if os.path.exists(LOG_FILE):
        os.remove(LOG_FILE)
    python_execs = opts.python_executables.split(',')
    modules_to_test = []
    for module_name in opts.modules.split(','):
        if module_name in python_modules:
            modules_to_test.append(python_modules[module_name])
        else:
            print("Error: unrecognized module '%s'. Supported modules: %s" %
                  (module_name, ", ".join(python_modules)))
            sys.exit(-1)
    LOGGER.info("Will test against the following Python executables: %s", python_execs)
    LOGGER.info("Will test the following Python modules: %s", [x.name for x in modules_to_test])

    task_queue = Queue.Queue()
    for python_exec in python_execs:
        python_implementation = subprocess_check_output(
            [python_exec, "-c", "import platform; print(platform.python_implementation())"],
            universal_newlines=True).strip()
        LOGGER.debug("%s python_implementation is %s", python_exec, python_implementation)
        LOGGER.debug("%s version is: %s", python_exec, subprocess_check_output(
            [python_exec, "--version"], stderr=subprocess.STDOUT, universal_newlines=True).strip())
        for module in modules_to_test:
            if python_implementation not in module.blacklisted_python_implementations:
                for test_goal in module.python_test_goals:
                    task_queue.put((python_exec, test_goal))

    def process_queue(task_queue):
        while True:
            try:
                (python_exec, test_goal) = task_queue.get_nowait()
            except Queue.Empty:
                break
            try:
                run_individual_python_test(test_goal, python_exec)
            finally:
                task_queue.task_done()

    start_time = time.time()
    for _ in range(opts.parallelism):
        worker = Thread(target=process_queue, args=(task_queue,))
        worker.daemon = True
        worker.start()
    try:
        task_queue.join()
    except (KeyboardInterrupt, SystemExit):
        print_red("Exiting due to interrupt")
        sys.exit(-1)
    total_duration = time.time() - start_time
    LOGGER.info("Tests passed in %i seconds", total_duration)
Example #4
def main():
    opts = parse_opts()
    if opts.verbose:
        log_level = logging.DEBUG
    else:
        log_level = logging.INFO
    logging.basicConfig(stream=sys.stdout, level=log_level, format="%(message)s")
    LOGGER.info("Running PySpark tests. Output is in %s", LOG_FILE)
    if os.path.exists(LOG_FILE):
        os.remove(LOG_FILE)
    python_execs = opts.python_executables.split(',')
    modules_to_test = []
    for module_name in opts.modules.split(','):
        if module_name in snappy_python_modules:
            modules_to_test.append(module_name)
        else:
            print("Error: unrecognized module '%s'. Supported modules: %s" %
                  (module_name, ", ".join(snappy_python_modules)))
            sys.exit(-1)
    LOGGER.info("Will test against the following Python executables: %s", python_execs)
    LOGGER.info("Will test the following Python modules: %s", [x for x in modules_to_test])

    task_queue = Queue.PriorityQueue()
    for python_exec in python_execs:
        python_implementation = subprocess_check_output(
            [python_exec, "-c", "import platform; print(platform.python_implementation())"],
            universal_newlines=True).strip()
        LOGGER.info("%s python_implementation is %s", python_exec, python_implementation)
        LOGGER.info("%s version is: %s", python_exec, subprocess_check_output(
            [python_exec, "--version"], stderr=subprocess.STDOUT, universal_newlines=True).strip())
        for module in modules_to_test:
            test_goal = python_test_goals[module]
            task_queue.put((0, (python_exec, test_goal)))

    def process_queue(task_queue):
        while True:
            try:
                (priority, (python_exec, test_goal)) = task_queue.get_nowait()
            except Queue.Empty:
                break
            try:
                run_individual_python_test(test_goal, python_exec)
            finally:
                task_queue.task_done()

    start_time = time.time()
    for _ in range(opts.parallelism):
        worker = Thread(target=process_queue, args=(task_queue,))
        worker.daemon = True
        worker.start()
    try:
        task_queue.join()
    except (KeyboardInterrupt, SystemExit):
        print_red("Exiting due to interrupt")
        sys.exit(-1)
    total_duration = time.time() - start_time
    LOGGER.info("Tests passed in %i seconds", total_duration)
Example #5
def _check_coverage(python_exec):
    # Make sure coverage is installed.
    try:
        subprocess_check_output([python_exec, "-c", "import coverage"],
                                stderr=open(os.devnull, 'w'))
    except:
        print_red("Coverage is not installed in Python executable '%s' "
                  "but 'COVERAGE_PROCESS_START' environment variable is set, "
                  "exiting." % python_exec)
        sys.exit(-1)
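The probe in _check_coverage generalizes to any module: run the target interpreter with -c "import <module>" and treat a non-zero exit as "not installed". A sketch under that assumption (the has_module name is ours, not Spark's):

import os
import subprocess

def has_module(python_exec, module):
    # True if `import <module>` succeeds under python_exec; a failed
    # import (or a missing interpreter) is reported as False.
    try:
        subprocess.check_output([python_exec, "-c", "import %s" % module],
                                stderr=open(os.devnull, 'w'))
        return True
    except (subprocess.CalledProcessError, OSError):
        return False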
Example #6
def main():
    opts = parse_opts()
    if opts.verbose:
        log_level = logging.DEBUG
    else:
        log_level = logging.INFO
    logging.basicConfig(stream=sys.stdout,
                        level=log_level,
                        format="%(message)s")
    LOGGER.info("Running PySpark tests. Output is in %s", LOG_FILE)
    if os.path.exists(LOG_FILE):
        os.remove(LOG_FILE)
    python_execs = opts.python_executables.split(',')
    modules_to_test = []
    for module_name in opts.modules.split(','):
        if module_name in python_modules:
            modules_to_test.append(python_modules[module_name])
        else:
            print("Error: unrecognized module '%s'. Supported modules: %s" %
                  (module_name, ", ".join(python_modules)))
            sys.exit(-1)
    LOGGER.info("Will test against the following Python executables: %s",
                python_execs)
    LOGGER.info("Will test the following Python modules: %s",
                [x.name for x in modules_to_test])

    task_queue = Queue.PriorityQueue()
    for python_exec in python_execs:
        # Check if the python executable has coverage installed when 'COVERAGE_PROCESS_START'
        # environment variable is set.
        if "COVERAGE_PROCESS_START" in os.environ:
            _check_coverage(python_exec)

        python_implementation = subprocess_check_output(
            [python_exec, "-c", "import platform; print(platform.python_implementation())"],
            universal_newlines=True).strip()
        LOGGER.debug("%s python_implementation is %s", python_exec, python_implementation)
        LOGGER.debug("%s version is: %s", python_exec, subprocess_check_output(
            [python_exec, "--version"], stderr=subprocess.STDOUT, universal_newlines=True).strip())
        for module in modules_to_test:
            if python_implementation not in module.blacklisted_python_implementations:
                for test_goal in module.python_test_goals:
                    if test_goal in ('pyspark.streaming.tests',
                                     'pyspark.mllib.tests', 'pyspark.tests',
                                     'pyspark.sql.tests'):
                        priority = 0
                    else:
                        priority = 100
                    task_queue.put((priority, (python_exec, test_goal)))

    # Create the target directory before starting tasks to avoid races.
    target_dir = os.path.abspath(
        os.path.join(os.path.dirname(__file__), 'target'))
    if not os.path.isdir(target_dir):
        os.mkdir(target_dir)

    def process_queue(task_queue):
        while True:
            try:
                (priority, (python_exec, test_goal)) = task_queue.get_nowait()
            except Queue.Empty:
                break
            try:
                run_individual_python_test(target_dir, test_goal, python_exec)
            finally:
                task_queue.task_done()

    start_time = time.time()
    for _ in range(opts.parallelism):
        worker = Thread(target=process_queue, args=(task_queue,))
        worker.daemon = True
        worker.start()
    try:
        task_queue.join()
    except (KeyboardInterrupt, SystemExit):
        print_red("Exiting due to interrupt")
        sys.exit(-1)
    total_duration = time.time() - start_time
    LOGGER.info("Tests passed in %i seconds", total_duration)

    for key, lines in sorted(SKIPPED_TESTS.items()):
        pyspark_python, test_name = key
        LOGGER.info("\nSkipped tests in %s with %s:" %
                    (test_name, pyspark_python))
        for line in lines:
            LOGGER.info("    %s" % line.rstrip())
Example #7
def main():
    opts = parse_opts()
    if opts.verbose:
        log_level = logging.DEBUG
    else:
        log_level = logging.INFO
    should_test_modules = opts.testnames is None
    logging.basicConfig(stream=sys.stdout, level=log_level, format="%(message)s")
    LOGGER.info("Running PySpark tests. Output is in %s", LOG_FILE)
    if os.path.exists(LOG_FILE):
        os.remove(LOG_FILE)
    python_execs = opts.python_executables.split(',')
    LOGGER.info("Will test against the following Python executables: %s", python_execs)

    if should_test_modules:
        modules_to_test = []
        for module_name in opts.modules.split(','):
            if module_name in python_modules:
                modules_to_test.append(python_modules[module_name])
            else:
                print("Error: unrecognized module '%s'. Supported modules: %s" %
                      (module_name, ", ".join(python_modules)))
                sys.exit(-1)
        LOGGER.info("Will test the following Python modules: %s", [x.name for x in modules_to_test])
    else:
        testnames_to_test = opts.testnames.split(',')
        LOGGER.info("Will test the following Python tests: %s", testnames_to_test)

    task_queue = Queue.PriorityQueue()
    for python_exec in python_execs:
        # Check if the python executable has coverage installed when 'COVERAGE_PROCESS_START'
        # environment variable is set.
        if "COVERAGE_PROCESS_START" in os.environ:
            _check_coverage(python_exec)

        python_implementation = subprocess_check_output(
            [python_exec, "-c", "import platform; print(platform.python_implementation())"],
            universal_newlines=True).strip()
        LOGGER.debug("%s python_implementation is %s", python_exec, python_implementation)
        LOGGER.debug("%s version is: %s", python_exec, subprocess_check_output(
            [python_exec, "--version"], stderr=subprocess.STDOUT, universal_newlines=True).strip())
        if should_test_modules:
            for module in modules_to_test:
                if python_implementation not in module.blacklisted_python_implementations:
                    for test_goal in module.python_test_goals:
                        heavy_tests = ['pyspark.streaming.tests', 'pyspark.mllib.tests',
                                       'pyspark.tests', 'pyspark.sql.tests', 'pyspark.ml.tests']
                        if any(map(lambda prefix: test_goal.startswith(prefix), heavy_tests)):
                            priority = 0
                        else:
                            priority = 100
                        task_queue.put((priority, (python_exec, test_goal)))
        else:
            for test_goal in testnames_to_test:
                task_queue.put((0, (python_exec, test_goal)))

    # Create the target directory before starting tasks to avoid races.
    target_dir = os.path.abspath(os.path.join(os.path.dirname(__file__), 'target'))
    if not os.path.isdir(target_dir):
        os.mkdir(target_dir)

    def process_queue(task_queue):
        while True:
            try:
                (priority, (python_exec, test_goal)) = task_queue.get_nowait()
            except Queue.Empty:
                break
            try:
                run_individual_python_test(target_dir, test_goal, python_exec)
            finally:
                task_queue.task_done()

    start_time = time.time()
    for _ in range(opts.parallelism):
        worker = Thread(target=process_queue, args=(task_queue,))
        worker.daemon = True
        worker.start()
    try:
        task_queue.join()
    except (KeyboardInterrupt, SystemExit):
        print_red("Exiting due to interrupt")
        sys.exit(-1)
    total_duration = time.time() - start_time
    LOGGER.info("Tests passed in %i seconds", total_duration)

    for key, lines in sorted(SKIPPED_TESTS.items()):
        pyspark_python, test_name = key
        LOGGER.info("\nSkipped tests in %s with %s:" % (test_name, pyspark_python))
        for line in lines:
            LOGGER.info("    %s" % line.rstrip())
Example #8
    env = get_build_environment()
    mtt = modules_to_test(env)

    circleNodeIndex = os.getenv("CIRCLE_NODE_INDEX")
    circleNodeTotal = os.getenv("CIRCLE_NODE_TOTAL")
    if circleNodeTotal is not None:
        length = len(all_python_executables)
        # Use floor division so the slice bounds stay integers under Python 3.
        fromExec = int(circleNodeIndex) * length // int(circleNodeTotal)
        toExec = (int(circleNodeIndex) + 1) * length // int(circleNodeTotal)
        python_executables_for_run = all_python_executables[fromExec:toExec]
    else:
        python_executables_for_run = all_python_executables

    LOGGER.info("Testing following python executables in this run: %s", python_executables_for_run)

    modules_with_python_tests = [m for m in mtt.test_modules if m.python_test_goals]
    if modules_with_python_tests:
        run_python_tests(modules_with_python_tests, 8, python_executables_for_run)

        # Packaging tests create a conda environment for each python version
        # We'd like to use the same version that our executables above use
        python_exact_versions = [
            subprocess_check_output(
                [python_exec, "-c", "import platform; print(platform.python_version())"],
                universal_newlines=True).strip()
            for python_exec in python_executables_for_run
        ]
        LOGGER.info("Running python packaging tests for following python versions using conda: %s",
                    python_exact_versions)
        run_python_packaging_tests(use_conda=True, python_versions=python_exact_versions)
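The CircleCI branch shards all_python_executables across build nodes: node i of n takes the half-open slice [i*len//n, (i+1)*len//n), so every executable lands on exactly one node. A self-contained sketch of that arithmetic (the shard name is ours):

def shard(items, index, total):
    # Floor division keeps the bounds integral and the slices disjoint.
    start = index * len(items) // total
    end = (index + 1) * len(items) // total
    return items[start:end]

print(shard(['py27', 'py35', 'py36'], 0, 2))  # ['py27']
print(shard(['py27', 'py35', 'py36'], 1, 2))  # ['py35', 'py36']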