Example #1
def determine_java_executable():
    """Will return the path of the java executable that will be used by Spark's
    tests or `None`"""

    # Any changes in the way that Spark's build detects java must be reflected
    # here. Currently the build looks for $JAVA_HOME/bin/java then falls back to
    # the `java` executable on the path

    java_home = os.environ.get("JAVA_HOME")

    # check if there is an executable at $JAVA_HOME/bin/java
    java_exe = which(os.path.join(java_home, "bin", "java")) if java_home else None
    # if the java_exe wasn't set, check for a `java` version on the $PATH
    return java_exe if java_exe else which("java")
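
Every example on this page relies on a helper named which(). The snippet below is only a minimal sketch of what such a helper typically looks like, not the exact implementation used by Spark's test scripts: it assumes shutil.which-style semantics, returning the resolved path of an executable (searching $PATH when given a bare name) or None when nothing is found. On Python 3.3+ the standard library's shutil.which offers the same behavior.

import os


def which(program):
    """Sketch: return the path of `program` if it is executable, else None."""
    def is_exe(path):
        # An executable must be a regular file with the execute bit set.
        return os.path.isfile(path) and os.access(path, os.X_OK)

    if os.path.dirname(program):
        # A path was given (e.g. "$JAVA_HOME/bin/java"): check it directly.
        return program if is_exe(program) else None
    # A bare name was given (e.g. "java"): search every directory on $PATH.
    for directory in os.environ.get("PATH", "").split(os.pathsep):
        candidate = os.path.join(directory, program)
        if is_exe(candidate):
            return candidate
    return None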
Example #2
def run_sparkr_tests():
    set_title_and_block("Running SparkR tests", "BLOCK_SPARKR_UNIT_TESTS")

    if which("R"):
        run_cmd([os.path.join(SPARK_HOME, "R", "run-tests.sh")])
    else:
        print("Ignoring SparkR tests as R was not found in PATH")
Example #3
def run_individual_python_test(test_name, pyspark_python):
    env = {'SPARK_TESTING': '1', 'PYSPARK_PYTHON': which(pyspark_python)}
    LOGGER.debug("Starting test(%s): %s", pyspark_python, test_name)
    start_time = time.time()
    per_test_output = tempfile.TemporaryFile()
    retcode = subprocess.Popen(
        [os.path.join(SPARK_HOME, "bin/pyspark"), test_name],
        stderr=per_test_output, stdout=per_test_output, env=env).wait()
    duration = time.time() - start_time
    # Exit on the first failure.
    if retcode != 0:
        with FAILURE_REPORTING_LOCK:
            with open(LOG_FILE, 'ab') as log_file:
                per_test_output.seek(0)
                log_file.writelines(per_test_output.readlines())
            per_test_output.seek(0)
            for line in per_test_output:
                if not re.match('[0-9]+', line):
                    print(line, end='')
            per_test_output.close()
            print_red("\nHad test failures in %s with %s; see logs." % (test_name, pyspark_python))
            # Here, we use os._exit() instead of sys.exit() in order to force Python to exit even if
            # this code is invoked from a thread other than the main thread.
            os._exit(-1)
    else:
        per_test_output.close()
        LOGGER.info("Finished test(%s): %s (%is)", pyspark_python, test_name, duration)
Example #4
def get_default_python_executables():
    python_execs = [x for x in ["python2.7", "python3.4", "pypy"] if which(x)]
    if "python2.7" not in python_execs:
        LOGGER.warning("Not testing against `python2.7` because it could not be found; falling"
                       " back to `python` instead")
        python_execs.insert(0, "python")
    return python_execs
Example #5
def kill_zinc_on_port(zinc_port):
    """
    Kill the Zinc process running on the given port, if one exists.
    """
    cmd = "%s -P |grep %s | grep LISTEN | awk '{ print $2; }' | xargs kill"
    lsof_exe = which("lsof")
    subprocess.check_call(cmd % (lsof_exe if lsof_exe else "/usr/sbin/lsof", zinc_port), shell=True)
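
For illustration only, the sketch below assembles the same lsof pipeline that kill_zinc_on_port() runs and merely prints it; the port number and the lsof location are assumed placeholder values, not taken from the source.

# Assumed values for the illustration.
lsof_exe = "/usr/bin/lsof"   # pretend which("lsof") resolved to this path
zinc_port = 3030             # hypothetical Zinc port

cmd = "%s -P |grep %s | grep LISTEN | awk '{ print $2; }' | xargs kill"
print(cmd % (lsof_exe, zinc_port))
# /usr/bin/lsof -P |grep 3030 | grep LISTEN | awk '{ print $2; }' | xargs kill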
Example #6
def run_individual_python_test(test_name, pyspark_python):
    env = dict(os.environ)
    env.update({
        'SPARK_DIST_CLASSPATH': SPARK_DIST_CLASSPATH,
        'SPARK_TESTING': '1',
        'SPARK_PREPEND_CLASSES': '1',
        'PYSPARK_PYTHON': which(pyspark_python),
        'PYSPARK_DRIVER_PYTHON': which(pyspark_python)
    })
    LOGGER.info("Starting test(%s): %s", pyspark_python, test_name)
    start_time = time.time()
    try:
        per_test_output = tempfile.TemporaryFile()
        retcode = subprocess.Popen(
            [os.path.join(SPARK_HOME, "bin/pyspark"), test_name],
            stderr=per_test_output, stdout=per_test_output, env=env).wait()
    except:
        LOGGER.exception("Got exception while running %s with %s", test_name, pyspark_python)
        # Here, we use os._exit() instead of sys.exit() in order to force Python to exit even if
        # this code is invoked from a thread other than the main thread.
        os._exit(1)
    duration = time.time() - start_time
    # Exit on the first failure.
    if retcode != 0:
        try:
            with FAILURE_REPORTING_LOCK:
                with open(LOG_FILE, 'ab') as log_file:
                    per_test_output.seek(0)
                    log_file.writelines(per_test_output)
                per_test_output.seek(0)
                for line in per_test_output:
                    decoded_line = line.decode()
                    if not re.match('[0-9]+', decoded_line):
                        print(decoded_line, end='')
                per_test_output.close()
        except:
            LOGGER.exception("Got an exception while trying to print failed test output")
        finally:
            print_red("\nHad test failures in %s with %s; see logs." % (test_name, pyspark_python))
            # Here, we use os._exit() instead of sys.exit() in order to force Python to exit even if
            # this code is invoked from a thread other than the main thread.
            os._exit(-1)
    else:
        per_test_output.close()
        LOGGER.info("Finished test(%s): %s (%is)", pyspark_python, test_name, duration)
Example #7
def get_default_python_executables():
    python_execs = [x for x in ["python2.6", "python3.4", "pypy"] if which(x)]
    if "python2.6" not in python_execs:
        print(
            "WARNING: Not testing against `python2.6` because it could not be found; falling"
            " back to `python` instead"
        )
        python_execs.insert(0, "python")
    return python_execs
Example #8
def run_sparkr_style_checks():
    set_title_and_block("Running R style checks", "BLOCK_R_STYLE")

    if which("R"):
        # The R style check should be executed after `install-dev.sh`,
        # since warnings about `no visible global function definition` appear
        # without the installation. SEE ALSO: SPARK-9121.
        run_cmd([os.path.join(SPARK_HOME, "dev", "lint-r")])
    else:
        print("Ignoring SparkR style check as R was not found in PATH")
Example #9
def build_spark_documentation():
    set_title_and_block("Building Spark Documentation", "BLOCK_DOCUMENTATION")
    os.environ["PRODUCTION"] = "1 jekyll build"

    os.chdir(os.path.join(SPARK_HOME, "docs"))

    jekyll_bin = which("jekyll")

    if not jekyll_bin:
        print("[error] Cannot find a version of `jekyll` on the system; please",
              " install one and retry to build documentation.")
        sys.exit(int(os.environ.get("CURRENT_BLOCK", 255)))
    else:
        run_cmd([jekyll_bin, "build"])

    os.chdir(SPARK_HOME)
Example #10
def build_spark_documentation():
    set_title_and_block("Building Spark Documentation", "BLOCK_DOCUMENTATION")
    os.environ["PRODUCTION"] = "1 jekyll build"

    os.chdir(os.path.join(SPARK_HOME, "docs"))

    jekyll_bin = which("jekyll")

    if not jekyll_bin:
        print(
            "[error] Cannot find a version of `jekyll` on the system; please",
            " install one and retry to build documentation.")
        sys.exit(int(os.environ.get("CURRENT_BLOCK", 255)))
    else:
        run_cmd([jekyll_bin, "build"])

    os.chdir(SPARK_HOME)
Example #11
def run_individual_python_test(test_name, pyspark_python):
    env = dict(os.environ)
    env.update({'SPARK_TESTING': '1', 'PYSPARK_PYTHON': which(pyspark_python)})
    LOGGER.debug("Starting test(%s): %s", pyspark_python, test_name)
    start_time = time.time()
    try:
        per_test_output = tempfile.TemporaryFile()
        retcode = subprocess.Popen(
            [os.path.join(SPARK_HOME, "bin/pyspark"), test_name],
            stderr=per_test_output,
            stdout=per_test_output,
            env=env).wait()
    except:
        LOGGER.exception("Got exception while running %s with %s", test_name,
                         pyspark_python)
        # Here, we use os._exit() instead of sys.exit() in order to force Python to exit even if
        # this code is invoked from a thread other than the main thread.
        os._exit(1)
    duration = time.time() - start_time
    # Exit on the first failure.
    if retcode != 0:
        try:
            with FAILURE_REPORTING_LOCK:
                with open(LOG_FILE, 'ab') as log_file:
                    per_test_output.seek(0)
                    log_file.writelines(per_test_output)
                per_test_output.seek(0)
                for line in per_test_output:
                    decoded_line = line.decode()
                    if not re.match('[0-9]+', decoded_line):
                        print(decoded_line, end='')
                per_test_output.close()
        except:
            LOGGER.exception(
                "Got an exception while trying to print failed test output")
        finally:
            print_red("\nHad test failures in %s with %s; see logs." %
                      (test_name, pyspark_python))
            # Here, we use os._exit() instead of sys.exit() in order to force Python to exit even if
            # this code is invoked from a thread other than the main thread.
            os._exit(-1)
    else:
        per_test_output.close()
        LOGGER.info("Finished test(%s): %s (%is)", pyspark_python, test_name,
                    duration)
Example #12
def run_individual_python_test(test_name, pyspark_python):
    env = {"SPARK_TESTING": "1", "PYSPARK_PYTHON": which(pyspark_python)}
    print("    Running test: %s ..." % test_name, end="")
    start_time = time.time()
    with open(LOG_FILE, "a") as log_file:
        retcode = subprocess.call(
            [os.path.join(SPARK_HOME, "bin/pyspark"), test_name], stderr=log_file, stdout=log_file, env=env
        )
    duration = time.time() - start_time
    # Exit on the first failure.
    if retcode != 0:
        with open(LOG_FILE, "r") as log_file:
            for line in log_file:
                if not re.match("[0-9]+", line):
                    print(line, end="")
        print_red("\nHad test failures in %s; see logs." % test_name)
        exit(-1)
    else:
        print("ok (%is)" % duration)
Example #13
def run_individual_python_test(test_name, pyspark_python):
    env = {'SPARK_TESTING': '1', 'PYSPARK_PYTHON': which(pyspark_python)}
    print("    Running test: %s ..." % test_name, end='')
    start_time = time.time()
    with open(LOG_FILE, 'a') as log_file:
        retcode = subprocess.call(
            [os.path.join(SPARK_HOME, "bin/pyspark"), test_name],
            stderr=log_file,
            stdout=log_file,
            env=env)
    duration = time.time() - start_time
    # Exit on the first failure.
    if retcode != 0:
        with open(LOG_FILE, 'r') as log_file:
            for line in log_file:
                if not re.match('[0-9]+', line):
                    print(line, end='')
        print_red("\nHad test failures in %s; see logs." % test_name)
        exit(-1)
    else:
        print("ok (%is)" % duration)
Example #14
def run_python_tests(test_modules, parallelism, with_coverage=False):
    set_title_and_block("Running PySpark tests", "BLOCK_PYSPARK_UNIT_TESTS")

    if with_coverage:
        # Coverage makes the PySpark tests flaky due to heavy parallelism.
        # When we run PySpark tests with coverage, parallelism is capped at 4
        # for now as a workaround.
        parallelism = 4
        script = "run-tests-with-coverage"
    else:
        script = "run-tests"
    command = [os.path.join(SPARK_HOME, "python", script)]
    if test_modules != [modules.root]:
        command.append("--modules=%s" % ','.join(m.name for m in test_modules))
    command.append("--parallelism=%i" % parallelism)
    if "GITHUB_ACTIONS" in os.environ:
        # See SPARK-33565. Python 3.9 was temporarily removed from the default Python
        # executables to test because of a Jenkins environment issue. Once Jenkins can
        # test with Python 3.9, we should revert this change and add python3.9 back into
        # the python/run-tests.py script.
        command.append("--python-executable=%s" %
                       ','.join(x for x in ["python3.9", "pypy3"] if which(x)))
    run_cmd(command)
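
To make the command construction in run_python_tests() concrete, here is a small standalone sketch of the argument list it ends up building; SPARK_HOME, the module names, and the parallelism below are assumed placeholder values, not values from the source.

import os

SPARK_HOME = "/opt/spark"                       # assumed install location
module_names = ["pyspark-core", "pyspark-sql"]  # stand-ins for m.name values
parallelism = 8                                 # assumed --parallelism value

command = [os.path.join(SPARK_HOME, "python", "run-tests")]
command.append("--modules=%s" % ",".join(module_names))
command.append("--parallelism=%i" % parallelism)
print(command)
# ['/opt/spark/python/run-tests', '--modules=pyspark-core,pyspark-sql', '--parallelism=8']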
Example #15
def run_individual_python_test(target_dir, test_name, pyspark_python):
    env = dict(os.environ)
    env.update({
        'SPARK_DIST_CLASSPATH': SPARK_DIST_CLASSPATH,
        'SPARK_TESTING': '1',
        'SPARK_PREPEND_CLASSES': '1',
        'PYSPARK_PYTHON': which(pyspark_python),
        'PYSPARK_DRIVER_PYTHON': which(pyspark_python)
    })

    # Create a unique temp directory under 'target/' for each run. The TMPDIR variable is
    # recognized by the tempfile module to override the default system temp directory.
    tmp_dir = os.path.join(target_dir, str(uuid.uuid4()))
    while os.path.isdir(tmp_dir):
        tmp_dir = os.path.join(target_dir, str(uuid.uuid4()))
    os.mkdir(tmp_dir)
    env["TMPDIR"] = tmp_dir

    # Also override the JVM's temp directory by setting driver and executor options.
    spark_args = [
        "--conf",
        "spark.driver.extraJavaOptions=-Djava.io.tmpdir={0}".format(tmp_dir),
        "--conf",
        "spark.executor.extraJavaOptions=-Djava.io.tmpdir={0}".format(tmp_dir),
        "pyspark-shell"
    ]
    env["PYSPARK_SUBMIT_ARGS"] = " ".join(spark_args)

    LOGGER.info("Starting test(%s): %s", pyspark_python, test_name)
    start_time = time.time()
    try:
        per_test_output = tempfile.TemporaryFile()
        retcode = subprocess.Popen(
            [os.path.join(SPARK_HOME, "bin/pyspark"), test_name],
            stderr=per_test_output,
            stdout=per_test_output,
            env=env).wait()
        shutil.rmtree(tmp_dir, ignore_errors=True)
    except:
        LOGGER.exception("Got exception while running %s with %s", test_name,
                         pyspark_python)
        # Here, we use os._exit() instead of sys.exit() in order to force Python to exit even if
        # this code is invoked from a thread other than the main thread.
        os._exit(1)
    duration = time.time() - start_time
    # Exit on the first failure.
    if retcode != 0:
        try:
            with FAILURE_REPORTING_LOCK:
                with open(LOG_FILE, 'ab') as log_file:
                    per_test_output.seek(0)
                    log_file.writelines(per_test_output)
                per_test_output.seek(0)
                for line in per_test_output:
                    decoded_line = line.decode("utf-8", "replace")
                    if not re.match('[0-9]+', decoded_line):
                        print(decoded_line, end='')
                per_test_output.close()
        except:
            LOGGER.exception(
                "Got an exception while trying to print failed test output")
        finally:
            print_red("\nHad test failures in %s with %s; see logs." %
                      (test_name, pyspark_python))
            # Here, we use os._exit() instead of sys.exit() in order to force Python to exit even if
            # this code is invoked from a thread other than the main thread.
            os._exit(-1)
    else:
        skipped_counts = 0
        try:
            per_test_output.seek(0)
            # This expects skipped test output from unittest when the verbosity
            # level is 2 (or the --verbose option is enabled).
            decoded_lines = map(lambda line: line.decode("utf-8", "replace"),
                                iter(per_test_output))
            skipped_tests = list(
                filter(
                    lambda line: re.search(
                        r'test_.* \(pyspark\..*\) ... skipped ', line),
                    decoded_lines))
            skipped_counts = len(skipped_tests)
            if skipped_counts > 0:
                key = (pyspark_python, test_name)
                assert SKIPPED_TESTS is not None
                SKIPPED_TESTS[key] = skipped_tests
            per_test_output.close()
        except:
            import traceback
            print_red("\nGot an exception while trying to store "
                      "skipped test output:\n%s" % traceback.format_exc())
            # Here, we use os._exit() instead of sys.exit() in order to force Python to exit even if
            # this code is invoked from a thread other than the main thread.
            os._exit(-1)
        if skipped_counts != 0:
            LOGGER.info(
                "Finished test(%s): %s (%is) ... %s tests were skipped",
                pyspark_python, test_name, duration, skipped_counts)
        else:
            LOGGER.info("Finished test(%s): %s (%is)", pyspark_python,
                        test_name, duration)
Example #16
def run_individual_python_test(target_dir, test_name, pyspark_python, failed_tests_deque):
    env = dict(os.environ)
    env.update({
        'SPARK_DIST_CLASSPATH': SPARK_DIST_CLASSPATH,
        'SPARK_TESTING': '1',
        'SPARK_PREPEND_CLASSES': '1',
        'PYSPARK_PYTHON': which(pyspark_python),
        'PYSPARK_DRIVER_PYTHON': which(pyspark_python)
    })

    # Create a unique temp directory under 'target/' for each run. The TMPDIR variable is
    # recognized by the tempfile module to override the default system temp directory.
    tmp_dir = os.path.join(target_dir, str(uuid.uuid4()))
    while os.path.isdir(tmp_dir):
        tmp_dir = os.path.join(target_dir, str(uuid.uuid4()))
    os.mkdir(tmp_dir)
    env["TMPDIR"] = tmp_dir

    # Also override the JVM's temp directory by setting driver and executor options.
    spark_args = [
        "--conf", "spark.driver.extraJavaOptions=-Djava.io.tmpdir={0}".format(tmp_dir),
        "--conf", "spark.executor.extraJavaOptions=-Djava.io.tmpdir={0}".format(tmp_dir),
        "pyspark-shell"
    ]
    env["PYSPARK_SUBMIT_ARGS"] = " ".join(spark_args)

    LOGGER.info("Starting test(%s): %s", pyspark_python, test_name)
    start_time = time.time()
    # This line must come before the try block; the file shouldn't be closed before 'worker' is done
    per_test_output = tempfile.TemporaryFile()
    try:
        process = subprocess.Popen(
            [os.path.join(SPARK_HOME, "bin/pyspark"), test_name],
            # bufsize must be 0 (unbuffered); 1 (line buffered) doesn't seem to work
            stderr=subprocess.STDOUT, stdout=subprocess.PIPE, env=env, bufsize=0,
            universal_newlines=True)

        def consume_log(output):
            for line in process.stdout:
                print("({}) {} - {}".format(pyspark_python, test_name, line), end=b'')
                print(line, file=output, end=b'')

        worker = Thread(target=consume_log, args=(per_test_output,))
        worker.start()  # This is essential as we need to consume the stdout pipe
        retcode = process.wait()
        shutil.rmtree(tmp_dir, ignore_errors=True)
    except:
        LOGGER.exception("Got exception while running %s with %s", test_name, pyspark_python)
        # Here, we use os._exit() instead of sys.exit() in order to force Python to exit even if
        # this code is invoked from a thread other than the main thread.
        os._exit(-1)
    duration = time.time() - start_time
    worker.join()  # Wait on the thread that consumed the output
    # If it failed, append output to LOG_FILE, add to failed tests and carry on
    if retcode != 0:
        try:
            with FAILURE_REPORTING_LOCK:
                with open(LOG_FILE, 'ab') as log_file:
                    per_test_output.seek(0)
                    log_file.writelines(per_test_output)
                per_test_output.seek(0)
                for line in per_test_output:
                    decoded_line = line.decode()
                    if not re.match('[0-9]+', decoded_line):
                        print(decoded_line, end='')
                per_test_output.close()
        except:
            LOGGER.exception("Got an exception while trying to print failed test output")
        finally:
            print_red("\nHad test failures in %s with %s; see logs." % (test_name, pyspark_python))
            failed_tests_deque.append((test_name, pyspark_python))
    else:
        skipped_counts = 0
        try:
            per_test_output.seek(0)
            # This expects skipped test output from unittest when the verbosity
            # level is 2 (or the --verbose option is enabled).
            decoded_lines = map(lambda line: line.decode(), iter(per_test_output))
            skipped_tests = list(filter(
                lambda line: re.search('test_.* \(pyspark\..*\) ... skipped ', line),
                decoded_lines))
            skipped_counts = len(skipped_tests)
            if skipped_counts > 0:
                key = (pyspark_python, test_name)
                SKIPPED_TESTS[key] = skipped_tests
            per_test_output.close()
        except:
            import traceback
            print_red("\nGot an exception while trying to store "
                      "skipped test output:\n%s" % traceback.format_exc())
            # Here, we use os._exit() instead of sys.exit() in order to force Python to exit even if
            # this code is invoked from a thread other than the main thread.
            os._exit(-1)
        if skipped_counts != 0:
            LOGGER.info(
                "Finished test(%s): %s (%is) ... %s tests were skipped", pyspark_python, test_name,
                duration, skipped_counts)
        else:
            LOGGER.info(
                "Finished test(%s): %s (%is)", pyspark_python, test_name, duration)
Example #17
def run_individual_python_test(test_name, pyspark_python):
    env = dict(os.environ)
    env.update({
        'SPARK_DIST_CLASSPATH': SPARK_DIST_CLASSPATH,
        'SPARK_TESTING': '1',
        'SPARK_PREPEND_CLASSES': '1',
        'PYSPARK_PYTHON': which(pyspark_python),
        'PYSPARK_DRIVER_PYTHON': which(pyspark_python)
    })
    LOGGER.info("Starting test(%s): %s", pyspark_python, test_name)
    start_time = time.time()
    try:
        per_test_output = tempfile.TemporaryFile()
        retcode = subprocess.Popen(
            [os.path.join(SPARK_HOME, "bin/pyspark"), test_name],
            stderr=per_test_output,
            stdout=per_test_output,
            env=env).wait()
    except:
        LOGGER.exception("Got exception while running %s with %s", test_name,
                         pyspark_python)
        # Here, we use os._exit() instead of sys.exit() in order to force Python to exit even if
        # this code is invoked from a thread other than the main thread.
        os._exit(1)
    duration = time.time() - start_time
    # Exit on the first failure.
    if retcode != 0:
        try:
            with FAILURE_REPORTING_LOCK:
                with open(LOG_FILE, 'ab') as log_file:
                    per_test_output.seek(0)
                    log_file.writelines(per_test_output)
                per_test_output.seek(0)
                for line in per_test_output:
                    decoded_line = line.decode()
                    if not re.match('[0-9]+', decoded_line):
                        print(decoded_line, end='')
                per_test_output.close()
        except:
            LOGGER.exception(
                "Got an exception while trying to print failed test output")
        finally:
            print_red("\nHad test failures in %s with %s; see logs." %
                      (test_name, pyspark_python))
            # Here, we use os._exit() instead of sys.exit() in order to force Python to exit even if
            # this code is invoked from a thread other than the main thread.
            os._exit(-1)
    else:
        skipped_counts = 0
        try:
            per_test_output.seek(0)
            # This expects skipped test output from unittest when the verbosity
            # level is 2 (or the --verbose option is enabled).
            decoded_lines = map(lambda line: line.decode(),
                                iter(per_test_output))
            skipped_tests = list(
                filter(
                    lambda line: re.search(
                        'test_.* \(pyspark\..*\) ... skipped ', line),
                    decoded_lines))
            skipped_counts = len(skipped_tests)
            if skipped_counts > 0:
                key = (pyspark_python, test_name)
                SKIPPED_TESTS[key] = skipped_tests
            per_test_output.close()
        except:
            import traceback
            print_red("\nGot an exception while trying to store "
                      "skipped test output:\n%s" % traceback.format_exc())
            # Here, we use os._exit() instead of sys.exit() in order to force Python to exit even if
            # this code is invoked from a thread other than the main thread.
            os._exit(-1)
        if skipped_counts != 0:
            LOGGER.info(
                "Finished test(%s): %s (%is) ... %s tests were skipped",
                pyspark_python, test_name, duration, skipped_counts)
        else:
            LOGGER.info("Finished test(%s): %s (%is)", pyspark_python,
                        test_name, duration)
Example #18
def main():
    opts = parse_opts()
    # Ensure the user home directory (HOME) is valid and is an absolute directory
    if not USER_HOME or not os.path.isabs(USER_HOME):
        print("[error] Cannot determine your home directory as an absolute path;",
              " ensure the $HOME environment variable is set properly.")
        sys.exit(1)

    os.chdir(SPARK_HOME)

    rm_r(os.path.join(SPARK_HOME, "work"))
    rm_r(os.path.join(USER_HOME, ".ivy2", "local", "org.apache.spark"))
    rm_r(os.path.join(USER_HOME, ".ivy2", "cache", "org.apache.spark"))

    os.environ["CURRENT_BLOCK"] = str(ERROR_CODES["BLOCK_GENERAL"])

    java_exe = determine_java_executable()

    if not java_exe:
        print("[error] Cannot find a version of `java` on the system; please",
              " install one and retry.")
        sys.exit(2)

    java_version = determine_java_version(java_exe)

    if java_version.minor < 8:
        print("[warn] Java 8 tests will not run because JDK version is < 1.8.")

    # install SparkR
    if which("R"):
        run_cmd([os.path.join(SPARK_HOME, "R", "install-dev.sh")])
    else:
        print("Cannot install SparkR as R was not found in PATH")

    if os.environ.get("AMPLAB_JENKINS"):
        # if we're on the Amplab Jenkins build servers setup variables
        # to reflect the environment settings
        build_tool = os.environ.get("AMPLAB_JENKINS_BUILD_TOOL", "sbt")
        hadoop_version = os.environ.get("AMPLAB_JENKINS_BUILD_PROFILE", "hadoop2.6")
        test_env = "amplab_jenkins"
        # add path for Python3 in Jenkins if we're calling from a Jenkins machine
        os.environ["PATH"] = "/home/anaconda/envs/py3k/bin:" + os.environ.get("PATH")
    else:
        # else we're running locally and can use local settings
        build_tool = "sbt"
        hadoop_version = os.environ.get("HADOOP_PROFILE", "hadoop2.6")
        test_env = "local"

    print("[info] Using build tool", build_tool, "with Hadoop profile", hadoop_version,
          "under environment", test_env)

    changed_modules = None
    changed_files = None
    if test_env == "amplab_jenkins" and os.environ.get("AMP_JENKINS_PRB"):
        target_branch = os.environ["ghprbTargetBranch"]
        changed_files = identify_changed_files_from_git_commits("HEAD", target_branch=target_branch)
        changed_modules = determine_modules_for_files(changed_files)
        excluded_tags = determine_tags_to_exclude(changed_modules)
    if not changed_modules:
        changed_modules = [modules.root]
        excluded_tags = []
    print("[info] Found the following changed modules:",
          ", ".join(x.name for x in changed_modules))

    # setup environment variables
    # note - the 'root' module doesn't collect environment variables for all modules. Because the
    # environment variables should not be set if a module is not changed, even if running the 'root'
    # module. So here we should use changed_modules rather than test_modules.
    test_environ = {}
    for m in changed_modules:
        test_environ.update(m.environ)
    setup_test_environ(test_environ)

    test_modules = determine_modules_to_test(changed_modules)

    # license checks
    run_apache_rat_checks()

    # style checks
    if not changed_files or any(f.endswith(".scala")
                                or f.endswith("scalastyle-config.xml")
                                for f in changed_files):
        run_scala_style_checks()
    if not changed_files or any(f.endswith(".java")
                                or f.endswith("checkstyle.xml")
                                or f.endswith("checkstyle-suppressions.xml")
                                for f in changed_files):
        # run_java_style_checks()
        pass
    if not changed_files or any(f.endswith(".py") for f in changed_files):
        run_python_style_checks()
    if not changed_files or any(f.endswith(".R") for f in changed_files):
        run_sparkr_style_checks()

    # determine if docs were changed and if we're inside the amplab environment
    # note - the below commented out until *all* Jenkins workers can get `jekyll` installed
    # if "DOCS" in changed_modules and test_env == "amplab_jenkins":
    #    build_spark_documentation()

    if any(m.should_run_build_tests for m in test_modules):
        run_build_tests()

    # spark build
    build_apache_spark(build_tool, hadoop_version)

    # backwards compatibility checks
    if build_tool == "sbt":
        # Note: compatibility tests only supported in sbt for now
        detect_binary_inop_with_mima(hadoop_version)
        # Since we did not build assembly/package before running dev/mima, we need to
        # do it here because the tests still rely on it; see SPARK-13294 for details.
        build_spark_assembly_sbt(hadoop_version)

    # run the test suites
    run_scala_tests(build_tool, hadoop_version, test_modules, excluded_tags)

    modules_with_python_tests = [m for m in test_modules if m.python_test_goals]
    if modules_with_python_tests:
        run_python_tests(modules_with_python_tests, opts.parallelism)
        run_python_packaging_tests()
    if any(m.should_run_r_tests for m in test_modules):
        run_sparkr_tests()
Example #19
def run_individual_python_test(target_dir, test_name, pyspark_python):
    env = dict(os.environ)
    env.update({
        'SPARK_DIST_CLASSPATH': SPARK_DIST_CLASSPATH,
        'SPARK_TESTING': '1',
        'SPARK_PREPEND_CLASSES': '1',
        'PYSPARK_PYTHON': which(pyspark_python),
        'PYSPARK_DRIVER_PYTHON': which(pyspark_python)
    })

    # Create a unique temp directory under 'target/' for each run. The TMPDIR variable is
    # recognized by the tempfile module to override the default system temp directory.
    tmp_dir = os.path.join(target_dir, str(uuid.uuid4()))
    while os.path.isdir(tmp_dir):
        tmp_dir = os.path.join(target_dir, str(uuid.uuid4()))
    os.mkdir(tmp_dir)
    env["TMPDIR"] = tmp_dir

    # Also override the JVM's temp directory by setting driver and executor options.
    spark_args = [
        "--conf", "spark.driver.extraJavaOptions=-Djava.io.tmpdir={0}".format(tmp_dir),
        "--conf", "spark.executor.extraJavaOptions=-Djava.io.tmpdir={0}".format(tmp_dir),
        "pyspark-shell"
    ]
    env["PYSPARK_SUBMIT_ARGS"] = " ".join(spark_args)

    LOGGER.info("Starting test(%s): %s", pyspark_python, test_name)
    start_time = time.time()
    try:
        per_test_output = tempfile.TemporaryFile()
        retcode = subprocess.Popen(
            [os.path.join(SPARK_HOME, "bin/pyspark")] + test_name.split(),
            stderr=per_test_output, stdout=per_test_output, env=env).wait()
        shutil.rmtree(tmp_dir, ignore_errors=True)
    except:
        LOGGER.exception("Got exception while running %s with %s", test_name, pyspark_python)
        # Here, we use os._exit() instead of sys.exit() in order to force Python to exit even if
        # this code is invoked from a thread other than the main thread.
        os._exit(1)
    duration = time.time() - start_time
    # Exit on the first failure.
    if retcode != 0:
        try:
            with FAILURE_REPORTING_LOCK:
                with open(LOG_FILE, 'ab') as log_file:
                    per_test_output.seek(0)
                    log_file.writelines(per_test_output)
                per_test_output.seek(0)
                for line in per_test_output:
                    decoded_line = line.decode()
                    if not re.match('[0-9]+', decoded_line):
                        print(decoded_line, end='')
                per_test_output.close()
        except:
            LOGGER.exception("Got an exception while trying to print failed test output")
        finally:
            print_red("\nHad test failures in %s with %s; see logs." % (test_name, pyspark_python))
            # Here, we use os._exit() instead of sys.exit() in order to force Python to exit even if
            # this code is invoked from a thread other than the main thread.
            os._exit(-1)
    else:
        skipped_counts = 0
        try:
            per_test_output.seek(0)
            # This expects skipped test output from unittest when the verbosity
            # level is 2 (or the --verbose option is enabled).
            decoded_lines = map(lambda line: line.decode(), iter(per_test_output))
            skipped_tests = list(filter(
                lambda line: re.search(r'test_.* \(pyspark\..*\) ... skipped ', line),
                decoded_lines))
            skipped_counts = len(skipped_tests)
            if skipped_counts > 0:
                key = (pyspark_python, test_name)
                SKIPPED_TESTS[key] = skipped_tests
            per_test_output.close()
        except:
            import traceback
            print_red("\nGot an exception while trying to store "
                      "skipped test output:\n%s" % traceback.format_exc())
            # Here, we use os._exit() instead of sys.exit() in order to force Python to exit even if
            # this code is invoked from a thread other than the main thread.
            os._exit(-1)
        if skipped_counts != 0:
            LOGGER.info(
                "Finished test(%s): %s (%is) ... %s tests were skipped", pyspark_python, test_name,
                duration, skipped_counts)
        else:
            LOGGER.info(
                "Finished test(%s): %s (%is)", pyspark_python, test_name, duration)
Example #20
def run_individual_python_test(target_dir, test_name, pyspark_python):
    env = dict(os.environ)
    env.update({
        'SPARK_DIST_CLASSPATH': SPARK_DIST_CLASSPATH,
        'SPARK_TESTING': '1',
        'SPARK_PREPEND_CLASSES': '1',
        'PYSPARK_PYTHON': which(pyspark_python),
        'PYSPARK_DRIVER_PYTHON': which(pyspark_python),
        # Preserve legacy nested timezone behavior for pyarrow>=2, remove after SPARK-32285
        'PYARROW_IGNORE_TIMEZONE': '1',
    })

    # Create a unique temp directory under 'target/' for each run. The TMPDIR variable is
    # recognized by the tempfile module to override the default system temp directory.
    tmp_dir = os.path.join(target_dir, str(uuid.uuid4()))
    while os.path.isdir(tmp_dir):
        tmp_dir = os.path.join(target_dir, str(uuid.uuid4()))
    os.mkdir(tmp_dir)
    env["TMPDIR"] = tmp_dir
    metastore_dir = os.path.join(tmp_dir, str(uuid.uuid4()))
    while os.path.isdir(metastore_dir):
        metastore_dir = os.path.join(metastore_dir, str(uuid.uuid4()))
    os.mkdir(metastore_dir)

    # Also override the JVM's temp directory by setting driver and executor options.
    java_options = "-Djava.io.tmpdir={0}".format(tmp_dir)
    java_options = java_options + " -Dio.netty.tryReflectionSetAccessible=true -Xss4M"
    spark_args = [
        "--conf", "spark.driver.extraJavaOptions='{0}'".format(java_options),
        "--conf", "spark.executor.extraJavaOptions='{0}'".format(java_options),
        "--conf", "spark.sql.warehouse.dir='{0}'".format(metastore_dir),
        "pyspark-shell"
    ]
    env["PYSPARK_SUBMIT_ARGS"] = " ".join(spark_args)

    output_prefix = get_valid_filename(pyspark_python + "__" + test_name +
                                       "__").lstrip("_")
    per_test_output = tempfile.NamedTemporaryFile(prefix=output_prefix,
                                                  suffix=".log")
    LOGGER.info("Starting test(%s): %s (temp output: %s)", pyspark_python,
                test_name, per_test_output.name)
    start_time = time.time()
    try:
        retcode = subprocess.Popen([os.path.join(SPARK_HOME, "bin/pyspark")] +
                                   test_name.split(),
                                   stderr=per_test_output,
                                   stdout=per_test_output,
                                   env=env).wait()
        # There exists a race condition in Python and it causes flakiness in MacOS
        # https://github.com/python/cpython/issues/73885
        if platform.system() == "Darwin":
            os.system("rm -rf " + tmp_dir)
        else:
            shutil.rmtree(tmp_dir, ignore_errors=True)
    except BaseException:
        LOGGER.exception("Got exception while running %s with %s", test_name,
                         pyspark_python)
        # Here, we use os._exit() instead of sys.exit() in order to force Python to exit even if
        # this code is invoked from a thread other than the main thread.
        os._exit(1)
    duration = time.time() - start_time
    # Exit on the first failure.
    if retcode != 0:
        try:
            with FAILURE_REPORTING_LOCK:
                with open(LOG_FILE, 'ab') as log_file:
                    per_test_output.seek(0)
                    log_file.writelines(per_test_output)
                per_test_output.seek(0)
                for line in per_test_output:
                    decoded_line = line.decode("utf-8", "replace")
                    if not re.match('[0-9]+', decoded_line):
                        print(decoded_line, end='')
                per_test_output.close()
        except BaseException:
            LOGGER.exception(
                "Got an exception while trying to print failed test output")
        finally:
            print_red("\nHad test failures in %s with %s; see logs." %
                      (test_name, pyspark_python))
            # Here, we use os._exit() instead of sys.exit() in order to force Python to exit even if
            # this code is invoked from a thread other than the main thread.
            os._exit(-1)
    else:
        skipped_counts = 0
        try:
            per_test_output.seek(0)
            # This expects skipped test output from unittest when the verbosity
            # level is 2 (or the --verbose option is enabled).
            decoded_lines = map(lambda line: line.decode("utf-8", "replace"),
                                iter(per_test_output))
            skipped_tests = list(
                filter(
                    lambda line: re.search(
                        r'test_.* \(pyspark\..*\) ... (skip|SKIP)', line),
                    decoded_lines))
            skipped_counts = len(skipped_tests)
            if skipped_counts > 0:
                key = (pyspark_python, test_name)
                assert SKIPPED_TESTS is not None
                SKIPPED_TESTS[key] = skipped_tests
            per_test_output.close()
        except BaseException:
            import traceback
            print_red("\nGot an exception while trying to store "
                      "skipped test output:\n%s" % traceback.format_exc())
            # Here, we use os._exit() instead of sys.exit() in order to force Python to exit even if
            # this code is invoked from a thread other than the main thread.
            os._exit(-1)
        if skipped_counts != 0:
            LOGGER.info(
                "Finished test(%s): %s (%is) ... %s tests were skipped",
                pyspark_python, test_name, duration, skipped_counts)
        else:
            LOGGER.info("Finished test(%s): %s (%is)", pyspark_python,
                        test_name, duration)
Example #21
def main():
    opts = parse_opts()
    # Ensure the user home directory (HOME) is valid and is an absolute directory
    if not USER_HOME or not os.path.isabs(USER_HOME):
        print(
            "[error] Cannot determine your home directory as an absolute path;",
            " ensure the $HOME environment variable is set properly.")
        sys.exit(1)

    os.chdir(SPARK_HOME)

    rm_r(os.path.join(SPARK_HOME, "work"))
    rm_r(os.path.join(USER_HOME, ".ivy2", "local", "org.apache.spark"))
    rm_r(os.path.join(USER_HOME, ".ivy2", "cache", "org.apache.spark"))

    os.environ["CURRENT_BLOCK"] = str(ERROR_CODES["BLOCK_GENERAL"])

    java_exe = determine_java_executable()

    if not java_exe:
        print("[error] Cannot find a version of `java` on the system; please",
              " install one and retry.")
        sys.exit(2)

    # Install SparkR
    should_only_test_modules = opts.modules is not None
    test_modules = []
    if should_only_test_modules:
        str_test_modules = [m.strip() for m in opts.modules.split(",")]
        test_modules = [
            m for m in modules.all_modules if m.name in str_test_modules
        ]

    if not should_only_test_modules or modules.sparkr in test_modules:
        # If test modules are specified, we will not run the R linter.
        # SparkR needs a manual SparkR installation.
        if which("R"):
            run_cmd([os.path.join(SPARK_HOME, "R", "install-dev.sh")])
        else:
            print("Cannot install SparkR as R was not found in PATH")

    if os.environ.get("AMPLAB_JENKINS"):
        # if we're on the Amplab Jenkins build servers setup variables
        # to reflect the environment settings
        build_tool = os.environ.get("AMPLAB_JENKINS_BUILD_TOOL", "sbt")
        scala_version = os.environ.get("AMPLAB_JENKINS_BUILD_SCALA_PROFILE")
        hadoop_version = os.environ.get("AMPLAB_JENKINS_BUILD_PROFILE",
                                        "hadoop3.2")
        hive_version = os.environ.get("AMPLAB_JENKINS_BUILD_HIVE_PROFILE",
                                      "hive2.3")
        test_env = "amplab_jenkins"
        # add path for Python3 in Jenkins if we're calling from a Jenkins machine
        # TODO(sknapp):  after all builds are ported to the ubuntu workers, change this to be:
        # /home/jenkins/anaconda2/envs/py36/bin
        os.environ["PATH"] = "/home/anaconda/envs/py36/bin:" + os.environ.get(
            "PATH")
    else:
        # else we're running locally or GitHub Actions.
        build_tool = "sbt"
        scala_version = os.environ.get("SCALA_PROFILE")
        hadoop_version = os.environ.get("HADOOP_PROFILE", "hadoop3.2")
        hive_version = os.environ.get("HIVE_PROFILE", "hive2.3")
        if "GITHUB_ACTIONS" in os.environ:
            test_env = "github_actions"
        else:
            test_env = "local"

    extra_profiles = get_hadoop_profiles(hadoop_version) + get_hive_profiles(hive_version) + \
        get_scala_profiles(scala_version)

    print("[info] Using build tool", build_tool, "with profiles",
          *(extra_profiles + ["under environment", test_env]))

    changed_modules = []
    changed_files = []
    included_tags = []
    excluded_tags = []
    if should_only_test_modules:
        # We're likely in the forked repository
        is_apache_spark_ref = os.environ.get("APACHE_SPARK_REF", "") != ""
        # We're likely in the main repo build.
        is_github_prev_sha = os.environ.get("GITHUB_PREV_SHA", "") != ""
        # Otherwise, we're in either a periodic job in GitHub Actions or somewhere else.

        # If we're running the tests in GitHub Actions, attempt to detect and test
        # only the affected modules.
        if test_env == "github_actions" and (is_apache_spark_ref
                                             or is_github_prev_sha):
            if is_apache_spark_ref:
                changed_files = identify_changed_files_from_git_commits(
                    "HEAD", target_ref=os.environ["APACHE_SPARK_REF"])
            elif is_github_prev_sha:
                changed_files = identify_changed_files_from_git_commits(
                    os.environ["GITHUB_SHA"],
                    target_ref=os.environ["GITHUB_PREV_SHA"])

            modules_to_test = determine_modules_to_test(
                determine_modules_for_files(changed_files), deduplicated=False)

            if modules.root not in modules_to_test:
                # If root module is not found, only test the intersected modules.
                # If root module is found, just run the modules as specified initially.
                test_modules = list(
                    set(modules_to_test).intersection(test_modules))

        changed_modules = test_modules
        if len(changed_modules) == 0:
            print(
                "[info] There are no modules to test, exiting without testing."
            )
            return

    # If we're running the tests in AMPLab Jenkins, calculate the diff from the targeted branch, and
    # detect modules to test.
    elif test_env == "amplab_jenkins" and os.environ.get("AMP_JENKINS_PRB"):
        target_branch = os.environ["ghprbTargetBranch"]
        changed_files = identify_changed_files_from_git_commits(
            "HEAD", target_branch=target_branch)
        changed_modules = determine_modules_for_files(changed_files)
        test_modules = determine_modules_to_test(changed_modules)
        excluded_tags = determine_tags_to_exclude(changed_modules)

    # If no changed modules are found, test everything.
    if not changed_modules:
        changed_modules = [modules.root]
    if not test_modules:
        test_modules = determine_modules_to_test(changed_modules)

    if opts.excluded_tags:
        excluded_tags.extend(
            [t.strip() for t in opts.excluded_tags.split(",")])
    if opts.included_tags:
        included_tags.extend(
            [t.strip() for t in opts.included_tags.split(",")])

    print("[info] Found the following changed modules:",
          ", ".join(x.name for x in changed_modules))

    # setup environment variables
    # note - the 'root' module doesn't collect environment variables for all modules. Because the
    # environment variables should not be set if a module is not changed, even if running the 'root'
    # module. So here we should use changed_modules rather than test_modules.
    test_environ = {}
    for m in changed_modules:
        test_environ.update(m.environ)
    setup_test_environ(test_environ)

    if scala_version is not None:
        # If not set, assume this is default and doesn't need to change.
        switch_scala_version(scala_version)

    should_run_java_style_checks = False
    if not should_only_test_modules:
        # license checks
        run_apache_rat_checks()

        # style checks
        if not changed_files or any(
                f.endswith(".scala") or f.endswith("scalastyle-config.xml")
                for f in changed_files):
            run_scala_style_checks(extra_profiles)
        if not changed_files or any(
                f.endswith(".java") or f.endswith("checkstyle.xml")
                or f.endswith("checkstyle-suppressions.xml")
                for f in changed_files):
            # Run SBT Checkstyle after the build to prevent a side-effect to the build.
            should_run_java_style_checks = True
        if not changed_files or any(
                f.endswith("lint-python") or f.endswith("tox.ini")
                or f.endswith(".py") for f in changed_files):
            run_python_style_checks()
        if not changed_files or any(
                f.endswith(".R") or f.endswith("lint-r")
                or f.endswith(".lintr") for f in changed_files):
            run_sparkr_style_checks()

    # determine if docs were changed and if we're inside the amplab environment
    # note - the below commented out until *all* Jenkins workers can get the Bundler gem installed
    # if "DOCS" in changed_modules and test_env == "amplab_jenkins":
    #    build_spark_documentation()

    if any(m.should_run_build_tests
           for m in test_modules) and test_env != "amplab_jenkins":
        run_build_tests()

    # spark build
    build_apache_spark(build_tool, extra_profiles)

    # backwards compatibility checks
    if build_tool == "sbt":
        # Note: compatibility tests only supported in sbt for now
        if not os.environ.get("SKIP_MIMA"):
            detect_binary_inop_with_mima(extra_profiles)
        # Since we did not build assembly/package before running dev/mima, we need to
        # do it here because the tests still rely on it; see SPARK-13294 for details.
        build_spark_assembly_sbt(extra_profiles, should_run_java_style_checks)

    # run the test suites
    run_scala_tests(build_tool, extra_profiles, test_modules, excluded_tags,
                    included_tags)

    modules_with_python_tests = [
        m for m in test_modules if m.python_test_goals
    ]
    if modules_with_python_tests:
        # We only run PySpark tests with coverage report in one specific job with
        # Spark master with SBT in Jenkins.
        is_sbt_master_job = "SPARK_MASTER_SBT_HADOOP_2_7" in os.environ
        run_python_tests(modules_with_python_tests,
                         opts.parallelism,
                         with_coverage=is_sbt_master_job)
        run_python_packaging_tests()
    if any(m.should_run_r_tests for m in test_modules):
        run_sparkr_tests()
Example #22
def main():
    opts = parse_opts()
    # Ensure the user home directory (HOME) is valid and is an absolute directory
    if not USER_HOME or not os.path.isabs(USER_HOME):
        print("[error] Cannot determine your home directory as an absolute path;",
              " ensure the $HOME environment variable is set properly.")
        sys.exit(1)

    os.chdir(SPARK_HOME)

    rm_r(os.path.join(SPARK_HOME, "work"))
    rm_r(os.path.join(USER_HOME, ".ivy2", "local", "org.apache.spark"))
    rm_r(os.path.join(USER_HOME, ".ivy2", "cache", "org.apache.spark"))

    os.environ["CURRENT_BLOCK"] = str(ERROR_CODES["BLOCK_GENERAL"])

    java_exe = determine_java_executable()

    if not java_exe:
        print("[error] Cannot find a version of `java` on the system; please",
              " install one and retry.")
        sys.exit(2)

    java_version = determine_java_version(java_exe)

    # install SparkR
    if which("R"):
        run_cmd([os.path.join(SPARK_HOME, "R", "install-dev.sh")])
    else:
        print("Cannot install SparkR as R was not found in PATH")

    if os.environ.get("AMPLAB_JENKINS"):
        # if we're on the Amplab Jenkins build servers setup variables
        # to reflect the environment settings
        build_tool = os.environ.get("AMPLAB_JENKINS_BUILD_TOOL", "sbt")
        hadoop_version = os.environ.get("AMPLAB_JENKINS_BUILD_PROFILE", "hadoop2.6")
        test_env = "amplab_jenkins"
        # add path for Python3 in Jenkins if we're calling from a Jenkins machine
        os.environ["PATH"] = "/home/anaconda/envs/py3k/bin:" + os.environ.get("PATH")
    else:
        # else we're running locally and can use local settings
        build_tool = "sbt"
        hadoop_version = os.environ.get("HADOOP_PROFILE", "hadoop2.6")
        test_env = "local"

    print("[info] Using build tool", build_tool, "with Hadoop profile", hadoop_version,
          "under environment", test_env)

    changed_modules = None
    changed_files = None
    if test_env == "amplab_jenkins" and os.environ.get("AMP_JENKINS_PRB"):
        target_branch = os.environ["ghprbTargetBranch"]
        changed_files = identify_changed_files_from_git_commits("HEAD", target_branch=target_branch)
        changed_modules = determine_modules_for_files(changed_files)
        excluded_tags = determine_tags_to_exclude(changed_modules)
    if not changed_modules:
        changed_modules = [modules.root]
        excluded_tags = []
    print("[info] Found the following changed modules:",
          ", ".join(x.name for x in changed_modules))

    # setup environment variables
    # note - the 'root' module doesn't collect environment variables for all modules. Because the
    # environment variables should not be set if a module is not changed, even if running the 'root'
    # module. So here we should use changed_modules rather than test_modules.
    test_environ = {}
    for m in changed_modules:
        test_environ.update(m.environ)
    setup_test_environ(test_environ)

    test_modules = determine_modules_to_test(changed_modules)

    # license checks
    run_apache_rat_checks()

    # style checks
    if not changed_files or any(f.endswith(".scala")
                                or f.endswith("scalastyle-config.xml")
                                for f in changed_files):
        run_scala_style_checks()
    if not changed_files or any(f.endswith(".java")
                                or f.endswith("checkstyle.xml")
                                or f.endswith("checkstyle-suppressions.xml")
                                for f in changed_files):
        # run_java_style_checks()
        pass
    if not changed_files or any(f.endswith(".py") for f in changed_files):
        run_python_style_checks()
    if not changed_files or any(f.endswith(".R") for f in changed_files):
        run_sparkr_style_checks()

    # determine if docs were changed and if we're inside the amplab environment
    # note - the below commented out until *all* Jenkins workers can get `jekyll` installed
    # if "DOCS" in changed_modules and test_env == "amplab_jenkins":
    #    build_spark_documentation()

    if any(m.should_run_build_tests for m in test_modules):
        run_build_tests()

    # spark build
    build_apache_spark(build_tool, hadoop_version)

    # backwards compatibility checks
    if build_tool == "sbt":
        # Note: compatibility tests only supported in sbt for now
        detect_binary_inop_with_mima(hadoop_version)
        # Since we did not build assembly/package before running dev/mima, we need to
        # do it here because the tests still rely on it; see SPARK-13294 for details.
        build_spark_assembly_sbt(hadoop_version)

    # run the test suites
    run_scala_tests(build_tool, hadoop_version, test_modules, excluded_tags)

    modules_with_python_tests = [m for m in test_modules if m.python_test_goals]
    if modules_with_python_tests:
        run_python_tests(modules_with_python_tests, opts.parallelism)
        run_python_packaging_tests()
    if any(m.should_run_r_tests for m in test_modules):
        run_sparkr_tests()
Example #23
def main():
    opts = parse_opts()
    # Ensure the user home directory (HOME) is valid and is an absolute directory
    if not USER_HOME or not os.path.isabs(USER_HOME):
        print("[error] Cannot determine your home directory as an absolute path;",
              " ensure the $HOME environment variable is set properly.")
        sys.exit(1)

    os.chdir(SPARK_HOME)

    rm_r(os.path.join(SPARK_HOME, "work"))
    rm_r(os.path.join(USER_HOME, ".ivy2", "local", "org.apache.spark"))
    rm_r(os.path.join(USER_HOME, ".ivy2", "cache", "org.apache.spark"))

    os.environ["CURRENT_BLOCK"] = str(ERROR_CODES["BLOCK_GENERAL"])

    java_exe = determine_java_executable()

    if not java_exe:
        print("[error] Cannot find a version of `java` on the system; please",
              " install one and retry.")
        sys.exit(2)

    # install SparkR
    if which("R"):
        run_cmd([os.path.join(SPARK_HOME, "R", "install-dev.sh")])
    else:
        print("Cannot install SparkR as R was not found in PATH")

    if os.environ.get("AMPLAB_JENKINS"):
        # if we're on the Amplab Jenkins build servers setup variables
        # to reflect the environment settings
        build_tool = os.environ.get("AMPLAB_JENKINS_BUILD_TOOL", "sbt")
        hadoop_version = os.environ.get("AMPLAB_JENKINS_BUILD_PROFILE", "hadoop3.2")
        hive_version = os.environ.get("AMPLAB_JENKINS_BUILD_HIVE_PROFILE", "hive2.3")
        test_env = "amplab_jenkins"
        # add path for Python3 in Jenkins if we're calling from a Jenkins machine
        # TODO(sknapp):  after all builds are ported to the ubuntu workers, change this to be:
        # /home/jenkins/anaconda2/envs/py36/bin
        os.environ["PATH"] = "/home/anaconda/envs/py36/bin:" + os.environ.get("PATH")
    else:
        # else we're running locally and can use local settings
        build_tool = "sbt"
        hadoop_version = os.environ.get("HADOOP_PROFILE", "hadoop2.7")
        hive_version = os.environ.get("HIVE_PROFILE", "hive2.3")
        test_env = "local"

    print("[info] Using build tool", build_tool, "with Hadoop profile", hadoop_version,
          "and Hive profile", hive_version, "under environment", test_env)
    extra_profiles = get_hadoop_profiles(hadoop_version) + get_hive_profiles(hive_version)

    changed_modules = None
    changed_files = None
    should_only_test_modules = "TEST_ONLY_MODULES" in os.environ
    included_tags = []
    if should_only_test_modules:
        str_test_modules = [m.strip() for m in os.environ.get("TEST_ONLY_MODULES").split(",")]
        test_modules = [m for m in modules.all_modules if m.name in str_test_modules]
        # Directly uses test_modules as changed modules to apply tags and environments
        # as if all specified test modules are changed.
        changed_modules = test_modules
        str_excluded_tags = os.environ.get("TEST_ONLY_EXCLUDED_TAGS", None)
        str_included_tags = os.environ.get("TEST_ONLY_INCLUDED_TAGS", None)
        excluded_tags = []
        if str_excluded_tags:
            excluded_tags = [t.strip() for t in str_excluded_tags.split(",")]
        included_tags = []
        if str_included_tags:
            included_tags = [t.strip() for t in str_included_tags.split(",")]
    elif test_env == "amplab_jenkins" and os.environ.get("AMP_JENKINS_PRB"):
        target_branch = os.environ["ghprbTargetBranch"]
        changed_files = identify_changed_files_from_git_commits("HEAD", target_branch=target_branch)
        changed_modules = determine_modules_for_files(changed_files)
        excluded_tags = determine_tags_to_exclude(changed_modules)

    if not changed_modules:
        changed_modules = [modules.root]
        excluded_tags = []
    print("[info] Found the following changed modules:",
          ", ".join(x.name for x in changed_modules))

    # setup environment variables
    # note - the 'root' module doesn't collect environment variables for all modules. Because the
    # environment variables should not be set if a module is not changed, even if running the 'root'
    # module. So here we should use changed_modules rather than test_modules.
    test_environ = {}
    for m in changed_modules:
        test_environ.update(m.environ)
    setup_test_environ(test_environ)

    should_run_java_style_checks = False
    if not should_only_test_modules:
        test_modules = determine_modules_to_test(changed_modules)

        # license checks
        run_apache_rat_checks()

        # style checks
        if not changed_files or any(f.endswith(".scala")
                                    or f.endswith("scalastyle-config.xml")
                                    for f in changed_files):
            run_scala_style_checks(extra_profiles)
        if not changed_files or any(f.endswith(".java")
                                    or f.endswith("checkstyle.xml")
                                    or f.endswith("checkstyle-suppressions.xml")
                                    for f in changed_files):
            # Run SBT Checkstyle after the build to prevent a side-effect to the build.
            should_run_java_style_checks = True
        if not changed_files or any(f.endswith("lint-python")
                                    or f.endswith("tox.ini")
                                    or f.endswith(".py")
                                    for f in changed_files):
            run_python_style_checks()
        if not changed_files or any(f.endswith(".R")
                                    or f.endswith("lint-r")
                                    or f.endswith(".lintr")
                                    for f in changed_files):
            run_sparkr_style_checks()

    # determine if docs were changed and if we're inside the amplab environment
    # note - the below commented out until *all* Jenkins workers can get `jekyll` installed
    # if "DOCS" in changed_modules and test_env == "amplab_jenkins":
    #    build_spark_documentation()

    if any(m.should_run_build_tests for m in test_modules) and test_env != "amplab_jenkins":
        run_build_tests()

    # spark build
    build_apache_spark(build_tool, extra_profiles)

    # backwards compatibility checks
    if build_tool == "sbt":
        # Note: compatibility tests only supported in sbt for now
        detect_binary_inop_with_mima(extra_profiles)
        # Since we did not build assembly/package before running dev/mima, we need to
        # do it here because the tests still rely on it; see SPARK-13294 for details.
        build_spark_assembly_sbt(extra_profiles, should_run_java_style_checks)

    # run the test suites
    run_scala_tests(build_tool, extra_profiles, test_modules, excluded_tags, included_tags)

    modules_with_python_tests = [m for m in test_modules if m.python_test_goals]
    if modules_with_python_tests:
        # We only run PySpark tests with coverage report in one specific job with
        # Spark master with SBT in Jenkins.
        is_sbt_master_job = "SPARK_MASTER_SBT_HADOOP_2_7" in os.environ
        run_python_tests(
            modules_with_python_tests, opts.parallelism, with_coverage=is_sbt_master_job)
        run_python_packaging_tests()
    if any(m.should_run_r_tests for m in test_modules):
        run_sparkr_tests()