Example #1
def extract_all():
    """
    Extract features using the docker image across all courses and all sessions except holdout.
    :return:
    """
    mode = "extract"
    raw_data_buckets = fetch_data_buckets_from_config()
    # clear any preexisting data for this user/job/mode
    clear_s3_subdirectory(proc_data_bucket, user_id, job_id, mode)
    # only call job_runner once with --mode=extract and --level=all; this will load ALL data and run the Docker image
    run_job(docker_url,
            mode,
            course=None,
            user=user_id,
            job_id=job_id,
            session=None,
            level="all",
            raw_data_buckets=raw_data_buckets)
    result_file = collect_all_results(s3, raw_data_buckets, proc_data_bucket,
                                      mode, user_id, job_id)
    upload_key = make_s3_key_path(user_id,
                                  job_id,
                                  mode,
                                  course=None,
                                  filename=result_file)
    upload_file_to_s3(result_file, bucket=proc_data_bucket, key=upload_key)
    os.remove(result_file)
    send_email_alert(aws_access_key_id,
                     aws_secret_access_key,
                     job_id,
                     user_id,
                     status=mode,
                     emailaddr_to=email_to)
    return
Example #2
def test_all(label_type):
    """
    Test a single overall model on the entire dataset using the Docker image.
    :param label_type: label type provided by user.
    :return:
    """
    mode = "test"  # "mode" is defined outside this snippet in the original source; made explicit here
    level = "all"
    job_config = MorfJobConfig(CONFIG_FILENAME)
    job_config.update_mode(mode)
    check_label_type(label_type)
    # clear any preexisting data for this user/job/mode
    clear_s3_subdirectory(job_config)
    run_image(job_config,
              job_config.raw_data_buckets,
              level=level,
              label_type=label_type)
    # fetch archived result file and push csv result back to s3, mimicking session- and course-level workflow
    result_file = collect_all_results(job_config)
    upload_key = make_s3_key_path(job_config,
                                  filename=generate_archive_filename(
                                      job_config, extension="csv"))
    upload_file_to_s3(result_file,
                      bucket=job_config.proc_data_bucket,
                      key=upload_key)
    os.remove(result_file)
    send_email_alert(job_config)
    return
Example #3
def run_morf_job(client_config_url, server_config_url, email_to = None, no_cache = False):
    """
    Wrapper function to run complete MORF job.
    :param client_config_url: url to client.config file; should be located on local machine.
    :param server_config_url: url (local or s3) to server.config file.
    :return:
    """
    controller_script_name = "controller.py"
    docker_image_name = "docker_image"
    server_config_path = urlparse(server_config_url).path
    # read server.config and get those properties
    server_config = get_config_properties(server_config_path)
    # create temporary directory in local_working_directory from server.config
    with tempfile.TemporaryDirectory(dir=server_config["local_working_directory"]) as working_dir:
        # save calling working directory; change directory into working_dir
        calling_dir = os.getcwd()
        os.chdir(working_dir)
        # download client.config into local_working_directory using AWS creds from server.config
        s3 = boto3.client("s3", aws_access_key_id=server_config["aws_access_key_id"],
                          aws_secret_access_key=server_config["aws_secret_access_key"])
        fetch_file(s3, working_dir, client_config_url)
        local_client_config_path = os.path.join(os.getcwd(), "client.config")
        combine_config_files(server_config_path, local_client_config_path)
        config = get_config_properties()
        if email_to: # if provided, email_to overrides the address in the config file -- lets users easily run an MWE
            print("[INFO] email address from submission {} overriding email address in config file {}"
                  .format(email_to, config["email_to"]))
            config["email_to"] = email_to
            update_config_fields_in_section("client", email_to = email_to)
        cache_job_file_in_s3(s3, config["user_id"], config["job_id"], config["proc_data_bucket"])
        # from client.config, fetch and download the following: docker image, controller script
        try:
            fetch_file(s3, working_dir, config["docker_url"], dest_filename = docker_image_name)
            fetch_file(s3, working_dir, config["controller_url"], dest_filename = controller_script_name)
            if not no_cache: # cache job files in s3 unless no_cache parameter set to true
                cache_job_file_in_s3(s3, config["user_id"], config["job_id"], config["proc_data_bucket"],
                                     docker_image_name)
                cache_job_file_in_s3(s3, config["user_id"], config["job_id"], config["proc_data_bucket"],
                                     controller_script_name)
        except KeyError as e:
            cause = e.args[0]
            print("[Error]: field {} missing from client.config file.".format(cause))
            sys.exit(-1)
        # change working directory and run controller script with notifications for initialization and completion
        send_email_alert(config["aws_access_key_id"],
                         config["aws_secret_access_key"],
                         config["job_id"],
                         config["user_id"],
                         status = "INITIALIZED",
                         emailaddr_to=config["email_to"])
        subprocess.call("python3 {}".format(controller_script_name), shell = True)
        send_success_email(config["aws_access_key_id"],
                           config["aws_secret_access_key"],
                           config["proc_data_bucket"],
                           config["job_id"], config["user_id"], config["email_to"])
        return
Example #4
def run_morf_job(job_config, no_cache=False):
    """
    Wrapper function to run complete MORF job.
    :param job_config: MorfJobConfig object for the job.
    :param no_cache: if True, do not cache the Docker image and controller script in s3.
    :return:
    """
    combined_config_filename = "config.properties"
    logger = set_logger_handlers(module_logger, job_config)
    logger.info("running job id: {}".format(job_config.morf_id))
    controller_script_name = "controller.py"
    docker_image_name = "docker_image"
    s3 = job_config.initialize_s3()
    # create temporary directory in local_working_directory from server.config
    with tempfile.TemporaryDirectory(
            dir=job_config.local_working_directory) as working_dir:
        # copy config file into new directory
        shutil.copy(combined_config_filename, working_dir)
        os.chdir(working_dir)
        # from job_config, fetch and download the following: docker image, controller script, cached config file
        update_morf_job_cache(job_config)
        # from client.config, fetch and download the following: docker image, controller script
        try:
            fetch_file(s3,
                       working_dir,
                       job_config.docker_url,
                       dest_filename=docker_image_name,
                       job_config=job_config)
            fetch_file(s3,
                       working_dir,
                       job_config.controller_url,
                       dest_filename=controller_script_name,
                       job_config=job_config)
            if not no_cache:  # cache job files in s3 unless no_cache parameter set to true
                cache_job_file_in_s3(job_config, filename=docker_image_name)
                cache_job_file_in_s3(job_config,
                                     filename=controller_script_name)
        except KeyError as e:
            cause = e.args[0]
            logger.error(
                "[Error]: field {} missing from client.config file.".format(
                    cause))
            sys.exit(-1)
        # change working directory and run controller script with notifications for initialization and completion
        job_config.update_status("INITIALIZED")
        send_email_alert(job_config)
        subprocess.call("python3 {}".format(controller_script_name),
                        shell=True)
        job_config.update_status("SUCCESS")
        send_success_email(job_config)
        return
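A minimal driver sketch for the job_config-based run_morf_job above. The import path for MorfJobConfig and the config filename are assumptions; adjust them to your MORF installation, which should keep the combined config.properties in the directory the driver is launched from.

# Hypothetical driver; the import path and config filename below are assumptions.
from morf.utils.config import MorfJobConfig

job_config = MorfJobConfig("config.properties")  # parse the combined server/client config
run_morf_job(job_config, no_cache=False)         # fetch Docker image and controller script, then execute the job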
Example #5
def extract_holdout_course(raw_data_dir="morf-data/", multithread=True):
    """
    Extract features using the Docker image across each course of holdout data.
    :return:
    """
    mode = "extract-holdout"
    level = "course"
    job_config = MorfJobConfig(CONFIG_FILENAME)
    job_config.update_mode(mode)
    logger = set_logger_handlers(module_logger, job_config)
    # clear any preexisting data for this user/job/mode
    clear_s3_subdirectory(job_config)
    if multithread:
        num_cores = job_config.max_num_cores
    else:
        num_cores = 1
    # call job_runner once per course with --mode=extract-holdout and --level=course
    for raw_data_bucket in job_config.raw_data_buckets:
        logger.info("processing bucket {}".format(raw_data_bucket))
        courses = fetch_courses(job_config, raw_data_bucket, raw_data_dir)
        reslist = []
        with Pool(num_cores) as pool:
            for course in courses:
                holdout_session = fetch_sessions(
                    job_config,
                    raw_data_bucket,
                    raw_data_dir,
                    course,
                    fetch_holdout_session_only=True)[
                        0]  # only use holdout run; unlisted
                poolres = pool.apply_async(run_image, [
                    job_config, raw_data_bucket, course, holdout_session,
                    level, None
                ])
                reslist.append(poolres)
            pool.close()
            pool.join()
        for res in reslist:
            logger.info(res.get())
    result_file = collect_course_results(job_config)
    upload_key = make_s3_key_path(job_config, filename=result_file)
    upload_file_to_s3(result_file,
                      bucket=job_config.proc_data_bucket,
                      key=upload_key)
    os.remove(result_file)
    send_email_alert(job_config)
    return
Example #6
def test_course(label_type, raw_data_dir="morf-data/", multithread=True):
    """
    Test one model per course using the Docker image.
    :param label_type: label type provided by user.
    :param raw_data_dir: path to directory in all data buckets where course-level directories are located; this should be uniform for every raw data bucket.
    :param multithread: whether to run the job in parallel (multithread=False can be useful for debugging).
    :return:
    """
    mode = "test"  # "mode" is defined outside this snippet in the original source; made explicit here
    level = "course"
    job_config = MorfJobConfig(CONFIG_FILENAME)
    job_config.update_mode(mode)
    logger = set_logger_handlers(module_logger, job_config)
    check_label_type(label_type)
    # clear any preexisting data for this user/job/mode
    clear_s3_subdirectory(job_config)
    if multithread:
        num_cores = job_config.max_num_cores
    else:
        num_cores = 1
    ## for each bucket, call job_runner once per course with --mode=test and --level=course
    for raw_data_bucket in job_config.raw_data_buckets:
        logger.info("[INFO] processing bucket {}".format(raw_data_bucket))
        courses = fetch_complete_courses(job_config, raw_data_bucket,
                                         raw_data_dir)
        reslist = []
        with Pool(num_cores) as pool:
            for course in courses:
                poolres = pool.apply_async(run_image, [
                    job_config, raw_data_bucket, course, None, level,
                    label_type
                ])
                reslist.append(poolres)
            pool.close()
            pool.join()
        for res in reslist:
            logger.info(res.get())
    result_file = collect_course_results(job_config)
    upload_key = make_s3_key_path(job_config,
                                  filename=generate_archive_filename(
                                      job_config, extension="csv"))
    upload_file_to_s3(result_file,
                      bucket=job_config.proc_data_bucket,
                      key=upload_key)
    os.remove(result_file)
    send_email_alert(job_config)
    return
Example #7
def train_all(label_type):
    """
    Train a single overall model on the entire dataset using the Docker image.
    :param label_type: label type provided by user.
    :return: None
    """
    level = "all"
    job_config = MorfJobConfig(CONFIG_FILENAME)
    job_config.update_mode("train")
    check_label_type(label_type)
    # clear any preexisting data for this user/job/mode
    clear_s3_subdirectory(job_config)
    run_image(job_config,
              raw_data_bucket=job_config.raw_data_buckets,
              level=level,
              label_type=label_type)
    send_email_alert(job_config)
    return
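Taken together, extract_all, train_all, and test_all cover an "all"-level workflow. A sketch of a controller script that chains them, assuming the import paths shown below and using "dropout" as an illustrative label_type (both are assumptions, not part of the examples above):

# Hypothetical controller.py for an "all"-level job; import paths and label_type are assumptions.
from morf.workflow.extract import extract_all
from morf.workflow.train import train_all
from morf.workflow.test import test_all

extract_all()                    # one feature set over every course/session except holdout
train_all(label_type="dropout")  # a single overall model trained on those features
test_all(label_type="dropout")   # apply that model; a full job would also run an extract-holdout step beforehand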
Example #8
def test_course(raw_data_dir="morf-data/"):
    """
    Test one model per course using the Docker image.
    :return:
    """
    raw_data_buckets = fetch_data_buckets_from_config()
    # clear any preexisting data for this user/job/mode
    clear_s3_subdirectory(proc_data_bucket, user_id, job_id, mode)
    ## for each bucket, call job_runner once per course with --mode=test and --level=course
    for raw_data_bucket in raw_data_buckets:
        print("[INFO] processing bucket {}".format(raw_data_bucket))
        with Pool() as pool:
            for course in fetch_complete_courses(s3,
                                                 raw_data_bucket,
                                                 raw_data_dir,
                                                 n_train=1):
                pool.apply_async(run_job, [
                    docker_url, mode, course, user_id, job_id, None, "course",
                    raw_data_bucket
                ])
            pool.close()
            pool.join()
    result_file = collect_course_results(s3, raw_data_buckets,
                                         proc_data_bucket, mode, user_id,
                                         job_id)
    upload_key = make_s3_key_path(user_id,
                                  job_id,
                                  mode,
                                  course=None,
                                  filename=generate_archive_filename(
                                      user_id=user_id,
                                      job_id=job_id,
                                      mode=mode,
                                      extension="csv"))
    upload_file_to_s3(result_file, bucket=proc_data_bucket, key=upload_key)
    os.remove(result_file)
    send_email_alert(aws_access_key_id,
                     aws_secret_access_key,
                     job_id,
                     user_id,
                     status=mode,
                     emailaddr_to=email_to)
    return
Example #9
def extract_all():
    """
    Extract features using the docker image across all courses and all sessions except holdout.
    :return:
    """
    mode = "extract"
    level = "all"
    job_config = MorfJobConfig(CONFIG_FILENAME)
    job_config.update_mode(mode)
    # clear any preexisting data for this user/job/mode
    clear_s3_subdirectory(job_config)
    # only call job_runner once with --mode=extract and --level=all; this will load ALL data and run the Docker image
    run_image(job_config, job_config.raw_data_buckets, level=level)
    result_file = collect_all_results(job_config)
    upload_key = make_s3_key_path(job_config, filename=result_file)
    upload_file_to_s3(result_file,
                      bucket=job_config.proc_data_bucket,
                      key=upload_key)
    os.remove(result_file)
    send_email_alert(job_config)
    return
Example #10
def train_session(label_type, raw_data_dir="morf-data/", multithread=True):
    """
    Train one model per session of the course using the Docker image.
    :param label_type: label type provided by user.
    :param raw_data_dir: path to directory in all data buckets where course-level directories are located; this should be uniform for every raw data bucket.
    :param multithread: whether to run the job in parallel (multithread=False can be useful for debugging).
    :return: None
    """
    mode = "train"  # "mode" is defined outside this snippet in the original source; made explicit here
    level = "session"
    job_config = MorfJobConfig(CONFIG_FILENAME)
    job_config.update_mode(mode)
    logger = set_logger_handlers(module_logger, job_config)
    check_label_type(label_type)
    # clear any preexisting data for this user/job/mode
    clear_s3_subdirectory(job_config)
    if multithread:
        num_cores = job_config.max_num_cores
    else:
        num_cores = 1
    # for each bucket, call job_runner once per session with --mode=train and --level=session
    for raw_data_bucket in job_config.raw_data_buckets:
        logger.info("processing bucket {}".format(raw_data_bucket))
        courses = fetch_complete_courses(job_config, raw_data_bucket,
                                         raw_data_dir)
        reslist = []
        with Pool(num_cores) as pool:
            for course in courses:
                for session in fetch_sessions(job_config, raw_data_bucket,
                                              raw_data_dir, course):
                    poolres = pool.apply_async(run_image, [
                        job_config, raw_data_bucket, course, session, level,
                        label_type
                    ])
                    reslist.append(poolres)
            pool.close()
            pool.join()
        for res in reslist:
            logger.info(res.get())
    send_email_alert(job_config)
    return
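The course- and session-level functions above all share the same multiprocessing idiom: submit one run_image call per unit of work with apply_async, close and join the pool, then call .get() on each result so that any exception raised in a worker is re-raised in the parent. A stand-alone sketch of that idiom, with a trivial placeholder standing in for run_image:

# Stand-alone illustration of the Pool/apply_async pattern used above;
# run_one is a placeholder for run_image(job_config, bucket, course, session, level, ...).
from multiprocessing import Pool

def run_one(course, session):
    return "finished {} {}".format(course, session)

if __name__ == "__main__":
    tasks = [("course-a", "001"), ("course-a", "002"), ("course-b", "001")]
    reslist = []
    with Pool(2) as pool:
        for course, session in tasks:
            reslist.append(pool.apply_async(run_one, [course, session]))
        pool.close()
        pool.join()
    for res in reslist:
        print(res.get())  # re-raises any worker exception here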
Example #11
def test_all():
    """
    Test a single overall model on the entire dataset using the Docker image.
    :return:
    """
    raw_data_buckets = fetch_data_buckets_from_config()
    # clear any preexisting data for this user/job/mode
    clear_s3_subdirectory(proc_data_bucket, user_id, job_id, mode)
    run_job(docker_url,
            mode,
            None,
            user_id,
            job_id,
            None,
            "all",
            None,
            raw_data_buckets=raw_data_buckets)
    # fetch archived result file and push csv result back to s3, mimicking session- and course-level workflow
    result_file = collect_all_results(s3, proc_data_bucket, mode, user_id,
                                      job_id)
    upload_key = make_s3_key_path(user_id,
                                  job_id,
                                  mode,
                                  course=None,
                                  filename=generate_archive_filename(
                                      user_id=user_id,
                                      job_id=job_id,
                                      mode=mode,
                                      extension="csv"))
    upload_file_to_s3(result_file, bucket=proc_data_bucket, key=upload_key)
    os.remove(result_file)
    send_email_alert(aws_access_key_id,
                     aws_secret_access_key,
                     job_id,
                     user_id,
                     status=mode,
                     emailaddr_to=email_to)
    return
Example #12
def extract_course(raw_data_dir="morf-data/"):
    """
    Extract features using the Docker image, building individual feature sets for each course.
    :return:
    """
    mode = "extract"
    raw_data_buckets = fetch_data_buckets_from_config()
    # clear any preexisting data for this user/job/mode
    clear_s3_subdirectory(proc_data_bucket, user_id, job_id, mode)
    # call job_runner once per course with --mode=extract and --level=course
    for raw_data_bucket in raw_data_buckets:
        print("[INFO] processing bucket {}".format(raw_data_bucket))
        with Pool() as pool:
            for course in fetch_courses(s3, raw_data_bucket, raw_data_dir):
                pool.apply_async(run_job, [
                    docker_url, mode, course, user_id, job_id, None, "course",
                    raw_data_bucket
                ])
            pool.close()
            pool.join()
    result_file = collect_course_results(s3, raw_data_buckets,
                                         proc_data_bucket, mode, user_id,
                                         job_id)
    upload_key = make_s3_key_path(user_id,
                                  job_id,
                                  mode,
                                  course=None,
                                  filename=result_file)
    upload_file_to_s3(result_file, bucket=proc_data_bucket, key=upload_key)
    os.remove(result_file)
    send_email_alert(aws_access_key_id,
                     aws_secret_access_key,
                     job_id,
                     user_id,
                     status=mode,
                     emailaddr_to=email_to)
    return
Example #13
def run_morf_job(job_config, no_cache=False, no_morf_cache=False):
    """
    Wrapper function to run complete MORF job.
    :param job_config: MorfJobConfig object
    :param no_cache: boolean; if True, the Docker image and controller script are not cached in s3
    :param no_morf_cache: boolean; if True, the local raw data cache is not refreshed
    :return:
    """
    combined_config_filename = "config.properties"
    logger = set_logger_handlers(module_logger, job_config)
    logger.info("running job id: {}".format(job_config.morf_id))
    controller_script_name = "controller.py"
    docker_image_name = "docker_image"
    s3 = job_config.initialize_s3()
    # create temporary directory in local_working_directory from server.config
    with tempfile.TemporaryDirectory(
            dir=job_config.local_working_directory) as working_dir:
        # copy config file into new directory
        shutil.copy(combined_config_filename, working_dir)
        os.chdir(working_dir)
        # from job_config, fetch and download the following: docker image, controller script, cached config file
        if not no_morf_cache:
            update_raw_data_cache(job_config)
        # from client.config, fetch and download the following: docker image, controller script
        try:
            fetch_file(s3,
                       working_dir,
                       job_config.docker_url,
                       dest_filename=docker_image_name,
                       job_config=job_config)
            fetch_file(s3,
                       working_dir,
                       job_config.controller_url,
                       dest_filename=controller_script_name,
                       job_config=job_config)
            if not no_cache:  # cache job files in s3 unless no_cache parameter set to true
                cache_job_file_in_s3(job_config, filename=docker_image_name)
                cache_job_file_in_s3(job_config,
                                     filename=controller_script_name)
        except KeyError as e:
            cause = e.args[0]
            logger.error(
                "[Error]: field {} missing from client.config file.".format(
                    cause))
            sys.exit(-1)
        # change working directory and run controller script with notifications for initialization and completion
        job_config.update_status("INITIALIZED")
        send_email_alert(job_config)
        subprocess.call("python3 {}".format(controller_script_name),
                        shell=True)
        job_config.update_status("SUCCESS")
        # push the image to Docker Hub, create a DOI for the job files in Zenodo, and send the success email
        docker_cloud_path = cache_to_docker_hub(job_config, working_dir,
                                                docker_image_name)
        setattr(job_config, "docker_cloud_path", docker_cloud_path)
        zenodo_deposition_id = upload_files_to_zenodo(
            job_config,
            upload_files=(job_config.controller_url,
                          job_config.client_config_url))
        setattr(job_config, "zenodo_deposition_id", zenodo_deposition_id)
        send_success_email(job_config)
        return
Example #14
def extract_holdout_session(labels=False,
                            raw_data_dir="morf-data/",
                            label_type="labels-train",
                            multithread=True):
    """
    Extract features using the Docker image across each session of holdout data.
    :param labels: flag for whether this is a job to generate output labels; if so, the collected result file is copied back into the raw data folder in s3 (as labels-test.csv).
    :return: None
    """
    mode = "extract-holdout"
    level = "session"
    job_config = MorfJobConfig(CONFIG_FILENAME)
    job_config.update_mode(mode)
    logger = set_logger_handlers(module_logger, job_config)
    # call job_runner once per session with --mode=extract-holdout and --level=session
    # clear any preexisting data for this user/job/mode
    clear_s3_subdirectory(job_config)
    if multithread:
        num_cores = job_config.max_num_cores
    else:
        num_cores = 1
    for raw_data_bucket in job_config.raw_data_buckets:
        logger.info("[INFO] processing bucket {}".format(raw_data_bucket))
        courses = fetch_courses(job_config, raw_data_bucket, raw_data_dir)
        reslist = []
        with Pool(num_cores) as pool:
            for course in courses:
                holdout_session = fetch_sessions(
                    job_config,
                    raw_data_bucket,
                    raw_data_dir,
                    course,
                    fetch_holdout_session_only=True)[
                        0]  # only use holdout run; unlisted
                poolres = pool.apply_async(run_image, [
                    job_config, raw_data_bucket, course, holdout_session, level
                ])
                reslist.append(poolres)
            pool.close()
            pool.join()
        for res in reslist:
            logger.info(res.get())
    if not labels:  # normal feature extraction job; collects features across all buckets and upload to proc_data_bucket
        result_file = collect_session_results(job_config, holdout=True)
        upload_key = "{}/{}/{}/{}".format(job_config.user_id,
                                          job_config.job_id, job_config.mode,
                                          result_file)
        upload_file_to_s3(result_file,
                          bucket=job_config.proc_data_bucket,
                          key=upload_key)
    else:  # label extraction job; copy file into raw course data dir instead of proc_data_bucket, creating separate label files for each bucket
        for raw_data_bucket in job_config.raw_data_buckets:
            result_file = collect_session_results(
                job_config, raw_data_buckets=[raw_data_bucket])
            upload_key = raw_data_dir + "{}.csv".format(label_type)
            upload_file_to_s3(result_file,
                              bucket=raw_data_bucket,
                              key=upload_key)
    os.remove(result_file)
    send_email_alert(job_config)
    return
Example #15
def extract_session(labels=False,
                    raw_data_dir="morf-data/",
                    label_type="labels-train",
                    multithread=True):
    """
    Extract features using the Docker image, building individual feature sets for each "session" or iteration of the course.
    :param labels: flag for whether this is a job to generate output labels; if so, the collected result file is copied back into the raw data folder in s3 (as labels-train.csv).
    :param raw_data_dir: path to directory in all data buckets where course-level directories are located; this should be uniform for every raw data bucket.
    :param label_type: type of outcome label to use (string).
    :param multithread: whether to run the job in parallel (multithread=False can be useful for debugging).
    :return:
    """
    level = "session"
    mode = "extract"
    job_config = MorfJobConfig(CONFIG_FILENAME)
    job_config.update_mode(mode)
    logger = set_logger_handlers(module_logger, job_config)
    # clear any preexisting data for this user/job/mode and set number of cores
    clear_s3_subdirectory(job_config)
    if multithread:
        num_cores = job_config.max_num_cores
    else:
        num_cores = 1
    ## for each bucket, call job_runner once per session with --mode=extract and --level=session
    for raw_data_bucket in job_config.raw_data_buckets:
        logger.info("processing bucket {}".format(raw_data_bucket))
        courses = fetch_courses(job_config, raw_data_bucket, raw_data_dir)
        reslist = []
        with Pool(num_cores) as pool:
            for course in courses:
                for session in fetch_sessions(
                        job_config,
                        raw_data_bucket,
                        raw_data_dir,
                        course,
                        fetch_holdout_session_only=False):
                    poolres = pool.apply_async(
                        run_image,
                        [job_config, raw_data_bucket, course, session, level])
                    reslist.append(poolres)
            pool.close()
            pool.join()
        for res in reslist:
            logger.info(res.get())
    if not labels:  # normal feature extraction job; collects features across all buckets and upload to proc_data_bucket
        result_file = collect_session_results(job_config)
        upload_key = "{}/{}/extract/{}".format(job_config.user_id,
                                               job_config.job_id, result_file)
        upload_file_to_s3(result_file,
                          bucket=job_config.proc_data_bucket,
                          key=upload_key)
    else:  # label extraction job; copy file into raw course data dir instead of proc_data_bucket, creating separate label files for each bucket
        for raw_data_bucket in job_config.raw_data_buckets:
            result_file = collect_session_results(
                job_config, raw_data_buckets=[raw_data_bucket])
            upload_key = raw_data_dir + "{}.csv".format(label_type)
            upload_file_to_s3(result_file,
                              bucket=raw_data_bucket,
                              key=upload_key)
    os.remove(result_file)
    send_email_alert(job_config)
    return
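A short usage sketch based on the signature above (argument values are illustrative): the labels flag switches the function from feature extraction to label generation, writing <label_type>.csv back into each raw data bucket instead of uploading features to proc_data_bucket.

extract_session()                                        # ordinary feature extraction
extract_session(labels=True, label_type="labels-train")  # label-generation run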
Example #16
def extract_holdout_session(labels=False,
                            raw_data_dir="morf-data/",
                            label_type="labels-train",
                            multithread=True):
    """
    Extract features using the Docker image across each session of holdout data.
    :param labels: flag for whether this is a job to generate output labels; if so, the collected result file is copied back into the raw data folder in s3 (as labels-test.csv).
    :return: None
    """
    mode = "extract-holdout"
    # call job_runner once per session with --mode=extract-holdout and --level=session
    raw_data_buckets = fetch_data_buckets_from_config()
    # clear any preexisting data for this user/job/mode
    clear_s3_subdirectory(proc_data_bucket, user_id, job_id, mode)
    for raw_data_bucket in raw_data_buckets:
        print("[INFO] processing bucket {}".format(raw_data_bucket))
        if multithread:
            with Pool() as pool:
                for course in fetch_courses(s3, raw_data_bucket, raw_data_dir):
                    holdout_run = fetch_sessions(
                        s3,
                        raw_data_bucket,
                        raw_data_dir,
                        course,
                        fetch_holdout_session_only=True)[
                            0]  # only use holdout run; unlisted
                    pool.apply_async(run_job, [
                        docker_url, mode, course, user_id, job_id, holdout_run,
                        "session", raw_data_bucket
                    ])
                pool.close()
                pool.join()
        else:  # do job in serial; this is useful for debugging
            for course in fetch_courses(s3, raw_data_bucket, raw_data_dir):
                holdout_run = fetch_sessions(
                    s3,
                    raw_data_bucket,
                    raw_data_dir,
                    course,
                    fetch_holdout_session_only=True)[
                        0]  # only use holdout run; unlisted
                run_job(docker_url, mode, course, user_id, job_id, holdout_run,
                        "session", raw_data_bucket)
    if not labels:  # normal feature extraction job; collects features across all buckets and upload to proc_data_bucket
        result_file = collect_session_results(s3,
                                              raw_data_buckets,
                                              proc_data_bucket,
                                              mode,
                                              user_id,
                                              job_id,
                                              holdout=True)
        upload_key = "{}/{}/{}/{}".format(user_id, job_id, mode, result_file)
        upload_file_to_s3(result_file, bucket=proc_data_bucket, key=upload_key)
    if labels:  # label extraction job; copy file into raw course data dir instead of proc_data_bucket, creating separate label files for each bucket
        for raw_data_bucket in raw_data_buckets:
            result_file = collect_session_results(s3, [raw_data_bucket],
                                                  proc_data_bucket,
                                                  mode,
                                                  user_id,
                                                  job_id,
                                                  holdout=True)
            upload_key = raw_data_dir + "{}.csv".format(label_type)
            upload_file_to_s3(result_file,
                              bucket=raw_data_bucket,
                              key=upload_key)
    os.remove(result_file)
    send_email_alert(aws_access_key_id,
                     aws_secret_access_key,
                     job_id,
                     user_id,
                     status=mode,
                     emailaddr_to=email_to)
    return