Exemplo n.º 1
0
def cross_validate_session(label_type,
                           k=5,
                           multithread=True,
                           raw_data_dir="morf-data/"):
    """
    Compute k-fold cross-validation across sessions.

    NOTE: this function is a stub. It raises NotImplementedError as its very
    first statement, so the draft workflow written below that statement is
    never executed.

    :param label_type: label type handed to initialize_labels() in the draft.
    :param k: number of folds; the draft iterates fold numbers 1..k.
    :param multithread: when True the draft sizes the pool from
        job_config.max_num_cores, otherwise it uses a single worker.
    :param raw_data_dir: passed to fetch_sessions() as data_dir and to
        initialize_labels().
    :return: None (never reached at present).
    :raises NotImplementedError: always.
    """
    raise NotImplementedError  # this is not implemented!
    # ---- everything below is an unreachable draft ----
    # todo: call to create_session_folds() goes here
    job_config = MorfJobConfig(CONFIG_FILENAME)
    # NOTE(review): `mode` is not defined inside this function; presumably a
    # module-level name -- confirm before enabling this code.
    job_config.update_mode(mode)
    logger = set_logger_handlers(module_logger, job_config)
    # clear any preexisting data for this user/job/mode
    # clear_s3_subdirectory(job_config)
    num_cores = job_config.max_num_cores if multithread else 1
    logger.info("conducting cross validation")
    with Pool(num_cores) as pool:
        for raw_data_bucket in job_config.raw_data_buckets:
            for course in fetch_complete_courses(job_config, raw_data_bucket):
                sessions = fetch_sessions(job_config,
                                          raw_data_bucket,
                                          data_dir=raw_data_dir,
                                          course=course,
                                          fetch_all_sessions=True)
                for session in sessions:
                    for fold_num in range(1, k + 1):
                        with tempfile.TemporaryDirectory(
                                dir=job_config.local_working_directory
                        ) as working_dir:
                            input_dir, output_dir = initialize_input_output_dirs(
                                working_dir)
                            session_input_dir = os.path.join(
                                input_dir, course, session)
                            session_output_dir = os.path.join(
                                output_dir, course, session)
                            # get fold data: train first, then test
                            fold_data_paths = {}
                            for split in ("train", "test"):
                                feature_csv = make_feature_csv_name(
                                    course, session, fold_num, split)
                                s3_key = make_s3_key_path(
                                    job_config, course, feature_csv, session)
                                fold_data_paths[split] = download_from_s3(
                                    job_config.proc_data_bucket,
                                    s3_key,
                                    job_config.initialize_s3(),
                                    dir=session_input_dir,
                                    job_config=job_config)
                            # get labels
                            initialize_labels(job_config, raw_data_bucket,
                                              course, session, label_type,
                                              session_input_dir, raw_data_dir)
                            # run docker image with mode == cv
                            #todo
                            # upload results
                            #todo
        pool.close()
        pool.join()
    return
Exemplo n.º 2
0
def send_success_email(job_config, emailaddr_from="*****@*****.**"):
    """
    Send an email alert with the job's evaluation results attached.

    The results CSV is downloaded from the job's proc_data_bucket, attached to
    a multipart message as "morf-results.csv", and sent through Amazon SES to
    both the configured recipient and the sender address.

    Side effect: job_config is switched to mode "test" so that the key path of
    the results file is built correctly.

    Modified substantially from:
    http://blog.vero4ka.info/blog/2016/10/26/how-to-send-an-email-with-attachment-via-amazon-ses-in-python/
    https://gist.github.com/yosemitebandit/2883593
    :param job_config: MorfJobConfig object.
    :param emailaddr_from: address to send email from (string).
    :return: None. Errors raised while sending are caught and printed as a
        warning instead of being propagated.
    """
    aws_access_key_id = job_config.aws_access_key_id
    aws_secret_access_key = job_config.aws_secret_access_key
    proc_data_bucket = job_config.proc_data_bucket
    emailaddr_to = job_config.email_to
    job_config.update_mode(
        "test"
    )  # need to set mode so that correct key path is used to fetch results
    results_file_name = "morf-results.csv"
    s3 = boto3.client("s3",
                      aws_access_key_id=aws_access_key_id,
                      aws_secret_access_key=aws_secret_access_key)
    # fetch model evaluation results
    attachment_basename = generate_archive_filename(job_config,
                                                    mode="evaluate",
                                                    extension="csv")
    key = make_s3_key_path(job_config, filename=attachment_basename)
    attachment_filepath = download_from_s3(proc_data_bucket, key, s3)
    # Read raw bytes: decoding with the locale encoding and handing a str to
    # MIMEApplication can corrupt (or fail on) non-ASCII file contents.
    with open(attachment_filepath, "rb") as f:
        attachment_bytes = f.read()
    # Build an email
    subject_text = construct_message_subject(job_config)
    msg = MIMEMultipart()
    msg["Subject"] = subject_text
    msg["From"] = emailaddr_from
    msg["To"] = emailaddr_to
    # What a recipient sees if they don't use an email reader
    msg.preamble = "Multipart message.\n"
    # the body
    body_text = construct_message_body(job_config)
    body = MIMEText(body_text)
    msg.attach(body)
    # The attachment
    part = MIMEApplication(attachment_bytes)
    part.add_header("Content-Disposition",
                    "attachment",
                    filename=results_file_name)
    # MIMEApplication already sets a Content-Type header; replace it rather
    # than appending a second, conflicting one with add_header().
    # NOTE(review): the charset is declared as UTF-8 without checking the
    # file's actual encoding -- confirm the results CSV is written as UTF-8.
    part.replace_header("Content-Type",
                        "application/vnd.ms-excel; charset=UTF-8")
    msg.attach(part)
    # Connect to Amazon SES
    ses = boto3.client(
        "ses",
        region_name="us-east-1",
        aws_access_key_id=aws_access_key_id,
        aws_secret_access_key=aws_secret_access_key,
    )
    # And finally, send the email
    try:
        ses.send_raw_email(Source=emailaddr_from,
                           Destinations=[emailaddr_to, emailaddr_from],
                           RawMessage={
                               'Data': msg.as_string(),
                           })
        print("[INFO] email notification sent emailaddr_to {}".format(
            emailaddr_to))
    except Exception as e:
        # Notification is best-effort: a failed email must not fail the job.
        print("[WARNING] error sending email to {}: {}".format(
            emailaddr_to, e))
    return
Exemplo n.º 3
0
def execute_image_for_cv(job_config,
                         raw_data_bucket,
                         course,
                         fold_num,
                         docker_image_dir,
                         label_type,
                         raw_data_dir="morf-data/"):
    """
    Run the docker image on one course-level cross-validation fold.

    Inside a temporary working directory this downloads the fold's train and
    test feature CSVs from job_config.proc_data_bucket, initializes the
    training labels, loads and runs the docker image with an extra
    "--fold_num" argument, and uploads "<course>_<fold_num>_test.csv" from
    the output directory back to job_config.proc_data_bucket.

    NOTE(review): `mode` is not defined inside this function; presumably a
    module-level name -- confirm.

    :param job_config: job configuration object; supplies the working
        directory, S3 access, docker executable and client args.
    :param raw_data_bucket: bucket name forwarded to initialize_cv_labels().
    :param course: course identifier used in directory, key and file names.
    :param fold_num: fold number used in file names and passed to the image.
    :param docker_image_dir: location handed to load_docker_image().
    :param label_type: label type forwarded to initialize_cv_labels().
    :param raw_data_dir: raw data directory forwarded to
        initialize_cv_labels().
    :return: None.
    """
    user_id_col = "userID"
    logger = set_logger_handlers(module_logger, job_config)
    with tempfile.TemporaryDirectory(
            dir=job_config.local_working_directory) as working_dir:
        input_dir, output_dir = initialize_input_output_dirs(working_dir)
        course_input_dir = os.path.join(input_dir, course)

        def fetch_fold_csv(split):
            # download this fold's feature csv for the given split
            feature_csv = make_feature_csv_name(course, fold_num, split)
            s3_key = make_s3_key_path(job_config, course, feature_csv)
            return download_from_s3(job_config.proc_data_bucket,
                                    s3_key,
                                    job_config.initialize_s3(),
                                    dir=course_input_dir,
                                    job_config=job_config)

        # get fold train data, then fold test data
        train_data_path = fetch_fold_csv("train")
        test_data_path = fetch_fold_csv("test")
        # get labels for the users present in the training fold
        train_frame = pd.read_csv(train_data_path)
        train_users = train_frame[user_id_col]
        train_labels_path = initialize_cv_labels(job_config,
                                                 train_users,
                                                 raw_data_bucket,
                                                 course,
                                                 label_type,
                                                 input_dir,
                                                 raw_data_dir,
                                                 fold_num,
                                                 "train",
                                                 level="course")
        # run docker image with mode == cv
        image_uuid = load_docker_image(docker_image_dir, job_config, logger)
        base_cmd = make_docker_run_command(job_config,
                                           job_config.docker_exec,
                                           input_dir,
                                           output_dir,
                                           image_uuid,
                                           course,
                                           None,
                                           mode,
                                           job_config.client_args)
        fold_flag = " --fold_num {}".format(fold_num)
        cmd = base_cmd + fold_flag
        execute_and_log_output(cmd, logger)
        # upload results
        pred_filename = "{}_{}_test.csv".format(course, fold_num)
        pred_csv = os.path.join(output_dir, pred_filename)
        pred_key = make_s3_key_path(job_config,
                                    course,
                                    os.path.basename(pred_csv),
                                    mode="test")
        upload_file_to_s3(pred_csv,
                          job_config.proc_data_bucket,
                          pred_key,
                          job_config,
                          remove_on_success=True)
    return