def fetch_result_file(job_config, dir, course=None, session=None):
    """
    Download and untar the result file for user_id, job_id, mode, and (optional) course and session
    from job_config.proc_data_bucket.
    :param job_config: MorfJobConfig object.
    :param dir: local directory to download and unarchive the result file into.
    :param course: course shortname.
    :param session: session number.
    :return: None.
    """
    logger = set_logger_handlers(module_logger, job_config)
    s3 = job_config.initialize_s3()
    bucket = job_config.proc_data_bucket
    archive_file = generate_archive_filename(job_config, course, session)
    key = make_s3_key_path(job_config, course=course, session=session, filename=archive_file)
    dest = os.path.join(dir, archive_file)
    logger.info("fetching s3://{}/{}".format(bucket, key))
    with open(dest, 'wb') as resource:
        try:
            s3.download_fileobj(bucket, key, resource)
        except Exception as e:
            logger.warning("exception while fetching results for mode {} course {} session {}: {}"
                           .format(job_config.mode, course, session, e))
    unarchive_file(dest, dir)
    os.remove(dest)
    return

def make_output_archive_file(output_dir, job_config, course=None, session=None):
    """
    Archive output_dir into an archive file, and return the name of the archive file.
    :param output_dir: directory to compress into archive_file.
    :param job_config: MorfJobConfig object.
    :param course: name of course for job (string).
    :param session: session number of course (string) (optional, only needed when mode == extract).
    :return: name of archive file (string).
    """
    logger = set_logger_handlers(module_logger, job_config)
    archive_file = generate_archive_filename(job_config, course, session)
    # archive results; only save directory structure relative to output_dir (NOT absolute directory structure)
    logger.info("archiving results in {} as {}".format(output_dir, archive_file))
    # todo: use python tarfile here
    cmd = "tar -cvf {} -C {} .".format(archive_file, output_dir)
    subprocess.call(cmd, shell=True, stdout=open(os.devnull, "wb"), stderr=open(os.devnull, "wb"))
    return archive_file

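# A minimal sketch of the pure-Python alternative suggested by the "todo: use python tarfile here"
# note above. The helper name is hypothetical and not part of the MORF API; it is illustrative only.
def _archive_dir_with_tarfile(archive_file, output_dir):
    import tarfile
    # arcname="." stores paths relative to output_dir, matching `tar -cvf <archive_file> -C <output_dir> .`
    with tarfile.open(archive_file, "w") as tar:
        tar.add(output_dir, arcname=".")
    return archive_file
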
def sync_s3_bucket_cache(job_config, bucket):
    """
    Cache all data in an s3 bucket to job_config.cache_dir, creating a complete copy of files and directory structure.
    :param job_config: MorfJobConfig object.
    :param bucket: path to s3 bucket.
    :return: None
    """
    logger = set_logger_handlers(module_logger, job_config)
    s3bucket = "s3://{}".format(bucket)
    bucket_cache_dir = os.path.join(job_config.cache_dir, bucket)
    # create job_config.cache_dir directory if it does not exist
    if not os.path.exists(job_config.cache_dir):
        try:
            os.makedirs(job_config.cache_dir)
        except Exception as e:
            logger.error("error creating cache: {}".format(e))
            raise
    # execute s3 sync command
    cmd = "{} s3 sync {} {}".format(job_config.aws_exec, s3bucket, bucket_cache_dir)
    logger.info("running {}".format(cmd))
    try:
        subprocess.call(cmd, shell=True)
    except Exception as e:
        logger.warning("exception when executing sync: {}".format(e))
    return

def create_course_folds(label_type, k=5, multithread=True):
    """
    From extract and extract-holdout data, create k randomized folds, pooling data by course
    (across sessions), and archive results to s3.
    :param label_type: type of outcome label to use.
    :param k: number of folds.
    :param multithread: logical indicating whether multiple cores should be used (if available).
    :return: None
    """
    job_config = MorfJobConfig(CONFIG_FILENAME)
    job_config.update_mode(mode)
    logger = set_logger_handlers(module_logger, job_config)
    # clear any preexisting data for this user/job/mode
    clear_s3_subdirectory(job_config)
    if multithread:
        num_cores = job_config.max_num_cores
    else:
        num_cores = 1
    logger.info("creating cross-validation folds")
    for raw_data_bucket in job_config.raw_data_buckets:
        reslist = []
        with Pool(num_cores) as pool:
            for course in fetch_complete_courses(job_config, raw_data_bucket):
                poolres = pool.apply_async(make_folds, [job_config, raw_data_bucket, course, k, label_type])
                reslist.append(poolres)
            pool.close()
            pool.join()
        for res in reslist:
            logger.info(res.get())
    return

def check_dataframe_complete(df, job_config, columns):
    """
    Check columns for the presence of NaN values; if any NaN values exist, log a message and raise an exception.
    :param df: pd.DataFrame containing columns.
    :param job_config: MorfJobConfig object.
    :param columns: columns to check for NaN values.
    :return: None
    """
    logger = set_logger_handlers(module_logger, job_config)
    logger.info("checking predictions")
    # filter to only include complete courses
    courses = [x[0] for x in fetch_all_complete_courses_and_sessions(job_config)]
    df_to_check = df[df.course.isin(courses)]
    null_counts = df_to_check[columns].apply(lambda x: sum(x.isnull()), axis=0)
    if null_counts.sum() > 0:
        null_columns = null_counts.loc[null_counts > 0].index.tolist()
        logger.error("Null values detected in the following columns: {}\nDid you include predicted probabilities and labels for all users?"
                     .format(null_columns))
        missing_courses = df_to_check[df_to_check.prob.isnull()]['course'].unique()
        logger.error("missing values detected in these courses: {}".format(missing_courses))
        raise ValueError("null values detected in columns: {}".format(null_columns))
    else:
        return

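# Illustrative sketch (hypothetical helper, not called by the pipeline): the per-column null-count
# logic used in check_dataframe_complete, demonstrated on a small in-memory frame.
def _example_null_count_check():
    toy = pd.DataFrame({"prob": [0.2, None, 0.9],
                        "label_value": [0, 1, None],
                        "course": ["a", "a", "b"]})
    null_counts = toy[["prob", "label_value"]].apply(lambda x: sum(x.isnull()), axis=0)
    # null_counts is a Series indexed by column name; both columns report one missing value here
    return null_counts.loc[null_counts > 0].index.tolist()
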
def sync_s3_job_cache(job_config, modes=("extract", "extract-holdout", "train", "test")):
    """
    Sync data in s3 just for this specific job (better for large buckets or when the entire bucket is not actually needed).
    :param job_config: MorfJobConfig object; job_config.proc_data_bucket is the bucket that is synced.
    :param modes: modes to update the cache for; defaults to all modes.
    :return: None
    """
    bucket = job_config.proc_data_bucket
    logger = set_logger_handlers(module_logger, job_config)
    s3bucket = "s3://{}".format(bucket)
    bucket_cache_dir = os.path.join(job_config.cache_dir, bucket)
    # create job_config.cache_dir directory if it does not exist
    if not os.path.exists(job_config.cache_dir):
        try:
            os.makedirs(job_config.cache_dir)
        except Exception as e:
            logger.error("error creating cache: {}".format(e))
            raise
    for m in modes:
        s3_prefix = make_s3_key_path(job_config, mode=m)
        mode_cache_dir = os.path.join(bucket_cache_dir, job_config.user_id, job_config.job_id, m)
        # execute s3 sync command
        cmd = "{} s3 sync {}/{} {}".format(job_config.aws_exec, s3bucket, s3_prefix, mode_cache_dir)
        logger.info("running {}".format(cmd))
        try:
            subprocess.call(cmd, shell=True)
        except Exception as e:
            logger.warning("exception when executing sync: {}".format(e))
    return

def upload_files_to_zenodo(job_config, upload_files, deposition_id=None, publish=True):
    """
    Upload each file in upload_files to Zenodo, and optionally publish the deposition.
    :param job_config: MorfJobConfig object; must provide a zenodo_access_token attribute.
    :param upload_files: a tuple of filenames to upload. These should be locally available.
    :param deposition_id: id of an existing Zenodo deposition; if None, an empty deposition is created.
    :param publish: whether to publish the deposition after uploading.
    :return: deposition_id of the Zenodo files.
    """
    working_dir = os.getcwd()
    s3 = job_config.initialize_s3()
    logger = set_logger_handlers(module_logger, job_config)
    access_token = getattr(job_config, "zenodo_access_token")
    # check inputs
    assert isinstance(upload_files, collections.abc.Iterable), "param 'upload_files' must be an iterable"
    if not deposition_id:
        # create an empty upload and get its deposition id
        deposition_id = create_empty_zenodo_upload(access_token).json()['id']
    # upload each file
    for f in upload_files:
        fp = fetch_file(s3, working_dir, f, job_config=job_config)
        data = {'filename': fp}
        with open(fp, 'rb') as fh:
            files = {'file': fh}
            r = requests.post('https://zenodo.org/api/deposit/depositions/%s/files' % deposition_id,
                              params={'access_token': access_token}, data=data, files=files)
        logger.info(r.json())
    # generate metadata for the zenodo deposition and publish it
    generate_zenodo_metadata(job_config, deposition_id)
    if publish:
        publish_zenodo_deposition(job_config, deposition_id)
    return deposition_id

def fetch_binary_classification_metrics(job_config, df, course, pred_prob_col="prob", pred_col="pred",
                                        label_col="label_value", course_col="course"):
    """
    Fetch a set of binary classification metrics for df.
    :param job_config: MorfJobConfig object.
    :param df: pd.DataFrame of predictions; must include columns with names matching pred_prob_col, pred_col, and label_col.
    :param course: course to compute metrics for; df is filtered to rows where course_col matches this value.
    :param pred_prob_col: column of predicted probability of a positive class label. Should be in interval [0,1].
    :param pred_col: column of predicted class label. Should be in {0, 1}.
    :param label_col: column of true class label. Should be in {0, 1}.
    :param course_col: column containing the course identifier.
    :return: pd.DataFrame with dimension [1 x n_metrics].
    """
    logger = set_logger_handlers(module_logger, job_config)
    logger.info("fetching metrics for course {}".format(course))
    df = df[df[course_col] == course]
    metrics = {}
    y_pred = df[pred_col].values.astype(float)
    y_true = df[label_col].values.astype(float)
    y_score = df[pred_prob_col].values
    metrics["accuracy"] = sklearn.metrics.accuracy_score(y_true, y_pred)
    try:
        metrics["auc"] = sklearn.metrics.roc_auc_score(y_true, y_score)
        metrics["log_loss"] = sklearn.metrics.log_loss(y_true, y_score)
        metrics["precision"] = sklearn.metrics.precision_score(y_true, y_pred)
        metrics["recall"] = sklearn.metrics.recall_score(y_true, y_pred)  # true positive rate, sensitivity
        metrics["f1_score"] = sklearn.metrics.f1_score(y_true, y_pred)
    except ValueError:
        logger.warning("Only one class present in y_true for course {}. ROC AUC score, log_loss, precision, recall, F1 are undefined."
                       .format(course))
        metrics["auc"] = np.nan
        metrics["log_loss"] = np.nan
        metrics["precision"] = np.nan
        metrics["recall"] = np.nan
        metrics["f1_score"] = np.nan
    metrics["cohen_kappa_score"] = sklearn.metrics.cohen_kappa_score(y_true, y_pred)
    metrics["N"] = df.shape[0]
    metrics["N_n"] = df[label_col].value_counts().get(0, 0)
    metrics["N_p"] = df[label_col].value_counts().get(1, 0)
    cm = sklearn.metrics.confusion_matrix(y_true, y_pred)
    try:
        spec = cm[0, 0] / float(cm[0, 0] + cm[1, 0])
    except Exception as e:
        logger.error("error when computing specificity from confusion matrix: {}".format(e))
        logger.error("confusion matrix is: {}".format(cm))
        spec = np.nan
    metrics["specificity"] = spec
    metrics_df = pd.DataFrame(metrics, index=[course])
    return metrics_df

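# Illustrative usage sketch (hypothetical helper, not part of the pipeline): build a toy predictions
# frame with the default column names and fetch metrics for a single course. A real job_config is assumed.
def _example_fetch_binary_classification_metrics(job_config):
    toy = pd.DataFrame({"prob": [0.9, 0.2, 0.7, 0.4],
                        "pred": [1, 0, 1, 0],
                        "label_value": [1, 0, 0, 1],
                        "course": ["demo-course"] * 4})
    # returns a 1-row DataFrame indexed by course, with accuracy, auc, cohen_kappa_score, N, etc.
    return fetch_binary_classification_metrics(job_config, toy, "demo-course")
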
def download_train_test_data(job_config, raw_data_bucket, raw_data_dir, course, session, input_dir, label_type):
    """
    Download pre-extracted train or test data (specified by mode) for course/session into input_dir.
    :param job_config: MorfJobConfig object.
    :param raw_data_bucket: bucket containing raw data.
    :param raw_data_dir: directory in raw_data_bucket containing course-level data.
    :param course: course to fetch data for.
    :param session: session to fetch data for.
    :param input_dir: /input directory to load data into. This should be the same directory mounted to the Docker image.
    :param label_type: valid label type to retain for 'label' column of MORF-provided labels.
    :return: None
    """
    logger = set_logger_handlers(module_logger, job_config)
    s3 = job_config.initialize_s3()
    aws_access_key_id = job_config.aws_access_key_id
    aws_secret_access_key = job_config.aws_secret_access_key
    proc_data_bucket = job_config.proc_data_bucket
    mode = job_config.mode
    user_id = job_config.user_id
    job_id = job_config.job_id
    if mode == "train":
        fetch_mode = "extract"
    elif mode == "test":
        fetch_mode = "extract-holdout"
    logger.info("fetching {} data for course {} session {}".format(fetch_mode, course, session))
    session_input_dir = os.path.join(input_dir, course, session)
    os.makedirs(session_input_dir)
    # download features file
    feature_csv = generate_archive_filename(job_config, mode=fetch_mode, extension="csv")
    key = "{}/{}/{}/{}".format(user_id, job_id, fetch_mode, feature_csv)
    download_from_s3(proc_data_bucket, key, s3, session_input_dir)
    # read features file and filter to only include the specific course/session
    local_feature_csv = os.path.join(session_input_dir, feature_csv)
    temp_df = pd.read_csv(local_feature_csv, dtype=object)
    outfile = os.path.join(session_input_dir, "{}_{}_features.csv".format(course, session))
    temp_df[(temp_df["course"] == course) & (temp_df["session"] == session)]\
        .drop(["course", "session"], axis=1)\
        .to_csv(outfile, index=False)
    os.remove(local_feature_csv)
    if mode == "train":  # download labels only for training jobs; otherwise no labels are needed
        initialize_labels(s3, aws_access_key_id, aws_secret_access_key, raw_data_bucket, course, session,
                          mode, label_type, dest_dir=session_input_dir, data_dir=raw_data_dir)
    return

def docker_cloud_login(job_config):
    """
    Log into Docker Cloud using the credentials in job_config.
    :param job_config: MorfJobConfig object.
    :return: None
    """
    cmd = "docker login --username={} --password={}".format(job_config.docker_cloud_username,
                                                            job_config.docker_cloud_password)
    logger = set_logger_handlers(module_logger, job_config)
    execute_and_log_output(cmd, logger)
    return

def cache_to_docker_hub(job_config, dir, image_name):
    """
    Push image to the MORF repo in Docker Hub.
    :param job_config: MorfJobConfig object.
    :param dir: directory containing the Docker image file.
    :param image_name: name of the Docker image file to load and push.
    :return: Docker Cloud repo:tag path of the pushed image.
    """
    logger = set_logger_handlers(module_logger, job_config)
    image_uuid = load_docker_image(dir, job_config, logger, image_name)
    docker_cloud_login(job_config)
    docker_cloud_repo_and_tag_path = docker_cloud_push(job_config, image_uuid)
    return docker_cloud_repo_and_tag_path

def run_morf_job(job_config, no_cache=False):
    """
    Wrapper function to run a complete MORF job.
    :param job_config: MorfJobConfig object built from the combined client/server config.
    :param no_cache: if True, skip caching job files in s3.
    :return: None
    """
    combined_config_filename = "config.properties"
    logger = set_logger_handlers(module_logger, job_config)
    logger.info("running job id: {}".format(job_config.morf_id))
    controller_script_name = "controller.py"
    docker_image_name = "docker_image"
    s3 = job_config.initialize_s3()
    # create temporary directory in local_working_directory from server.config
    with tempfile.TemporaryDirectory(dir=job_config.local_working_directory) as working_dir:
        # copy config file into new directory
        shutil.copy(combined_config_filename, working_dir)
        os.chdir(working_dir)
        # from job_config, fetch and download the following: docker image, controller script, cached config file
        update_morf_job_cache(job_config)
        # from client.config, fetch and download the following: docker image, controller script
        try:
            fetch_file(s3, working_dir, job_config.docker_url, dest_filename=docker_image_name,
                       job_config=job_config)
            fetch_file(s3, working_dir, job_config.controller_url, dest_filename=controller_script_name,
                       job_config=job_config)
            if not no_cache:  # cache job files in s3 unless no_cache parameter is set to true
                cache_job_file_in_s3(job_config, filename=docker_image_name)
                cache_job_file_in_s3(job_config, filename=controller_script_name)
        except KeyError as e:
            cause = e.args[0]
            logger.error("field {} missing from client.config file.".format(cause))
            sys.exit(-1)
        # change working directory and run controller script with notifications for initialization and completion
        job_config.update_status("INITIALIZED")
        send_email_alert(job_config)
        subprocess.call("python3 {}".format(controller_script_name), shell=True)
        job_config.update_status("SUCCESS")
        send_success_email(job_config)
    return

def collect_session_results(job_config, holdout=False, raw_data_dir="morf-data/", raw_data_buckets=None):
    """
    Iterate through course- and session-level directories in each bucket, download individual result files,
    add columns for course and session, and concatenate them into a single 'master' csv.
    :param job_config: MorfJobConfig object.
    :param holdout: flag; fetch holdout run only (boolean; default False).
    :param raw_data_dir: path to directory in each raw data bucket containing course-level directories.
    :param raw_data_buckets: list of buckets containing raw data; used to fetch course names from each bucket
        (defaults to job_config.raw_data_buckets).
    :return: path to csv.
    """
    logger = set_logger_handlers(module_logger, job_config)
    mode = job_config.mode
    if not raw_data_buckets:  # this parameter can override the job_config buckets; used for label extraction
        raw_data_buckets = job_config.raw_data_buckets
    feat_df_list = list()
    for raw_data_bucket in raw_data_buckets:
        for course in fetch_courses(job_config, raw_data_bucket, raw_data_dir):
            for run in fetch_sessions(job_config, raw_data_bucket, raw_data_dir, course,
                                      fetch_holdout_session_only=holdout):
                with tempfile.TemporaryDirectory(dir=os.getcwd()) as working_dir:
                    logger.info("fetching extraction results for course {} run {}".format(course, run))
                    try:
                        fetch_result_file(job_config, course=course, session=run, dir=working_dir)
                        csv = fetch_result_csv_fp(working_dir)
                        feat_df = pd.read_csv(csv, dtype=object)
                        feat_df['course'] = course
                        feat_df['session'] = run
                        feat_df_list.append(feat_df)
                    except Exception as e:
                        logger.warning("exception while collecting session results for course {} session {} mode {}: {}"
                                       .format(course, run, mode, e))
                        continue
    master_feat_df = pd.concat(feat_df_list)
    csv_fp = generate_archive_filename(job_config, extension='csv')
    master_feat_df.to_csv(csv_fp, index=False, header=True)
    return csv_fp

def collect_course_cv_results(job_config, k=5, raw_data_dir="morf-data/"):
    """
    Iterate through course-level directories in each bucket, download per-fold test results,
    add columns for course and fold number, and concatenate them into a single 'master' csv.
    :param job_config: MorfJobConfig object.
    :param k: number of cross-validation folds.
    :param raw_data_dir: path to directory in each raw data bucket containing course-level directories.
    :return: path to csv.
    """
    logger = set_logger_handlers(module_logger, job_config)
    raw_data_buckets = job_config.raw_data_buckets
    mode = job_config.mode
    pred_df_list = list()
    for raw_data_bucket in raw_data_buckets:
        for course in fetch_complete_courses(job_config, raw_data_bucket):
            with tempfile.TemporaryDirectory(dir=os.getcwd()) as working_dir:
                for fold_num in range(1, k + 1):
                    logger.info("fetching {} results for course {} fold {}".format(mode, course, fold_num))
                    try:
                        fold_csv_name = "{}_{}_test.csv".format(course, fold_num)
                        key = make_s3_key_path(job_config, course, fold_csv_name, mode="test")
                        pred_fp = download_from_s3(job_config.proc_data_bucket, key, job_config.initialize_s3(),
                                                   working_dir, dest_filename=fold_csv_name)
                        pred_df = pd.read_csv(pred_fp, dtype=object)
                        pred_df['course'] = course
                        pred_df['fold_num'] = str(fold_num)
                        pred_df_list.append(pred_df)
                    except Exception as e:
                        logger.warning("exception occurred: {}".format(e))
                        continue
    master_feat_df = pd.concat(pred_df_list)
    csv_fp = generate_archive_filename(job_config, mode="test", extension='csv')
    master_feat_df.to_csv(csv_fp, index=False, header=True)
    return csv_fp

def evaluate_prule_session():
    """
    Perform statistical testing for prule analysis.
    :return: None
    """
    raw_data_dir = "morf-data/"
    job_config = MorfJobConfig(CONFIG_FILENAME)
    job_config.update_mode(mode)
    logger = set_logger_handlers(module_logger, job_config)
    raw_data_buckets = job_config.raw_data_buckets
    proc_data_bucket = job_config.proc_data_bucket
    prule_file = job_config.prule_url
    s3 = job_config.initialize_s3()
    # clear any preexisting data for this user/job/mode
    clear_s3_subdirectory(job_config)
    with tempfile.TemporaryDirectory(dir=os.getcwd()) as working_dir:
        input_dir, output_dir = initialize_input_output_dirs(working_dir)
        # pull extraction results from every course into working_dir
        for raw_data_bucket in raw_data_buckets:
            for course in fetch_courses(job_config, raw_data_bucket):
                for session in fetch_sessions(job_config, raw_data_bucket, raw_data_dir, course,
                                              fetch_all_sessions=True):
                    if session in fetch_sessions(job_config, raw_data_bucket, raw_data_dir, course):
                        # session is a non-holdout session
                        fetch_mode = "extract"
                    else:
                        fetch_mode = "extract-holdout"
                    feat_file = generate_archive_filename(job_config, course=course, session=session,
                                                          mode=fetch_mode)
                    feat_key = make_s3_key_path(job_config, filename=feat_file, course=course, session=session,
                                                mode=fetch_mode)
                    feat_local_fp = download_from_s3(proc_data_bucket, feat_key, s3, input_dir,
                                                     job_config=job_config)
                    unarchive_file(feat_local_fp, input_dir)
        docker_image_fp = urlparse(job_config.prule_evaluate_image).path
        docker_image_dir = os.path.dirname(docker_image_fp)
        docker_image_name = os.path.basename(docker_image_fp)
        image_uuid = load_docker_image(docker_image_dir, job_config, logger, image_name=docker_image_name)
        # create a directory for the prule file and copy it in; this will be mounted to the docker image
        prule_dir = os.path.join(working_dir, "prule")
        os.makedirs(prule_dir)
        shutil.copy(urlparse(prule_file).path, prule_dir)
        cmd = "{} run --network=\"none\" --rm=true --volume={}:/input --volume={}:/output --volume={}:/prule {}".format(
            job_config.docker_exec, input_dir, output_dir, prule_dir, image_uuid)
        subprocess.call(cmd, shell=True)
        # rename result file and upload results to s3
        final_output_file = os.path.join(output_dir, "output.csv")
        final_output_archive_name = generate_archive_filename(job_config, extension="csv")
        final_output_archive_fp = os.path.join(output_dir, final_output_archive_name)
        os.rename(final_output_file, final_output_archive_fp)
        output_key = make_s3_key_path(job_config, filename=final_output_archive_name, mode="test")
        upload_file_to_s3(final_output_archive_fp, proc_data_bucket, output_key, job_config,
                          remove_on_success=True)
    return

def extract_holdout_course(raw_data_dir="morf-data/", multithread=True):
    """
    Extract features using the Docker image across each course of holdout data.
    :param raw_data_dir: path to directory in each raw data bucket containing course-level directories.
    :param multithread: whether to run the job in parallel.
    :return: None
    """
    mode = "extract-holdout"
    level = "course"
    job_config = MorfJobConfig(CONFIG_FILENAME)
    job_config.update_mode(mode)
    logger = set_logger_handlers(module_logger, job_config)
    # clear any preexisting data for this user/job/mode
    clear_s3_subdirectory(job_config)
    if multithread:
        num_cores = job_config.max_num_cores
    else:
        num_cores = 1
    # call job_runner once per course with --mode=extract and --level=course
    for raw_data_bucket in job_config.raw_data_buckets:
        logger.info("processing bucket {}".format(raw_data_bucket))
        courses = fetch_courses(job_config, raw_data_bucket, raw_data_dir)
        reslist = []
        with Pool(num_cores) as pool:
            for course in courses:
                # only use the holdout run; take the single session out of the returned list
                holdout_session = fetch_sessions(job_config, raw_data_bucket, raw_data_dir, course,
                                                 fetch_holdout_session_only=True)[0]
                poolres = pool.apply_async(run_image, [job_config, raw_data_bucket, course, holdout_session,
                                                       level, None])
                reslist.append(poolres)
            pool.close()
            pool.join()
        for res in reslist:
            logger.info(res.get())
    result_file = collect_course_results(job_config)
    upload_key = make_s3_key_path(job_config, filename=result_file)
    upload_file_to_s3(result_file, bucket=job_config.proc_data_bucket, key=upload_key)
    os.remove(result_file)
    send_email_alert(job_config)
    return

def test_course(label_type, raw_data_dir="morf-data/", multithread=True):
    """
    Test one model per course using the Docker image.
    :param label_type: label type provided by user.
    :param raw_data_dir: path to directory in all data buckets where course-level directories are located;
        this should be uniform for every raw data bucket.
    :param multithread: whether to run the job in parallel (multithread=False can be useful for debugging).
    :return: None
    """
    level = "course"
    job_config = MorfJobConfig(CONFIG_FILENAME)
    job_config.update_mode(mode)
    logger = set_logger_handlers(module_logger, job_config)
    check_label_type(label_type)
    # clear any preexisting data for this user/job/mode
    clear_s3_subdirectory(job_config)
    if multithread:
        num_cores = job_config.max_num_cores
    else:
        num_cores = 1
    # for each bucket, call job_runner once per course with --mode=test and --level=course
    for raw_data_bucket in job_config.raw_data_buckets:
        logger.info("processing bucket {}".format(raw_data_bucket))
        courses = fetch_complete_courses(job_config, raw_data_bucket, raw_data_dir)
        reslist = []
        with Pool(num_cores) as pool:
            for course in courses:
                poolres = pool.apply_async(run_image, [job_config, raw_data_bucket, course, None, level,
                                                       label_type])
                reslist.append(poolres)
            pool.close()
            pool.join()
        for res in reslist:
            logger.info(res.get())
    result_file = collect_course_results(job_config)
    upload_key = make_s3_key_path(job_config, filename=generate_archive_filename(job_config, extension="csv"))
    upload_file_to_s3(result_file, bucket=job_config.proc_data_bucket, key=upload_key)
    os.remove(result_file)
    send_email_alert(job_config)
    return

def fetch_raw_course_data(job_config, bucket, course, session, input_dir, data_dir="morf-data/"):
    """
    Fetch raw course data from job_config.cache_dir, if it exists; otherwise fetch from s3.
    :param job_config: MorfJobConfig object.
    :param bucket: bucket containing raw data.
    :param course: id of course to download data for.
    :param session: id of session to download data for.
    :param input_dir: input directory.
    :param data_dir: directory in bucket that contains course-level data.
    :return: None
    """
    logger = set_logger_handlers(module_logger, job_config)
    course_date_file = "coursera_course_dates.csv"
    course_session_cache_dir = os.path.join(job_config.cache_dir, bucket, data_dir, course, session)
    session_input_dir = os.path.join(input_dir, course, session)
    if job_config.cache_dir:
        try:
            logger.info("copying data from cached location {} to {}".format(course_session_cache_dir,
                                                                             session_input_dir))
            shutil.copytree(course_session_cache_dir, session_input_dir)
            course_date_file = os.path.join(job_config.cache_dir, bucket, data_dir, course_date_file)
            shutil.copy(course_date_file, session_input_dir)
        except Exception as e:
            logger.error("exception while attempting to copy from cache: {}".format(e))
    else:
        download_raw_course_data(job_config, bucket=bucket, course=course, session=session,
                                 input_dir=input_dir, data_dir=data_dir)
    # unzip all of the sql files and remove any parens from filenames
    for item in os.listdir(session_input_dir):
        if item.endswith(".sql.gz"):
            item_path = os.path.join(session_input_dir, item)
            unarchive_res = unarchive_file(item_path, session_input_dir)
            clean_filename(unarchive_res)
    return

def collect_course_results(job_config, raw_data_dir="morf-data/"):
    """
    Iterate through course-level directories in each bucket, download individual result files,
    add a column for course, and concatenate them into a single 'master' csv.
    :param job_config: MorfJobConfig object.
    :param raw_data_dir: path to directory in each raw data bucket containing course-level directories.
    :return: path to csv.
    """
    logger = set_logger_handlers(module_logger, job_config)
    raw_data_buckets = job_config.raw_data_buckets
    mode = job_config.mode
    feat_df_list = list()
    for raw_data_bucket in raw_data_buckets:
        for course in fetch_complete_courses(job_config, raw_data_bucket):
            if mode == "extract-holdout":
                # results are stored in session-level directories in extract-holdout mode; get this session
                session = fetch_sessions(job_config, raw_data_bucket, raw_data_dir, course,
                                         fetch_holdout_session_only=True)[0]
            else:
                session = None
            with tempfile.TemporaryDirectory(dir=os.getcwd()) as working_dir:
                logger.info("fetching {} results for course {} session {}".format(mode, course, session))
                try:
                    fetch_result_file(job_config, dir=working_dir, course=course, session=session)
                    csv = fetch_result_csv_fp(working_dir)
                    feat_df = pd.read_csv(csv, dtype=object)
                    feat_df['course'] = course
                    feat_df_list.append(feat_df)
                except Exception as e:
                    logger.warning("exception occurred: {}".format(e))
                    continue
    master_feat_df = pd.concat(feat_df_list)
    csv_fp = generate_archive_filename(job_config, extension='csv')
    master_feat_df.to_csv(csv_fp, index=False, header=True)
    return csv_fp

def docker_cloud_push(job_config, image_uuid):
    """
    Push image to the Docker Cloud repo in job_config, tagging the image with its morf_id.
    :param job_config: MorfJobConfig object.
    :param image_uuid: Docker image uuid.
    :return: the repo:tag path the image was pushed to.
    """
    logger = set_logger_handlers(module_logger, job_config)
    docker_cloud_repo_and_tag_path = "{}:{}".format(job_config.docker_cloud_repo, job_config.morf_id)
    # tag the docker image using the morf_id
    tag_cmd = "docker tag {} {}".format(image_uuid, docker_cloud_repo_and_tag_path)
    execute_and_log_output(tag_cmd, logger)
    # push the image to docker cloud
    push_cmd = "docker push {}".format(docker_cloud_repo_and_tag_path)
    execute_and_log_output(push_cmd, logger)
    return docker_cloud_repo_and_tag_path

def clear_s3_subdirectory(job_config, course=None, session=None):
    """
    Clear all files for user_id, job_id, and mode; used to wipe the s3 subdirectory before uploading new files.
    :param job_config: MorfJobConfig object.
    :param course: course to restrict the deletion to (optional).
    :param session: session to restrict the deletion to (optional).
    :return: None
    """
    logger = set_logger_handlers(module_logger, job_config)
    s3_prefix = "/".join([x for x in [job_config.proc_data_bucket, job_config.user_id, job_config.job_id,
                                      job_config.mode, course, session] if x is not None]) + "/"
    logger.info("clearing previous job data at s3://{}".format(s3_prefix))
    delete_s3_keys(job_config, prefix=s3_prefix)
    return

def download_models(job_config, course, dest_dir, level, session=None):
    """
    Download and untar the archived file of pre-trained models for the specified user_id/job_id/course.
    :param job_config: MorfJobConfig object.
    :param course: course slug for job (string).
    :param dest_dir: location to download models to; this should be the /input directory mounted to the Docker image.
    :param level: level for job.
    :param session: session id for session-level jobs.
    :return: None
    """
    logger = set_logger_handlers(module_logger, job_config)
    bucket = job_config.proc_data_bucket
    user_id = job_config.user_id
    aws_access_key_id = job_config.aws_access_key_id
    aws_secret_access_key = job_config.aws_secret_access_key
    job_id = job_config.job_id
    if level == "all":  # just one model file
        mod_archive_file = generate_archive_filename(job_config, mode="train")
        key = make_s3_key_path(job_config, mode="train", filename=mod_archive_file)
        download_model_from_s3(job_config, bucket, key, dest_dir)
    elif level in ["course", "session"]:
        # model files might be in either course- or session-level directories
        train_files = [
            obj.key for obj in boto3.resource("s3",
                                              aws_access_key_id=aws_access_key_id,
                                              aws_secret_access_key=aws_secret_access_key)
            .Bucket(bucket).objects.filter(Prefix="/".join([user_id, job_id, "train"]))
            if ".tgz" in obj.key.split("/")[-1]  # fetch trained model files only
            and "train" in obj.key.split("/")[-1]
            and course in obj.key.split("/")[-1]
        ]
        for key in train_files:
            download_model_from_s3(job_config, bucket, key, dest_dir)
    else:
        msg = "the procedure for executing this job is unsupported in this version of MORF."
        logger.error(msg)
        raise NotImplementedError(msg)
    return

def download_model_from_s3(job_config, bucket, key, dest_dir):
    """
    Download and untar a model file from S3, or log a warning message if it doesn't exist.
    :param job_config: MorfJobConfig object.
    :param bucket: bucket containing the model file.
    :param key: key of the model file in bucket.
    :param dest_dir: directory to download and unarchive the model file into.
    :return: None
    """
    logger = set_logger_handlers(module_logger, job_config)
    s3 = job_config.initialize_s3()
    mod_url = 's3://{}/{}'.format(bucket, key)
    logger.info("downloading compressed model file from bucket {} key {}".format(bucket, key))
    try:
        tar_path = initialize_tar(mod_url, s3=s3, dest_dir=dest_dir)
        unarchive_file(tar_path, dest_dir)
    except Exception:
        logger.error("error downloading model file from s3; trained model(s) for this course may not exist. Skipping.")
    return

def download_raw_course_data(job_config, bucket, course, session, input_dir, data_dir,
                             course_date_file_name="coursera_course_dates.csv"):
    """
    Download all raw course files for course and session into input_dir.
    :param job_config: MorfJobConfig object.
    :param bucket: bucket containing raw data.
    :param course: id of course to download data for.
    :param session: id of session to download data for.
    :param input_dir: input directory.
    :param data_dir: directory in bucket that contains course-level data.
    :param course_date_file_name: name of csv file in bucket which contains course start/end dates.
    :return: None
    """
    s3 = job_config.initialize_s3()
    logger = set_logger_handlers(module_logger, job_config)
    course_date_file_url = "s3://{}/{}/{}".format(bucket, data_dir, course_date_file_name)
    session_input_dir = os.path.join(input_dir, course, session)
    os.makedirs(session_input_dir)
    bucket_resource = boto3.resource("s3",
                                     aws_access_key_id=job_config.aws_access_key_id,
                                     aws_secret_access_key=job_config.aws_secret_access_key).Bucket(bucket)
    for obj in bucket_resource.objects.filter(Prefix="{}/{}/{}/".format(data_dir, course, session)):
        # strip whitespace, parens, and other shell-unfriendly characters from the filename
        filename = obj.key.split("/")[-1]
        filename = re.sub(r'[\s\(\)":!&]', "", filename)
        filepath = os.path.join(session_input_dir, filename)
        try:
            with open(filepath, "wb") as resource:
                s3.download_fileobj(bucket, obj.key, resource)
        except Exception:
            logger.warning("skipping empty object in bucket {} key {}".format(bucket, obj.key))
            continue
    dates_bucket = get_bucket_from_url(course_date_file_url)
    dates_key = get_key_from_url(course_date_file_url)
    dates_file = dates_key.split("/")[-1]
    s3.download_file(dates_bucket, dates_key, os.path.join(session_input_dir, dates_file))
    return

def cross_validate_course(label_type, k=5, multithread=True):
    """
    Compute k-fold cross-validation across courses.
    :param label_type: type of outcome label to use.
    :param k: number of folds.
    :param multithread: whether to run the job in parallel.
    :return: None
    """
    # todo: call to create_course_folds() goes here
    job_config = MorfJobConfig(CONFIG_FILENAME)
    job_config.update_mode(mode)
    # clear previous test results
    clear_s3_subdirectory(job_config, mode="test")
    docker_image_dir = os.getcwd()  # directory the function is called from; should contain the docker image
    logger = set_logger_handlers(module_logger, job_config)
    if multithread:
        num_cores = job_config.max_num_cores
    else:
        num_cores = 1
    logger.info("conducting cross validation")
    for raw_data_bucket in job_config.raw_data_buckets:
        reslist = []
        with Pool(num_cores) as pool:
            for course in fetch_complete_courses(job_config, raw_data_bucket):
                for fold_num in range(1, k + 1):
                    poolres = pool.apply_async(execute_image_for_cv, [job_config, raw_data_bucket, course,
                                                                      fold_num, docker_image_dir, label_type])
                    reslist.append(poolres)
            pool.close()
            pool.join()
        for res in reslist:
            logger.info(res.get())
    test_csv_fp = collect_course_cv_results(job_config)
    pred_key = make_s3_key_path(job_config, os.path.basename(test_csv_fp), mode="test")
    upload_file_to_s3(test_csv_fp, job_config.proc_data_bucket, pred_key, job_config, remove_on_success=True)
    return

def fetch_file(s3, dest_dir, remote_file_url, dest_filename=None, job_config=None):
    """
    Fetch a remote file into dest_dir.
    :param s3: boto3.client object for s3 connection.
    :param dest_dir: directory to download file to (string).
    :param remote_file_url: url of remote file; must be in file://, s3://, or https:// format (string).
    :param dest_filename: base name of file to use (otherwise defaults to current file name) (string).
    :param job_config: MorfJobConfig object; used for logging.
    :return: path to the fetched file (string).
    """
    logger = set_logger_handlers(module_logger, job_config)
    logger.info("retrieving file {} to {}".format(remote_file_url, dest_dir))
    try:
        if not dest_filename:
            dest_filename = os.path.basename(remote_file_url)
        url = urlparse(remote_file_url)
        if url.scheme == "file":
            shutil.copyfile(url.path, os.path.join(dest_dir, dest_filename))
        elif url.scheme == "s3":
            bucket = url.netloc
            key = url.path[1:]  # ignore initial /
            download_from_s3(bucket, key, s3, dest_dir, dest_filename=dest_filename)
        elif url.scheme == "https":
            urllib.request.urlretrieve(remote_file_url, os.path.join(dest_dir, dest_filename))
        else:
            logger.error("A URL which was not s3:// or file:// or https:// was passed in for a file location; "
                         "this is not supported. {}".format(remote_file_url))
            sys.exit(-1)
    except Exception as e:
        logger.error("{} when attempting to fetch and copy file at {}".format(e, remote_file_url))
    # return the local path so callers (e.g. upload_files_to_zenodo) can use the fetched file directly
    return os.path.join(dest_dir, dest_filename)

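# Tiny illustrative sketch (hypothetical helper) of the s3 URL splitting used in fetch_file above:
# the bucket comes from the netloc and the key from the path with its leading "/" dropped.
def _example_split_s3_url(remote_file_url="s3://my-bucket/some/prefix/file.csv"):
    url = urlparse(remote_file_url)
    bucket = url.netloc   # "my-bucket"
    key = url.path[1:]    # "some/prefix/file.csv"
    return bucket, key
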
def upload_file_to_s3(file, bucket, key, job_config=None, remove_on_success=False):
    """
    Upload file to bucket + key in S3.
    :param file: name or path to file.
    :param bucket: bucket to upload to.
    :param key: key to upload to in bucket.
    :param job_config: MorfJobConfig object; used for logging.
    :param remove_on_success: if True, delete the local file after a successful upload.
    :return: None
    """
    logger = set_logger_handlers(module_logger, job_config)
    session = boto3.Session()
    s3_client = session.client("s3")
    tc = boto3.s3.transfer.TransferConfig()
    t = boto3.s3.transfer.S3Transfer(client=s3_client, config=tc)
    logger.info("uploading {} to s3://{}/{}".format(file, bucket, key))
    try:
        t.upload_file(file, bucket, key)
        if remove_on_success:
            os.remove(file)
    except Exception as e:
        logger.warning("error uploading {} to s3: {}".format(file, e))
    return

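# A minimal sketch (assumed values, not the project's configuration) of how the TransferConfig used in
# upload_file_to_s3 could be tuned for large files; the thresholds below are illustrative only.
def _example_tuned_transfer_config():
    return boto3.s3.transfer.TransferConfig(
        multipart_threshold=64 * 1024 * 1024,  # switch to multipart uploads above 64 MB
        max_concurrency=4)                     # number of threads used for multipart transfers
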
def train_session(label_type, raw_data_dir="morf-data/", multithread=True):
    """
    Train one model per session of the course using the Docker image.
    :param label_type: label type provided by user.
    :param raw_data_dir: path to directory in all data buckets where course-level directories are located;
        this should be uniform for every raw data bucket.
    :param multithread: whether to run the job in parallel (multithread=False can be useful for debugging).
    :return: None
    """
    level = "session"
    job_config = MorfJobConfig(CONFIG_FILENAME)
    job_config.update_mode(mode)
    logger = set_logger_handlers(module_logger, job_config)
    check_label_type(label_type)
    # clear any preexisting data for this user/job/mode
    clear_s3_subdirectory(job_config)
    if multithread:
        num_cores = job_config.max_num_cores
    else:
        num_cores = 1
    # for each bucket, call job_runner once per session with --mode=train and --level=session
    for raw_data_bucket in job_config.raw_data_buckets:
        logger.info("processing bucket {}".format(raw_data_bucket))
        courses = fetch_complete_courses(job_config, raw_data_bucket, raw_data_dir)
        reslist = []
        with Pool(num_cores) as pool:
            for course in courses:
                for session in fetch_sessions(job_config, raw_data_bucket, raw_data_dir, course):
                    poolres = pool.apply_async(run_image, [job_config, raw_data_bucket, course, session,
                                                           level, label_type])
                    reslist.append(poolres)
            pool.close()
            pool.join()
        for res in reslist:
            logger.info(res.get())
    send_email_alert(job_config)
    return

def generate_zenodo_metadata(job_config, deposition_id):
    """
    Create metadata for a MORF job and attach it to the Zenodo deposition.
    :param job_config: MorfJobConfig object; must provide morf_id, user_id, and zenodo_access_token.
    :param deposition_id: id of the Zenodo deposition to update.
    :return: None
    """
    logger = set_logger_handlers(module_logger, job_config)
    data = {
        'metadata': {
            'title': 'MORF job id {}'.format(job_config.morf_id),
            'upload_type': 'software',
            'description': 'Job files for job id {} from the MOOC Replication Framework'.format(job_config.morf_id),
            'creators': [{'name': '{}'.format(job_config.user_id), 'affiliation': 'None'}]
        }
    }
    headers = {"Content-Type": "application/json"}
    r = requests.put('https://zenodo.org/api/deposit/depositions/%s' % deposition_id,
                     params={'access_token': getattr(job_config, "zenodo_access_token")},
                     data=json.dumps(data), headers=headers)
    logger.info(r.json())
    return

def fetch_from_cache(job_config, cache_file_path, dest_dir):
    """
    Fetch a file from the cache for job_config into dest_dir, if it exists.
    :param job_config: MorfJobConfig object.
    :param cache_file_path: string, relative path to file in cache (this is identical to the directory path
        in s3; e.g. "bucket/path/to/somefile.csv").
    :param dest_dir: absolute path of directory to fetch file into (will be created if it does not exist).
    :return: path to fetched file (string); returns None if the cache is not used or the file is missing.
    """
    logger = set_logger_handlers(module_logger, job_config)
    logger.info("fetching file {} from cache".format(cache_file_path))
    cache_dir = getattr(job_config, "cache_dir", None)
    # guard against a missing cache_dir before building the path; os.path.join(None, ...) would raise TypeError
    abs_cache_file_path = os.path.join(cache_dir, cache_file_path) if cache_dir else None
    if abs_cache_file_path and os.path.exists(abs_cache_file_path):
        if not os.path.exists(dest_dir):
            os.makedirs(dest_dir)
        dest_fp = shutil.copy(abs_cache_file_path, dest_dir)
    else:
        logger.warning("file {} does not exist in cache".format(abs_cache_file_path))
        dest_fp = None
    return dest_fp