def update_modeling_state():
    status = utils.make_running_traininfo(modeling_set["run_folder"],
                                          start,
                                          modeling_set["listener"])
    utils.write_model_status(modeling_set, status)
def update_one_preprocessing_state(modeling_set):
    status = utils.make_running_traininfo(modeling_set["run_folder"],
                                          start,
                                          (preprocessing_listener, modeling_set["listener"]))
    utils.write_model_status(modeling_set, status)
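
# ---------------------------------------------------------------------------
# Illustrative sketch only. The real `merge_listeners` and
# `utils.make_running_traininfo` helpers are assumed to combine the shared
# preprocessing listener with each model's own listener into a single progress
# structure; the function name, the listener snapshot format and the payload
# fields below are assumptions for illustration, not the actual implementation.
def _example_merged_progress_sketch():
    import time

    # Hypothetical listener snapshots: lists of (state, finished) pairs.
    preprocessing_steps = [("LOADING_SRC", True), ("COLLECTING", True)]
    training_steps = [("FITTING", False), ("SCORING", False)]

    merged = preprocessing_steps + training_steps
    done = sum(1 for _, finished in merged if finished)

    # Assumed shape of the status document that the update helpers pass to
    # utils.write_model_status while a model is still running.
    return {
        "modelId": "example-model-id",
        "state": "RUNNING",
        "startTime": int(time.time() * 1000),
        "progress": {"steps": merged, "completed": done, "total": len(merged)},
    }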
def train_clustering_models_nosave(split_desc, preprocessing_set):
    """Regular (mode 1) train:
        - Non-streamed single split + fit preprocess on train + preprocess test
        - Fit N models sequentially
            - Fit
            - Save clf
            - Compute and save clf performance
            - Score, save scored test set + scored performance
    """
    start = unix_time_millis()
    preprocessing_listener = ProgressListener()
    preprocessing_params = preprocessing_set["preprocessing_params"]
    modeling_sets = preprocessing_set["modelingSets"]

    # Fill all the listeners ASAP to have correct progress data
    preprocessing_listener.add_future_steps(constants.CLUSTERING_REGULAR_PREPROCESSING_STATES)
    for modeling_set in modeling_sets:
        listener = ProgressListener()
        listener.add_future_steps(constants.ALL_CLUSTERING_TRAIN_STATES)
        modeling_set["listener"] = listener

    # Called by the preprocessing pipeline to update the state
    # of each model and dump it to disk
    def update_preprocessing_state():
        for modeling_set in modeling_sets:
            status = {
                "modelId": modeling_set["modelId"],
                "state": "RUNNING",
                "startTime": start,
                "progress": merge_listeners(preprocessing_listener, modeling_set["listener"])
            }
            utils.write_model_status(modeling_set, status)

    logging.info("START TRAIN: " + preprocessing_set["description"])

    with preprocessing_listener.push_state(constants.STATE_LOADING_SRC):
        update_preprocessing_state()
        source_df = df_from_split_desc(split_desc, "full", preprocessing_params["per_feature"])
        logging.info("Loaded source df: shape=(%d,%d)" % source_df.shape)

    with preprocessing_listener.push_state(constants.STATE_COLLECTING):
        update_preprocessing_state()
        collector = ClusteringPreprocessingDataCollector(source_df, preprocessing_params)
        collector_data = collector.build()

    preproc_handler = ClusteringPreprocessingHandler({},
                                                     preprocessing_set["preprocessing_params"],
                                                     preprocessing_set["run_folder"])
    preproc_handler.collector_data = collector_data
    pipeline = preproc_handler.build_preprocessing_pipeline()

    with preprocessing_listener.push_state(constants.STATE_PREPROCESS_SRC):
        update_preprocessing_state()
        source_df_index = source_df.index.copy()
        # TODO: fit_and_process should take an update_fn argument
        transformed_source = pipeline.fit_and_process(source_df)

    # Save fitted resources and collector data
    preproc_handler.save_data()

    # Report on work
    report = {}
    pipeline.report_fit(report, {})
    utils.write_preproc_file(preprocessing_set["run_folder"], "preprocessing_report.json", report)

    update_preprocessing_state()
    preprocessing_end = unix_time_millis()

    for modeling_set in modeling_sets:
        model_start = unix_time_millis()

        # Called during training to update this model's state and dump it to disk
        def update_modeling_state():
            status = utils.make_running_traininfo(modeling_set["run_folder"],
                                                  start,
                                                  (preprocessing_listener, modeling_set["listener"]))
            utils.write_model_status(modeling_set, status)

        clustering_train_score_save(transformed_source,
                                    source_df_index,
                                    preprocessing_set["preprocessing_params"],
                                    modeling_set["modelingParams"],
                                    modeling_set["run_folder"],
                                    modeling_set["listener"],
                                    update_modeling_state,
                                    pipeline)

        model_end = end = unix_time_millis()

        # Write the final model training info
        status = {
            "modelId": modeling_set["modelId"],
            "state": "DONE",
            "startTime": start,
            "endTime": end,
            "preprocessingTime": preprocessing_end - start,
            "trainingTime": model_end - model_start,
            "progress": merge_listeners(preprocessing_listener, modeling_set["listener"])
        }
        utils.write_model_status(modeling_set, status)

    return "ok"
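
# ---------------------------------------------------------------------------
# Minimal sketch of the inputs this entry point expects, inferred from the keys
# accessed in train_clustering_models_nosave above. The concrete values, paths
# and any field not read by the code (e.g. the exact schema of split_desc or
# modelingParams) are assumptions for illustration only.
def _example_train_clustering_inputs_sketch():
    split_desc = {
        # Consumed by df_from_split_desc(split_desc, "full", per_feature);
        # its real schema is defined elsewhere and only assumed here.
        "format": "csv",
        "fullPath": "/path/to/split/full.csv",
    }
    preprocessing_set = {
        "description": "clustering on customer features",
        "run_folder": "/path/to/run/preprocessing",
        "preprocessing_params": {
            "per_feature": {"age": {"type": "NUMERIC"}},
        },
        "modelingSets": [
            {
                "modelId": "kmeans-1",
                "modelingParams": {"algorithm": "KMEANS", "k": 5},
                "run_folder": "/path/to/run/kmeans-1",
            }
        ],
    }
    # train_clustering_models_nosave(split_desc, preprocessing_set) would then
    # preprocess the full split once and fit/score each modeling set in turn.
    return split_desc, preprocessing_set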