Example #1
# Nested helper: `modeling_sets`, `start`, `preprocessing_listener` and
# `merge_listeners` are closed over from the enclosing training function
# (see Example #4).
def update_preprocessing_state():
    for modeling_set in modeling_sets:
        status = {
            "modelId": modeling_set["modelId"],
            "state": "RUNNING",
            "startTime": start,
            "progress": merge_listeners(preprocessing_listener, modeling_set["listener"])
        }
        utils.write_model_status(modeling_set, status)
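merge_listeners itself is not shown in the source. A minimal sketch of what it could look like, assuming each ProgressListener exposes its recorded steps as a list of dicts (the `states` attribute name is a guess, not from the source):

def merge_listeners(*listeners):
    # Hypothetical helper: concatenate the recorded steps of several
    # ProgressListener objects into one progress list for the status JSON.
    merged = []
    for listener in listeners:
        merged.extend(listener.states)
    return merged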
Example #2
# Per-model variant: `modeling_set` and `start` are closed over from the
# enclosing scope (see the per-model loop in Example #4).
def update_modeling_state():
    status = utils.make_running_traininfo(
        modeling_set["run_folder"], start,
        modeling_set["listener"])
    utils.write_model_status(modeling_set, status)
Example #3
# Same idea for a single preprocessing pass: the model's own listener is
# paired with the shared preprocessing listener in a tuple.
def update_one_preprocessing_state(modeling_set):
    status = utils.make_running_traininfo(modeling_set["run_folder"], start,
                                          (preprocessing_listener, modeling_set["listener"]))
    utils.write_model_status(modeling_set, status)
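Examples #2 and #3 pass either a single listener or a tuple of listeners, which suggests make_running_traininfo normalizes both call shapes. A hedged sketch under that assumption, reusing the merge_listeners sketch above and emitting the same fields Example #1 builds inline (the normalization logic and the unused `run_folder` parameter are assumptions):

def make_running_traininfo(run_folder, start, listeners):
    # Assumption: accept either one listener or a tuple/list of listeners
    # and normalize to a list; `run_folder` is unused in this sketch.
    if not isinstance(listeners, (list, tuple)):
        listeners = [listeners]
    return {
        "state": "RUNNING",
        "startTime": start,
        "progress": merge_listeners(*listeners),
    }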
Example #4
def train_clustering_models_nosave(
                            split_desc,
                            preprocessing_set):
    """Regular (mode 1) train:
      - Non streamed single split + fit preprocess on train + preprocess test
      - Fit N models sequentially
         - Fit
         - Save clf
         - Compute and save clf performance
         - Score, save scored test set + scored performnace
    """

    start = unix_time_millis()
    preprocessing_listener = ProgressListener()
    preprocessing_params = preprocessing_set["preprocessing_params"]
    modeling_sets = preprocessing_set["modelingSets"]

    # Fill all the listeners ASAP to have correct progress data
    preprocessing_listener.add_future_steps(constants.CLUSTERING_REGULAR_PREPROCESSING_STATES)
    for modeling_set in modeling_sets:
        listener = ProgressListener()
        listener.add_future_steps(constants.ALL_CLUSTERING_TRAIN_STATES)
        modeling_set["listener"] = listener

    # Called by the preprocessing pipeline to update the state
    # of each model and dump it to disk
    def update_preprocessing_state():
        for modeling_set in modeling_sets:
            status = {
                "modelId" : modeling_set["modelId"],
                "state": "RUNNING",
                "startTime": start,
                "progress" : merge_listeners(preprocessing_listener, modeling_set["listener"])
            }
            utils.write_model_status(modeling_set, status)

    logging.info("START TRAIN :" + preprocessing_set["description"])
    preprocessing_params = preprocessing_set["preprocessing_params"]

    with preprocessing_listener.push_state(constants.STATE_LOADING_SRC):
        update_preprocessing_state()
        source_df = df_from_split_desc(split_desc, "full", preprocessing_params["per_feature"])

        logging.info("Loaded source df: shape=(%d,%d)" % source_df.shape)

    with preprocessing_listener.push_state(constants.STATE_COLLECTING):
        update_preprocessing_state()
        collector = ClusteringPreprocessingDataCollector(source_df, preprocessing_params)
        collector_data = collector.build()

    preproc_handler = ClusteringPreprocessingHandler({},
                        preprocessing_params,
                        preprocessing_set["run_folder"])

    preproc_handler.collector_data = collector_data
    pipeline = preproc_handler.build_preprocessing_pipeline()

    with preprocessing_listener.push_state(constants.STATE_PREPROCESS_SRC):
        update_preprocessing_state()
        source_df_index = source_df.index.copy()
        # TODO: fit_and_process should take an update_fn argument
        transformed_source = pipeline.fit_and_process(source_df)
        # Saves fitted resources and collector data
        preproc_handler.save_data()
        # Report on work
        report = {}
        pipeline.report_fit(report, {})
        utils.write_preproc_file(preprocessing_set["run_folder"], "preprocessing_report.json", report)

    update_preprocessing_state()

    preprocessing_end = unix_time_millis()

    for modeling_set in modeling_sets:
        model_start = unix_time_millis()
        def update_modeling_state():
            status = utils.make_running_traininfo(modeling_set["run_folder"], start,
                                                  (preprocessing_listener, modeling_set["listener"]))
            utils.write_model_status(modeling_set, status)
        # Pass this model's own listener: the bare `listener` name would
        # leak the last value of the setup loop above.
        clustering_train_score_save(transformed_source, source_df_index,
                                    preprocessing_params,
                                    modeling_set["modelingParams"],
                                    modeling_set["run_folder"],
                                    modeling_set["listener"],
                                    update_modeling_state,
                                    pipeline)

        model_end = end = unix_time_millis()

        # Write the final model training info
        status = {
            "modelId": modeling_set["modelId"],
            "state": "DONE",
            "startTime": start,
            "endTime": end,
            "preprocessingTime": preprocessing_end - start,
            "trainingTime": model_end - model_start,
            "progress": merge_listeners(preprocessing_listener, modeling_set["listener"])
        }
        utils.write_model_status(modeling_set, status)

    return "ok"