Example No. 1
def test_initialize_tracking_and_get_run_id(db_engine_with_results_schema):
    experiment = ExperimentFactory()
    factory_session.commit()
    experiment_hash = experiment.experiment_hash
    run_id = initialize_tracking_and_get_run_id(
        experiment_hash=experiment_hash,
        experiment_class_path="mymodule.MyClassName",
        random_seed=1234,
        experiment_kwargs={"key": "value"},
        db_engine=db_engine_with_results_schema,
    )
    assert run_id
    with scoped_session(db_engine_with_results_schema) as session:
        experiment_run = session.query(TriageRun).get(run_id)
        assert experiment_run.run_hash == experiment_hash
        assert experiment_run.experiment_class_path == "mymodule.MyClassName"
        assert experiment_run.random_seed == 1234
        assert experiment_run.experiment_kwargs == {"key": "value"}
    new_run_id = initialize_tracking_and_get_run_id(
        experiment_hash=experiment_hash,
        experiment_class_path="mymodule.MyClassName",
        random_seed=5432,
        experiment_kwargs={"key": "value"},
        db_engine=db_engine_with_results_schema,
    )
    assert new_run_id > run_id
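All of these snippets route database access through scoped_session. A helper like this is conventionally the classic commit-or-rollback session context manager; a minimal sketch of that pattern (an assumption about its shape, not the library's exact code):

from contextlib import contextmanager
from sqlalchemy.orm import sessionmaker

@contextmanager
def scoped_session(db_engine):
    """Yield a session that commits on success, rolls back on error, and always closes."""
    session = sessionmaker(bind=db_engine)()
    try:
        yield session
        session.commit()
    except Exception:
        session.rollback()
        raise
    finally:
        session.close()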
Example No. 2
    def _needs_ranks(self, model_id, matrix_uuid, matrix_type):
        if self.replace:
            logger.info("Replace flag set, will compute and store ranks regardless")
            return True
        with scoped_session(self.db_engine) as session:
            # if the metadata is different (e.g. they changed the rank order)
            # or there are any null ranks we need to rank
            metadata_matches = session.query(session.query(matrix_type.prediction_metadata_obj).filter_by(
                model_id=model_id,
                matrix_uuid=matrix_uuid,
                tiebreaker_ordering=self.rank_order,
            ).exists()).scalar()
            if not metadata_matches:
                logger.debug("Prediction metadata does not match what is in configuration"
                              ", will compute and store ranks")
                return True

            any_nulls_in_ranks = session.query(session.query(matrix_type.prediction_obj)
                .filter(
                    matrix_type.prediction_obj.model_id == model_id,
                    matrix_type.prediction_obj.matrix_uuid == matrix_uuid,
                    or_(
                        matrix_type.prediction_obj.rank_abs_no_ties == None,
                        matrix_type.prediction_obj.rank_abs_with_ties == None,
                        matrix_type.prediction_obj.rank_pct_no_ties == None,
                        matrix_type.prediction_obj.rank_pct_with_ties == None,
                    )
                ).exists()).scalar()
            if any_nulls_in_ranks:
                logger.debug("At least one null in rankings in predictions table",
                              ", will compute and store ranks")
                return True
        logger.debug("No need to recompute prediction ranks")
        return False
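Both checks above use the nested-query EXISTS idiom: wrap an inner query's .exists() in an outer session.query(...) and call .scalar(), so the database returns a single boolean instead of fetching rows. A self-contained sketch with a stand-in SQLite model (not triage's actual schema):

import sqlalchemy as sa
from sqlalchemy.orm import Session, declarative_base

Base = declarative_base()

class Prediction(Base):
    __tablename__ = "predictions"
    id = sa.Column(sa.Integer, primary_key=True)
    model_id = sa.Column(sa.Integer)
    rank_abs_no_ties = sa.Column(sa.Integer, nullable=True)

engine = sa.create_engine("sqlite://")
Base.metadata.create_all(engine)

with Session(engine) as session:
    session.add(Prediction(model_id=1, rank_abs_no_ties=None))
    session.commit()
    any_nulls = session.query(
        session.query(Prediction)
        .filter(Prediction.model_id == 1, Prediction.rank_abs_no_ties == None)  # noqa: E711
        .exists()
    ).scalar()
    print(any_nulls)  # True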
Example No. 3
def test_increment_field(db_engine_with_results_schema):
    experiment_run = ExperimentRunFactory()
    factory_session.commit()
    increment_field('matrices_made', experiment_run.run_id, db_engine_with_results_schema)
    increment_field('matrices_made', experiment_run.run_id, db_engine_with_results_schema)

    with scoped_session(db_engine_with_results_schema) as session:
        experiment_run_from_db = session.query(ExperimentRun).get(experiment_run.run_id)
        assert experiment_run_from_db.matrices_made == 2
Example No. 4
def test_get_run_for_update(db_engine_with_results_schema):
    experiment_run = TriageRunFactory()
    factory_session.commit()
    with get_run_for_update(db_engine=db_engine_with_results_schema,
                            run_id=experiment_run.run_id) as run_obj:
        run_obj.stacktrace = "My stacktrace"

    with scoped_session(db_engine_with_results_schema) as session:
        experiment_run_from_db = session.query(TriageRun).get(
            experiment_run.run_id)
        assert experiment_run_from_db.stacktrace == "My stacktrace"
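get_run_for_update hands the test a live ORM row and persists whatever was mutated once the with-block exits. Its implementation is not shown here; a minimal sketch of how such a context manager can be built on top of the scoped_session helper sketched earlier (an assumption, not the library's code):

from contextlib import contextmanager

@contextmanager
def get_run_for_update(db_engine, run_id):
    with scoped_session(db_engine) as session:
        # the yielded object stays attached to the session, so attribute
        # mutations are flushed when scoped_session commits on exit
        yield session.query(TriageRun).get(run_id)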
Example No. 5
def test_experiment_tracker_in_parts(test_engine, project_path):
    experiment = SingleThreadedExperiment(
        config=sample_config(),
        db_engine=test_engine,
        project_path=project_path,
    )
    experiment.generate_matrices()
    experiment.train_and_test_models()
    with scoped_session(test_engine) as session:
        experiment_run = session.query(ExperimentRun).get(experiment.run_id)
        assert experiment_run.start_method == "generate_matrices"
Example No. 6
def test_experiment_tracker_in_parts(test_engine, project_path):
    with mock.patch("triage.util.conf.open",
                    side_effect=open_side_effect) as mock_file:
        experiment = SingleThreadedExperiment(
            config=sample_config(),
            db_engine=test_engine,
            project_path=project_path,
        )
    experiment.generate_matrices()
    experiment.train_and_test_models()
    with scoped_session(test_engine) as session:
        experiment_run = session.query(TriageRun).get(experiment.run_id)
        assert experiment_run.start_method == "generate_matrices"
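This variant patches triage.util.conf.open with an open_side_effect helper whose definition is not shown. A hypothetical minimal version simply delegates to the real built-in, which keeps config reads working while letting the test inspect mock_file:

import builtins
from unittest import mock

_real_open = builtins.open

def open_side_effect(*args, **kwargs):
    # hypothetical stand-in: pass reads through to the real filesystem
    return _real_open(*args, **kwargs)

with mock.patch("triage.util.conf.open", side_effect=open_side_effect) as mock_file:
    ...  # construct the experiment; mock_file records every config read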
Example No. 7
def test_experiment_tracker_exception(db_engine, project_path):
    experiment = SingleThreadedExperiment(
        config=sample_config(),
        db_engine=db_engine,
        project_path=project_path,
    )
    # no source data means this should blow up
    with pytest.raises(Exception):
        experiment.run()

    with scoped_session(db_engine) as session:
        experiment_run = session.query(ExperimentRun).get(experiment.run_id)
        assert experiment_run.current_status == ExperimentRunStatus.failed
        assert isinstance(experiment_run.last_updated_time, datetime.datetime)
        assert experiment_run.stacktrace
Example No. 8
    def _write_to_db(
        self,
        model_id,
        subset_hash,
        evaluation_start_time,
        evaluation_end_time,
        as_of_date_frequency,
        matrix_uuid,
        evaluations,
        evaluation_table_obj,
    ):
        """Write evaluation objects to the database
        Binds the model_id as as_of_date to the given ORM objects
        and writes them to the database
        Args:
            model_id (int) primary key of the model
            subset_hash (str) the hash of the subset, if any, that the
                evaluation is made on
            evaluation_start_time (pandas._libs.tslibs.timestamps.Timestamp)
                first as_of_date included in the evaluation period
            evaluation_end_time (pandas._libs.tslibs.timestamps.Timestamp) last
                as_of_date included in the evaluation period
            as_of_date_frequency (str) the frequency with which as_of_dates
                occur between the evaluation_start_time and evaluation_end_time
            evaluations (list) results_schema.TestEvaluation or TrainEvaluation
                objects
            evaluation_table_obj (schema.TestEvaluation or TrainEvaluation)
                specifies to which table to add the evaluations
        """
        with scoped_session(self.db_engine) as session:
            session.query(evaluation_table_obj).filter_by(
                model_id=model_id,
                evaluation_start_time=evaluation_start_time,
                evaluation_end_time=evaluation_end_time,
                as_of_date_frequency=as_of_date_frequency,
                subset_hash=subset_hash
            ).delete()

            for evaluation in evaluations:
                evaluation.model_id = model_id
                evaluation.as_of_date_frequency = as_of_date_frequency
                evaluation.subset_hash = subset_hash
                evaluation.evaluation_start_time = evaluation_start_time
                evaluation.evaluation_end_time = evaluation_end_time
                evaluation.matrix_uuid = matrix_uuid
                session.add(evaluation)
Example No. 9
def test_experiment_tracker_exception(db_engine, project_path):
    with mock.patch("triage.util.conf.open",
                    side_effect=open_side_effect) as mock_file:
        experiment = SingleThreadedExperiment(
            config=sample_config(),
            db_engine=db_engine,
            project_path=project_path,
        )
    # no source data means this should blow up
    with pytest.raises(Exception):
        experiment.run()

    with scoped_session(db_engine) as session:
        experiment_run = session.query(TriageRun).get(experiment.run_id)
        assert experiment_run.current_status == TriageRunStatus.failed
        assert isinstance(experiment_run.last_updated_time, datetime.datetime)
        assert experiment_run.stacktrace
Example No. 10
def initialize_tracking_and_get_run_id(experiment_hash, experiment_class_path,
                                       random_seed, experiment_kwargs,
                                       db_engine):
    """Create a row in the TriageRun table with some initial info and return the created run_id

    Args:
        experiment_hash (str) An experiment hash that exists in the experiments table
        experiment_class_path (str) The name of the experiment subclass used
        random_seed (int) Random seed used to run the experiment
        experiment_kwargs (dict) Any runtime Experiment keyword arguments that should be saved
        db_engine (sqlalchemy.engine)
    """
    # Any experiment kwargs that are types (e.g. MatrixStorageClass) can't
    # be serialized, so just use the class name if so
    cleaned_experiment_kwargs = {
        k: (classpath(v) if isinstance(v, type) else v)
        for k, v in experiment_kwargs.items()
    }
    run = TriageRun(
        start_time=datetime.datetime.now(),
        git_hash=infer_git_hash(),
        triage_version=infer_triage_version(),
        python_version=infer_python_version(),
        run_type="experiment",
        run_hash=experiment_hash,
        last_updated_time=datetime.datetime.now(),
        current_status=TriageRunStatus.started,
        installed_libraries=infer_installed_libraries(),
        platform=platform.platform(),
        os_user=getpass.getuser(),
        working_directory=os.getcwd(),
        ec2_instance_type=infer_ec2_instance_type(),
        log_location=infer_log_location(),
        experiment_class_path=experiment_class_path,
        random_seed=random_seed,
        experiment_kwargs=cleaned_experiment_kwargs,
    )
    run_id = None
    with scoped_session(db_engine) as session:
        session.add(run)
        session.commit()
        run_id = run.run_id
    if not run_id:
        raise ValueError("Failed to retrieve run_id from saved row")
    return run_id
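The kwargs-cleaning step exists because class objects (e.g. a matrix storage class) are not JSON-serializable, so they are stored by import path instead. A standalone sketch of the same comprehension, using a hypothetical classpath helper (triage's real helper may differ):

def classpath(cls):
    # hypothetical stand-in: dotted import path of a class
    return f"{cls.__module__}.{cls.__qualname__}"

experiment_kwargs = {"matrix_storage_class": dict, "n_processes": 4}
cleaned = {
    k: (classpath(v) if isinstance(v, type) else v)
    for k, v in experiment_kwargs.items()
}
print(cleaned)  # {'matrix_storage_class': 'builtins.dict', 'n_processes': 4}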
Example No. 11
def increment_field(field, run_id, db_engine):
    """Increment an ExperimentRun's named field.

    Expects that the field is an integer in the database.

    Will also kick the last_updated_time timestamp.

    Args:
        field (str) The name of the field
        run_id (int) The identifier/primary key of the run
        db_engine (sqlalchemy.engine)
    """
    with scoped_session(db_engine) as session:
        # Use an update query instead of a session merge so it happens in one
        # atomic query and protects against race conditions
        session.query(ExperimentRun).filter_by(run_id=run_id).update({
            field: getattr(ExperimentRun, field) + 1,
            'last_updated_time': datetime.datetime.now(),
        })
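Because the value passed to update() is the SQL expression getattr(ExperimentRun, field) + 1 rather than a number read into Python, the increment runs server-side as UPDATE ... SET field = field + 1, so two concurrent workers cannot lose an update. For contrast, a racy read-modify-write version of the same operation (illustrative only, reusing the ExperimentRun model and scoped_session helper from above):

with scoped_session(db_engine) as session:
    run = session.query(ExperimentRun).get(run_id)
    # value is computed in Python: two workers can both read 2 and both write 3
    run.matrices_made = run.matrices_made + 1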
Example No. 12
def test_initialize_tracking_and_get_run_id(db_engine_with_results_schema):
    experiment = ExperimentFactory()
    factory_session.commit()
    experiment_hash = experiment.experiment_hash
    run_id = initialize_tracking_and_get_run_id(
        experiment_hash=experiment_hash,
        experiment_class_path='mymodule.MyClassName',
        experiment_kwargs={'key': 'value'},
        db_engine=db_engine_with_results_schema
    )
    assert run_id
    with scoped_session(db_engine_with_results_schema) as session:
        experiment_run = session.query(ExperimentRun).get(run_id)
        assert experiment_run.experiment_hash == experiment_hash
        assert experiment_run.experiment_class_path == 'mymodule.MyClassName'
        assert experiment_run.experiment_kwargs == {'key': 'value'}
    new_run_id = initialize_tracking_and_get_run_id(
        experiment_hash=experiment_hash,
        experiment_class_path='mymodule.MyClassName',
        experiment_kwargs={'key': 'value'},
        db_engine=db_engine_with_results_schema
    )
    assert new_run_id > run_id
Example No. 13
    def _write_audit_to_db(self, model_id, protected_df, predictions_proba,
                           labels, tie_breaker, subset_hash, matrix_type,
                           evaluation_start_time, evaluation_end_time,
                           matrix_uuid):
        """
        Runs the bias audit and saves the result in the bias table.

        Args:
            model_id (int) primary key of the model
            protected_df (pandas.DataFrame) a dataframe of protected group attributes
            predictions_proba (np.array) list of prediction probabilities
            labels (pandas.Series) list of labels
            tie_breaker (str) 'best' or 'worst' case tiebreaking rule that the
                predictions and labels were sorted by
            subset_hash (str) the hash of the subset, if any, that the
                evaluation is made on
            matrix_type (triage.component.catwalk.storage.MatrixType)
                the type of matrix used
            evaluation_start_time (pandas._libs.tslibs.timestamps.Timestamp)
                first as_of_date included in the evaluation period
            evaluation_end_time (pandas._libs.tslibs.timestamps.Timestamp) last
                as_of_date included in the evaluation period
            matrix_uuid (str) the uuid of the matrix
        """
        if protected_df.empty:
            return

        # To preprocess, aequitas requires the following columns:
        # score, label_value, model_id, and the protected attributes.
        # Fill out protected_df, which at this point holds only the protected attributes.
        protected_df = protected_df.copy()
        protected_df['model_id'] = model_id
        protected_df['score'] = predictions_proba
        protected_df['label_value'] = labels
        aequitas_df, attr_cols_input = preprocess_input_df(protected_df)

        # create group crosstabs
        g = Group()
        score_thresholds = {}
        score_thresholds['rank_abs'] = self.bias_config['thresholds'].get(
            'top_n', [])
        # convert 0-100 percentile to 0-1 that Aequitas expects
        score_thresholds['rank_pct'] = [
            value / 100.0
            for value in self.bias_config['thresholds'].get('percentiles', [])
        ]
        groups_model, attr_cols = g.get_crosstabs(
            aequitas_df,
            score_thresholds=score_thresholds,
            attr_cols=attr_cols_input)
        # analyze bias from reference groups
        bias = Bias()
        ref_groups_method = self.bias_config.get('ref_groups_method', None)
        if ref_groups_method == 'predefined' and self.bias_config['ref_groups']:
            bias_df = bias.get_disparity_predefined_groups(
                groups_model, aequitas_df, self.bias_config['ref_groups'])
        elif ref_groups_method == 'majority':
            bias_df = bias.get_disparity_major_group(groups_model, aequitas_df)
        else:
            bias_df = bias.get_disparity_min_metric(groups_model, aequitas_df)

        # analyze fairness for each group
        f = Fairness(tau=0.8)  # the default fairness threshold is 0.8
        group_value_df = f.get_group_value_fairness(bias_df)
        group_value_df['subset_hash'] = subset_hash
        group_value_df['tie_breaker'] = tie_breaker
        group_value_df['evaluation_start_time'] = evaluation_start_time
        group_value_df['evaluation_end_time'] = evaluation_end_time
        group_value_df['matrix_uuid'] = matrix_uuid
        group_value_df = group_value_df.rename(
            index=str, columns={"score_threshold": "parameter"})
        if group_value_df.empty:
            raise ValueError(f"""
            Bias audit: aequitas_audit() failed.
            Returned empty dataframe for model_id = {model_id}, and subset_hash = {subset_hash}
            and matrix_type = {matrix_type}""")
        with scoped_session(self.db_engine) as session:
            for index, row in group_value_df.iterrows():
                session.query(matrix_type.aequitas_obj).filter_by(
                    model_id=row['model_id'],
                    evaluation_start_time=row['evaluation_start_time'],
                    evaluation_end_time=row['evaluation_end_time'],
                    subset_hash=row['subset_hash'],
                    parameter=row['parameter'],
                    tie_breaker=row['tie_breaker'],
                    matrix_uuid=row['matrix_uuid'],
                    attribute_name=row['attribute_name'],
                    attribute_value=row['attribute_value']).delete()
            session.bulk_insert_mappings(
                matrix_type.aequitas_obj,
                group_value_df.to_dict(orient="records"))
Example No. 14
    def update_db_with_ranks(self, model_id, matrix_uuid, matrix_type):
        """Update predictions table with rankings, both absolute and percentile.
                random_seed=postgres_random_seed,
        All entities should have different ranks, so to break ties:
        - abs_rank uses the 'row_number' function, so ties are broken by the database ordering
            session.close()
        - pct_rank uses the output of the abs_rank to compute percentiles
          (as opposed to raw scores), so it inherits the tie-breaking from abs_rank
        Args:
            model_id (int) the id of the model associated with the given predictions
            matrix_uuid (string) the uuid of the prediction matrix
        """
        if not self.save_predictions:
            logging.info("save_predictions is set to False so there are no predictions to rank")
            return
        logging.info(
            'Beginning ranking of new Predictions for model %s, matrix %s',
            model_id,
            matrix_uuid
        )

        # retrieve a dataframe with only the data we need to rank
        ranking_df = pandas.DataFrame.pg_copy_from(
            f"""select entity_id, score, as_of_date, label_value
            from {matrix_type.string_name}_results.predictions
            where model_id = {model_id} and matrix_uuid = '{matrix_uuid}'
            """, connectable=self.db_engine)

        sort_seed = None
        if self.rank_order == 'random':
            with scoped_session(self.db_engine) as session:
                sort_seed = session.query(Model).get(model_id).random_seed
                if not sort_seed:
                    sort_seed = generate_python_random_seed()

        sorted_predictions, sorted_labels, sorted_arrays = sort_predictions_and_labels(
            predictions_proba=ranking_df['score'],
            labels=ranking_df['label_value'],
            tiebreaker=self.rank_order,
            sort_seed=sort_seed,
            parallel_arrays=(ranking_df['entity_id'], ranking_df['as_of_date']),
        )
        ranking_df['score'] = sorted_predictions.values
        ranking_df['as_of_date'] = pandas.to_datetime(sorted_arrays[1].values)
        ranking_df['label_value'] = sorted_labels.values
        ranking_df['entity_id'] = sorted_arrays[0].values
        # at this point, we have the same dataframe that we loaded from postgres,
        # but sorted based on score and the self.rank_order.

        # Now we can generate ranks using pandas and only using the 'score' column because
        # our secondary ordering is baked in, enabling the 'first' method to break ties.
        ranking_df['rank_abs_no_ties'] = ranking_df['score'].rank(ascending=False, method='first')
        ranking_df['rank_abs_with_ties'] = ranking_df['score'].rank(ascending=False, method='min')
        ranking_df['rank_pct_no_ties'] = numpy.array([1 - (rank - 1) / len(ranking_df) for rank in ranking_df['rank_abs_no_ties']])
        ranking_df['rank_pct_with_ties'] = ranking_df['score'].rank(method='min', pct=True)

        # with our rankings computed, update these ranks into the existing rows
        # in the predictions table
        temp_table_name = f"ranks_mod{model_id}_mat{matrix_uuid}"
        ranking_df.pg_copy_to(temp_table_name, self.db_engine)
        self.db_engine.execute(f"""update {matrix_type.string_name}_results.predictions as p
            set rank_abs_no_ties = tt.rank_abs_no_ties,
            rank_abs_with_ties = tt.rank_abs_with_ties,
            rank_pct_no_ties = tt.rank_pct_no_ties,
            rank_pct_with_ties = tt.rank_pct_with_ties
            from {temp_table_name} as tt
            where tt.entity_id = p.entity_id
            and p.matrix_uuid = '{matrix_uuid}'
            and p.model_id = {model_id}
            and p.as_of_date = tt.as_of_date
                               """)
        self.db_engine.execute(f"drop table {temp_table_name}")
        self._write_metadata_to_db(
            model_id=model_id,
            matrix_uuid=matrix_uuid,
            matrix_type=matrix_type,
            random_seed=sort_seed,
        )
        logging.info(
            'Completed ranking of new Predictions for model %s, matrix %s',
            model_id,
            matrix_uuid
        )
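The two absolute-rank columns differ only in pandas' tie-breaking method: 'first' assigns distinct ranks in row order (which is why the dataframe is pre-sorted to bake in the tiebreaker), while 'min' gives tied scores the same, smallest rank. A two-series demonstration:

import pandas as pd

scores = pd.Series([0.9, 0.9, 0.5])
print(scores.rank(ascending=False, method="first").tolist())  # [1.0, 2.0, 3.0]
print(scores.rank(ascending=False, method="min").tolist())    # [1.0, 1.0, 3.0]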
Example No. 15
    def retrain(self, prediction_date):
        """Retrain a model by going back one split from prediction_date, so the
        as_of_date for training would be (prediction_date - training_label_timespan).

        Args:
            prediction_date (str)
        """
        # Retrain config and hash
        retrain_config = {
            "model_group_id": self.model_group_id,
            "prediction_date": prediction_date,
            "test_label_timespan": self.test_label_timespan,
            "test_duration": self.test_duration,
        }
        self.retrain_hash = save_retrain_and_get_hash(retrain_config,
                                                      self.db_engine)

        with get_for_update(self.db_engine, Retrain,
                            self.retrain_hash) as retrain:
            retrain.prediction_date = prediction_date

        # Timechop
        prediction_date = dt_from_str(prediction_date)
        temporal_config = self.get_temporal_config_for_retrain(prediction_date)
        timechopper = Timechop(**temporal_config)
        chops = timechopper.chop_time()
        assert len(chops) == 1
        chops_train_matrix = chops[0]['train_matrix']
        as_of_date = datetime.strftime(chops_train_matrix['last_as_of_time'],
                                       "%Y-%m-%d")
        retrain_definition = {
            'first_as_of_time': chops_train_matrix['first_as_of_time'],
            'last_as_of_time': chops_train_matrix['last_as_of_time'],
            'matrix_info_end_time': chops_train_matrix['matrix_info_end_time'],
            'as_of_times': [as_of_date],
            'training_label_timespan': chops_train_matrix['training_label_timespan'],
            'max_training_history': chops_train_matrix['max_training_history'],
            'training_as_of_date_frequency': chops_train_matrix['training_as_of_date_frequency'],
        }

        # Record the run in the TriageRun table
        run = TriageRun(
            start_time=datetime.now(),
            git_hash=infer_git_hash(),
            triage_version=infer_triage_version(),
            python_version=infer_python_version(),
            run_type="retrain",
            run_hash=self.retrain_hash,
            last_updated_time=datetime.now(),
            current_status=TriageRunStatus.started,
            installed_libraries=infer_installed_libraries(),
            platform=platform.platform(),
            os_user=getpass.getuser(),
            working_directory=os.getcwd(),
            ec2_instance_type=infer_ec2_instance_type(),
            log_location=infer_log_location(),
            experiment_class_path=classpath(self.__class__),
            random_seed=retrieve_experiment_seed_from_run_id(
                self.db_engine, self.triage_run_id),
        )
        run_id = None
        with scoped_session(self.db_engine) as session:
            session.add(run)
            session.commit()
            run_id = run.run_id
        if not run_id:
            raise ValueError("Failed to retrieve run_id from saved row")

        # set ModelTrainer's run_id and experiment_hash for Retrain run
        self.model_trainer.run_id = run_id
        self.model_trainer.experiment_hash = self.retrain_hash

        # 1. Generate all labels
        self.generate_all_labels(as_of_date)
        record_labels_table_name(run_id, self.db_engine,
                                 self.labels_table_name)

        # 2. Generate cohort
        cohort_table_name = f"triage_production.cohort_{self.experiment_config['cohort_config']['name']}_retrain"
        self.generate_entity_date_table(as_of_date, cohort_table_name)
        record_cohort_table_name(run_id, self.db_engine, cohort_table_name)

        # 3. Generate feature aggregations
        collate_aggregations = self.get_collate_aggregations(
            as_of_date, cohort_table_name)
        feature_aggregation_table_tasks = self.feature_generator.generate_all_table_tasks(
            collate_aggregations, task_type='aggregation')
        self.feature_generator.process_table_tasks(
            feature_aggregation_table_tasks)

        # 4. Reconstruct feature dictionary from feature_names and generate imputation tasks
        reconstructed_feature_dict, imputation_table_tasks = self.get_feature_dict_and_imputation_task(
            collate_aggregations,
            self.model_group_info['model_id_last_split'],
        )
        feature_group_creator = FeatureGroupCreator(
            self.experiment_config['feature_group_definition'])
        feature_group_mixer = FeatureGroupMixer(["all"])
        feature_group_dict = feature_group_mixer.generate(
            feature_group_creator.subsets(reconstructed_feature_dict))[0]
        self.feature_generator.process_table_tasks(imputation_table_tasks)
        # 5. Build new matrix
        db_config = {
            "features_schema_name": "triage_production",
            "labels_schema_name": "public",
            "cohort_table_name": cohort_table_name,
            "labels_table_name": self.labels_table_name,
        }

        record_matrix_building_started(run_id, self.db_engine)
        matrix_builder = MatrixBuilder(
            db_config=db_config,
            matrix_storage_engine=self.matrix_storage_engine,
            engine=self.db_engine,
            experiment_hash=None,
            replace=True,
        )
        new_matrix_metadata = Planner.make_metadata(
            matrix_definition=retrain_definition,
            feature_dictionary=feature_group_dict,
            label_name=self.label_name,
            label_type='binary',
            cohort_name=self.cohort_name,
            matrix_type='train',
            feature_start_time=dt_from_str(self.feature_start_time),
            user_metadata=self.user_metadata,
        )

        new_matrix_metadata['matrix_id'] = "_".join([
            self.label_name,
            'binary',
            str(as_of_date),
            'retrain',
        ])

        matrix_uuid = filename_friendly_hash(new_matrix_metadata)
        matrix_builder.build_matrix(
            as_of_times=[as_of_date],
            label_name=self.label_name,
            label_type='binary',
            feature_dictionary=feature_group_dict,
            matrix_metadata=new_matrix_metadata,
            matrix_uuid=matrix_uuid,
            matrix_type="train",
        )
        retrain_model_comment = 'retrain_' + str(datetime.now())

        misc_db_parameters = {
            'train_end_time': dt_from_str(as_of_date),
            'test': False,
            'train_matrix_uuid': matrix_uuid,
            'training_label_timespan': self.training_label_timespan,
            'model_comment': retrain_model_comment,
        }

        # get the random seed from the last split
        last_split_train_matrix_uuid, last_split_matrix_metadata = train_matrix_info_from_model_id(
            self.db_engine,
            model_id=self.model_group_info['model_id_last_split'])

        random_seed = self.model_trainer.get_or_generate_random_seed(
            model_group_id=self.model_group_id,
            matrix_metadata=last_split_matrix_metadata,
            train_matrix_uuid=last_split_train_matrix_uuid)

        # create retrain model hash
        retrain_model_hash = self.model_trainer._model_hash(
            self.matrix_storage_engine.get_store(matrix_uuid).metadata,
            class_path=self.model_group_info['model_type'],
            parameters=self.model_group_info['hyperparameters'],
            random_seed=random_seed,
        )

        associate_models_with_retrain(self.retrain_hash,
                                      (retrain_model_hash, ), self.db_engine)

        record_model_building_started(run_id, self.db_engine)
        retrain_model_id = self.model_trainer.process_train_task(
            matrix_store=self.matrix_storage_engine.get_store(matrix_uuid),
            class_path=self.model_group_info['model_type'],
            parameters=self.model_group_info['hyperparameters'],
            model_hash=retrain_model_hash,
            misc_db_parameters=misc_db_parameters,
            random_seed=random_seed,
            retrain=True,
            model_group_id=self.model_group_id)

        self.retrain_model_hash = retrieve_model_hash_from_id(
            self.db_engine, retrain_model_id)
        self.retrain_matrix_uuid = matrix_uuid
        self.retrain_model_id = retrain_model_id
        return {
            'retrain_model_comment': retrain_model_comment,
            'retrain_model_id': retrain_model_id
        }
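End to end, this retrain pipeline is driven by a single call. A hypothetical invocation (the class name and constructor arguments are assumed here, since the class definition is not shown above):

# hypothetical usage sketch; constructor signature assumed, not shown above
retrainer = Retrainer(
    db_engine=db_engine,
    project_path=project_path,
    model_group_id=model_group_id,
)
result = retrainer.retrain("2021-06-01")
print(result["retrain_model_id"], result["retrain_model_comment"])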