Exemplo n.º 1
0
    def maybe_cache_model(self, model, train_data, test_data):
        """Persist a trained model to the on-disk cache, if caching applies.

        Pickles ``model`` under a fresh UUID-named file and records the
        (id-columns -> hash) mapping in the per-model cache CSV so that a
        later run can retrieve it instead of retraining.
        """
        # Only cache models that opt in, and only when caching is enabled.
        if not (model.cacheable and self.use_cache):
            return

        key_cols = list(self.get_id_cols())
        cache_df = self.get_cache_df(model)

        id_row = self.construct_id_row(model, train_data, test_data)
        key = tuple(id_row.iloc[0])

        # A random UUID keeps pickle filenames collision-free.
        h = str(uuid.uuid4())
        model_file = models_experiment_config.get_hash_pkl_file(
            model.get_model_name(), h)
        h_fname = global_config.get_save_path(model_file)
        with open(h_fname, 'wb') as f:
            pickle.dump(model, f)

        id_row['hash'] = h
        id_row = id_row.set_index(key_cols)

        if key in cache_df.index:
            # Overwrite the stale entry for this exact configuration.
            cache_df.loc[key] = id_row.iloc[0]
        else:
            # DataFrame.append was deprecated in pandas 1.4 and removed in
            # 2.0; pd.concat is the supported equivalent.
            cache_df = pd.concat([cache_df, id_row])

        cache_csv_path = global_config.get_save_path(
            self.get_model_csv_cache(model.get_model_name()))
        cache_df.to_csv(cache_csv_path)
    def perform(self,
                grade=main_config.single_grade,
                train_years=main_config.train_years,
                test_years=main_config.test_years,
                *args,
                **kwargs):
        """Train each supported model and plot its feature importances.

        For every SklearnModel/TFKerasModel in ``self.models``, trains it on
        the train/test cohorts, saves a '<model>_importances.png' boxplot of
        the top features, and accumulates a tidy DataFrame with columns
        'feature_name', 'importance_score', and 'model' (most important
        first per model), which is returned.
        """
        train_cohort = Cohort(grade, train_years)
        test_cohort = Cohort(grade, test_years)

        df = pd.DataFrame()
        for model in self.models:
            # Importances are only computed for these model families.
            if not isinstance(model, (SklearnModel, TFKerasModel)):
                continue

            # NOTE(review): the processor is passed as a bound method (no
            # call) — presumably get_train_test_data invokes it; confirm.
            feature_proc = model.get_feature_processor
            train_data, test_data = \
                self.get_train_test_data(train_cohort, feature_proc, test_cohort)

            model.train(train_data, test_data)

            model_name = model.get_model_name()
            file_name = model_name + '_importances.png'
            save_path = global_config.get_save_path(file_name)

            feature_names, result, sorted_idxs_full = self.get_feature_importances(
                model, train_data)
            # Indices are sorted ascending, so the last top_n are the largest.
            sorted_idxs = sorted_idxs_full[-config.top_n_features:]

            fig, ax1 = plt.subplots()
            ax1.boxplot(result.importances[sorted_idxs].T,
                        vert=False,
                        labels=feature_names[sorted_idxs])
            ax1.set_title('Top Features for {}'.format(model_name))
            ax1.set_xlabel('Importance')

            fig.tight_layout()
            fig.savefig(save_path, facecolor='w')
            # Close the figure; otherwise every loop iteration leaks an open
            # matplotlib figure (pyplot keeps a reference until closed).
            plt.close(fig)

            # Reversed slice yields descending importance order.
            cur_df = pd.DataFrame({
                'feature_name':
                feature_names[sorted_idxs_full[::-1]],
                'importance_score':
                result.importances_mean[sorted_idxs_full[::-1]],
            })
            cur_df['model'] = model_name
            df = pd.concat([df, cur_df], ignore_index=True)

        return df
Exemplo n.º 3
0
 def __init__(self,
              name='ignore',
              model_types=main_config.model_types,
              get_algorithm=main_config.get_sherpa_algorithm,
              criteria='test precision using top 5.0%',
              features_list=main_config.features,
              labels=main_config.labels,
              metrics=main_config.metrics,
              use_multi_dataset=True,
              lower_is_better=False,
              use_cache=main_config.use_cache):
     """Set up a hyper-parameter tuning experiment.

     Stores the tuning configuration (model types, search algorithm
     factory, selection criteria, metrics) and resolves timestamped
     output paths for the results CSV and image.
     """
     super().__init__(name, features_list, labels)

     # Tuning configuration.
     self.model_types = model_types
     self.get_algorithm = get_algorithm
     self.criteria = criteria
     self.metrics = metrics
     self.use_multi_dataset = use_multi_dataset
     self.lower_is_better = lower_is_better
     self.use_cache = use_cache

     # Output locations are timestamped per user/run.
     self.out_csv = global_config.get_save_path(config.out_csv,
                                                use_user_time=True)
     self.out_img = global_config.get_save_path(config.out_img,
                                                use_user_time=True)

     # Accumulates metric rows as trials complete.
     self.metrics_df = pd.DataFrame()
Exemplo n.º 4
0
    def maybe_get_cached_model(self, model, train_data, test_data):
        """Return a previously pickled model for this exact setup, or None.

        Looks up the identity row built from (model, train_data, test_data)
        in the per-model cache CSV and, on a hit, unpickles the stored model
        from its hash-named file.
        """
        # Caching disabled or deliberately being overwritten: always miss.
        if main_config.overwrite_cache or (not self.use_cache):
            return None

        cache_df = self.get_cache_df(model)
        id_row = self.construct_id_row(model, train_data, test_data)
        cache_key = tuple(id_row.iloc[0])

        if cache_key not in cache_df.index:
            return None

        model_hash = cache_df.loc[cache_key].hash
        pkl_path = global_config.get_save_path(
            models_experiment_config.get_hash_pkl_file(
                model.get_model_name(), model_hash))

        # The pickle was written by this same experiment, so loading it is
        # trusted local data.
        with open(pkl_path, 'rb') as fh:
            return pickle.load(fh)
Exemplo n.º 5
0
    def explore(self):
        """Populate the bivariate frame from student data and save a pairplot.

        Queries grade 9-12 student records, derives the exploration columns,
        drops incomplete rows, renders a seaborn pairplot to the configured
        path, and returns the plot figure.
        """
        students_info = self.query(common_queries.get_student_data([9, 12]))

        # Use item assignment throughout: attribute-style assignment on a
        # DataFrame cannot CREATE a column — when the column doesn't already
        # exist it silently sets an instance attribute instead.
        # self.bivariate_df['discipline_incidents_rate'] = students_info.discipline_incidents_rate.fillna(0)
        self.bivariate_df['absenteeism_rate'] = students_info.absenteeism_rate
        # Final GPA is the last entry of each student's GPA sequence.
        self.bivariate_df['final_gpa'] = students_info.gpas.apply(
            lambda x: x if x is None else x[-1]).astype(float)
        self.bivariate_df['academic_invs'] = students_info.inv_groups.apply(
            lambda x: 0 if x is None else x.count('academic_inv'))
        # self.bivariate_df['extracurr_invs'] = students_info.inv_groups.apply(
        #                                     lambda x: 0 if x is None else x.count('atheletics') + x.count('extracurr_program')
        #                                 )

        self.bivariate_df['label'] = students_info.label

        # Pairplot needs complete rows only.
        self.bivariate_df.dropna(inplace=True)

        plt.rcParams['axes.labelsize'] = 13
        fig = sns.pairplot(self.bivariate_df, kind='hist', height=3)
        path = global_config.get_save_path(config.pairplot_save_file)
        fig.savefig(path, bbox_inches='tight')
        return fig
    def train(self, train_dataset, val_dataset):
        """Fit the underlying Keras model with early stopping + TensorBoard.

        Compiles the core model on first use, then trains for the
        hyper-parameter-configured number of epochs, restoring the best
        weights found on the validation set.
        """
        assert val_dataset is not None, 'val_dataset is needed to perform early stopping'

        X_train, y_train = self.get_xy(train_dataset)
        X_val, y_val = self.get_xy(val_dataset)

        # Lazy compile: the input shape comes from the training features.
        if not self.compiled:
            self.compile_model(X_train)

        log_dir = global_config.get_save_path(tfkeras_model_config.tensorboard_log_dir, use_user_time=True)

        # Stop when validation stops improving; keep the best checkpoint.
        early_stop = tf.keras.callbacks.EarlyStopping(
            patience=tfkeras_model_config.patience, restore_best_weights=True)
        tensorboard = tf.keras.callbacks.TensorBoard(
            log_dir=log_dir, histogram_freq=1)

        self.core_model.fit(
            x=X_train,
            y=y_train,
            epochs=self.hps.epochs,
            batch_size=self.hps.batch_size,
            validation_data=(X_val, y_val),
            callbacks=[early_stop, tensorboard],
        )
Exemplo n.º 7
0
 def __init__(self, file_name=config.save_file):
     """Initialize the metric and resolve where its plot will be saved."""
     super().__init__()
     self.save_path = global_config.get_save_path(file_name)
Exemplo n.º 8
0
 def get_cache_df(self, model):
     """Load this model's cache index CSV, keyed by the id columns.

     Returns an empty DataFrame when no cache file exists yet.
     """
     cache_csv_path = global_config.get_save_path(
         self.get_model_csv_cache(model.get_model_name()))

     # No cache written yet for this model: nothing to index.
     if not os.path.exists(cache_csv_path):
         return pd.DataFrame()

     key_cols = list(self.get_id_cols())
     return pd.read_csv(cache_csv_path).set_index(key_cols)