Example #1
0
    def split_train_test(self, conf):
        """Partition the stored shots into train and test sublists.

        If ``conf['paths']['shot_files_test']`` is empty, the shots are
        split randomly by ``train_frac``; otherwise the explicit train and
        test shot-list files are loaded. Either way, only shots that pass
        ``self.filter_by_number`` (i.e. pre-filtered valid shots) are kept,
        and each side is down-sampled to its share of ``use_shots``.
        """
        # Read all config values up front (mirrors eager KeyError behavior).
        # shot_list_dir = conf['paths']['shot_list_dir']
        train_files = conf['paths']['shot_files']
        test_files = conf['paths']['shot_files_test']
        train_frac = conf['training']['train_frac']
        shuffle = conf['training']['shuffle_training']
        use_shots = conf['data']['use_shots']
        signals = conf['paths']['all_signals']

        # Target sizes for each side of the split.
        n_train = int(round(train_frac * use_shots))
        n_test = int(round((1 - train_frac) * use_shots))

        if not len(test_files):
            # No explicit test list: split randomly.
            train_list, test_list = train_test_split(
                self.shots, train_frac, shuffle)
        else:
            # Explicit train and test lists were given.
            train_list = ShotList()
            train_list.load_from_shot_list_files_objects(train_files, signals)
            test_list = ShotList()
            test_list.load_from_shot_list_files_objects(test_files, signals)

        numbers_train = [shot.number for shot in train_list]
        numbers_test = [shot.number for shot in test_list]
        print(len(numbers_train), len(numbers_test))

        # Keep only pre-filtered valid shots, then sample down to size.
        valid_train = self.filter_by_number(numbers_train)
        valid_test = self.filter_by_number(numbers_test)
        return (valid_train.random_sublist(n_train),
                valid_test.random_sublist(n_test))
Example #2
0
 def split_direct(self, frac, do_shuffle=True):
     """Randomly split this list's shots into two new ShotLists.

     ``frac`` is the fraction of shots placed in the first list; when
     ``do_shuffle`` is true the shots are shuffled before splitting.
     Returns a ``(first, second)`` pair of ShotList instances.
     """
     first_half, second_half = train_test_split(self.shots, frac, do_shuffle)
     return ShotList(first_half), ShotList(second_half)
Example #3
0
def _sanitize_columns(columns):
    """Return *columns* normalized into database-safe identifiers.

    Drops parentheses and replaces spaces and slashes with underscores,
    exactly as the pipeline stores column names in the database.
    """
    return [
        col.replace('(', '').replace(')', '').replace(' ', '_').replace(
            '/', '_') for col in columns
    ]


def main(filename):
    """Run the end-to-end modeling pipeline.

    For each train/test split: explores and processes the data, builds
    features and labels, grid-searches four model families, computes bias
    metrics, evaluates against a baseline, and persists every intermediate
    artifact plus a per-model summary row to the database.

    Args:
        filename: passed through to ``load_data`` to select the input data.
    """
    print('loading data')
    # Establish database connection
    with open(
            '/data/groups/schools1/mlpolicylab_fall20_schools1/pipeline/db_info.yaml',
            'r') as f:
        db_params = yaml.safe_load(f)['db']

    # NOTE(review): empty user/password relies on host-based auth, and the
    # 'postgres://' scheme is removed in SQLAlchemy >= 1.4 ('postgresql://')
    # — confirm the pinned SQLAlchemy version before changing.
    engine = create_engine('postgres://:@{host}:{port}/{dbname}'.format(
        host=db_params['host'],
        port=db_params['port'],
        dbname=db_params['dbname'],
    ))
    # Load data from database to dataframe
    df = load_data(filename, engine)

    # Split dataframe into train and test data.
    splits, years_reference = train_test_split(df)

    # The search grids are identical for every split, so build them once
    # instead of once per loop iteration.
    algos = ["Logistic", "SVM", "RandomForest", "DecisionTree"]
    gs_params = {
        "Logistic":
        ParameterGrid({
            'solver': ['lbfgs', 'liblinear', 'saga'],
            'C': [0.001, 0.01, 0.1, 1, 2, 5, 10]
        }),
        "SVM":
        ParameterGrid({
            'C': [0.01, 1, 2, 5, 10],
            'kernel': ['rbf', 'sigmoid']
        }),
        "RandomForest":
        ParameterGrid({
            'n_estimators': [30, 50, 100, 500, 1000, 10000],
            'max_depth': [5, 10, 20, 50],
            'min_samples_split': [5, 10, 15],
            'max_features': ['auto', 'log2', 'sqrt']
        }),
        "DecisionTree":
        ParameterGrid({
            'criterion': ['gini', 'entropy'],
            'max_depth': [5, 10, 20, 50],
            'min_samples_split': [5, 10, 15]
        })
    }

    for i, (train_df, test_df) in enumerate(splits):
        print(f'processing split {i}')

        # Explore data for each cohort
        explore_data(train_df)

        # Process train and test data separately
        updated_df_train = process_data(train_df)
        updated_df_test = process_data(test_df)

        # Normalize column names before uploading the data to the database
        # for future reference and easy retrieval.
        updated_df_train.columns = _sanitize_columns(updated_df_train.columns)
        updated_df_test.columns = _sanitize_columns(updated_df_test.columns)

        # NOTE(review): `timestamp` is a module-level name defined outside
        # this view — confirm it is set before main() runs.
        table_name = timestamp + '_' + str(years_reference[i][1]) + '_' + str(
            years_reference[i][0])

        df_to_db(table_name, 'processed_data', updated_df_train,
                 updated_df_test, engine)

        # Retrieve the processed data back from the database so downstream
        # steps see exactly what was persisted.
        processed_train, processed_test = db_to_df(table_name,
                                                   'processed_data', engine)

        # Separate copies so feature and label creation cannot interfere
        # with each other (they presumably mutate their input — TODO confirm).
        updated_df_train_f = processed_train.copy()
        updated_df_train_l = processed_train.copy()
        updated_df_test_f = processed_test.copy()
        updated_df_test_l = processed_test.copy()

        # Create features for test and train data
        features_train, train_student_ids = create_features(updated_df_train_f)
        features_test, test_student_ids = create_features(updated_df_test_f)

        # Create labels
        label_train = create_label(updated_df_train_l)
        label_test = create_label(updated_df_test_l)

        # Concatenate features and labels to save in the database
        train_concat = pd.concat([features_train, label_train],
                                 axis=1,
                                 sort=False)
        test_concat = pd.concat([features_test, label_test],
                                axis=1,
                                sort=False)

        # Baseline precision (grade 9 GPA heuristic) and the base rate of
        # the positive class in the training data.
        baseline_precision = baseline(test_concat, years_reference[i])
        base_rate = sum(train_concat.not_graduated) / len(train_concat)

        # Saving and reading from database
        df_to_db(table_name, 'model_data', train_concat, test_concat, engine)
        model_train, model_test = db_to_df(table_name, 'model_data', engine)

        # The label is stored as the last column of the model-data tables.
        features_train = model_train.iloc[:, :-1]
        label_train = model_train.iloc[:, -1]
        features_test = model_test.iloc[:, :-1]
        label_test = model_test.iloc[:, -1]

        print('performing model grid search')
        for model_name in algos:
            for param in gs_params[model_name]:
                model = build_model(features_train, label_train, model_name,
                                    param)

                # Perform prediction
                pred_proba_train = prediction(features_train, model)
                pred_proba_test = prediction(features_test, model)

                # Convert prediction probabilities to dataframes for
                # further processing.
                pred_train_df = pd.DataFrame(pred_proba_train,
                                             columns=['probability'])
                pred_test_df = pd.DataFrame(pred_proba_test,
                                            columns=['probability'])

                # Human-readable hyperparameter string for bookkeeping.
                hyperparameters = ' '.join(
                    ["{}: {}".format(key, param[key]) for key in param.keys()])

                pred_train_df['model'] = model_name
                pred_train_df['params'] = hyperparameters
                pred_test_df['model'] = model_name
                pred_test_df['params'] = hyperparameters

                # Attach student ids to the prediction scores.
                predictions_train = pd.concat(
                    [train_student_ids, pred_train_df], axis=1, sort=False)
                predictions_test = pd.concat([test_student_ids, pred_test_df],
                                             axis=1,
                                             sort=False)

                # Calculate the bias metrics
                TPR_gender, FDR_gender = bias_metrics(predictions_test,
                                                      processed_test, 'gender')
                TPR_disadvantagement, FDR_disadvantagement = bias_metrics(
                    predictions_test, processed_test, 'disadvantagement')

                # Load the prediction results to database for creating
                # visualizations.
                df_to_db(table_name, 'predictions', predictions_train,
                         predictions_test, engine)

                # Evaluate model
                metric = evaluate_model(features_test,
                                        label_test,
                                        model,
                                        model_name,
                                        baseline_precision,
                                        hyperparameters,
                                        columns=model_train.columns[:-1])

                # One summary row per (split, algorithm, hyperparameters),
                # appended to the per-run performance_metrics table.
                df_summary = pd.DataFrame({
                    'test_year': years_reference[i][1],
                    'train_since': years_reference[i][0],
                    'algorithm': model_name,
                    'hyperparameters': hyperparameters,
                    'baserate': base_rate,
                    'baseline': [baseline_precision],
                    'precision': metric,
                    'TPR_gender': TPR_gender,
                    'FDR_gender': FDR_gender,
                    'TPR_disadvantagement': TPR_disadvantagement,
                    'FDR_disadvantagement': FDR_disadvantagement
                })
                df_summary.to_sql(name=timestamp,
                                  schema='performance_metrics',
                                  con=engine,
                                  if_exists='append',
                                  index=False)