def make_predictions(pipeline, feature_cols):
    """Make predictions on unlabeled data using the
       fitted pipeline and write predictions to CSV file.
    Args:
      pipeline (sklearn.): pipeline containing fitted model and transformations
      feature_cols (list): features present in training data used to train model
    Returns:
      None
    """
    logger.info('Making predictions on unlabeled data.........')
    unlabeled_data = preprocess(UNLABELED_DATA)
    unlabeled_data = unlabeled_data[FEATURES]
    unlabeled_data = transform(unlabeled_data)
    unlabeled_data = unlabeled_data.reindex(columns=feature_cols, fill_value=0)

    predicted_attendance = pipeline.predict(unlabeled_data)
    attendance_prob = pipeline.predict_proba(unlabeled_data)
    predictions = pd.DataFrame({
        'Name(Cand ID)': unlabeled_data.index.values,
        'Predicted Attendance': predicted_attendance,
        'Probability of Attendance': attendance_prob[:, 1]
    })
    predictions['Predicted Attendance'] = (
        predictions['Predicted Attendance'].map({1: 'YES', 0: 'NO'}))
    predictions['Probability of Attendance'] = (
        predictions['Probability of Attendance'].round(decimals=3))
    predictions.to_csv(path_or_buf=PREDICTIONS, index=False)
    n_pred = predictions.shape[0]
    logger.info(f'Wrote predictions for {n_pred} candidates to {PREDICTIONS}')
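
# A minimal end-to-end usage sketch (assumed entry point; the actual caller may
# differ): train_and_evaluate() returns the fitted pipeline and the training
# column index that make_predictions() needs to align the unlabeled data.
#
#   pipeline, training_columns = train_and_evaluate(cross_validate=False)
#   make_predictions(pipeline, training_columns)
#   save_model(pipeline)
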
def impute_missing_values(df):
    """Fill null feature values with modal values
    Args:
       df (pd.DataFrame): feature data with Nan values
    Returns
       pd.DataFrame with imputed valuees
    """
    for column in FEATURES:
        mode = df[column].mode()[0]
        df[column] = df[column].fillna(mode)
        logger.info(f'Filled NAs of column {column} with {mode}')
    return df
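
# Illustrative sketch (assumed toy data, commented out so nothing runs on
# import): Series.mode()[0] returns the most frequent value, so
#   s = pd.Series(['YES', 'YES', None, 'NO'])
#   s.fillna(s.mode()[0])   # -> ['YES', 'YES', 'YES', 'NO']
# fills the missing entry with 'YES'.
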
def create_train_test_datasets(df, test_size):
    """Splits labeled dataset in train and test sets
    Args:
      df (pd.DataFrame):
      test_size (float): percentage of labeled data to be used in test set
    Returns:
      pd.DataFrames of train and test feature and target data (i.e.
      x_train, x_test, y_train, y_test)
    """
    X = df[FEATURES]
    X = transform(X)
    Y = df[LABEL].map({'YES': 1, 'NO': 0})
    logger.info(
        f'Splitting labeled data into train and test set with '
        f'{int((1 - test_size) * 100)}/{int(test_size * 100)} split')
    x_train, x_test, y_train, y_test = train_test_split(X,
                                                        Y,
                                                        test_size=test_size,
                                                        stratify=Y,
                                                        random_state=42)
    logger.info(f'Training set created with {x_train.shape[0]} rows '
                f'and {x_train.shape[1]} features')
    logger.info(f'Test set created with {x_test.shape[0]} rows '
                f'and {x_test.shape[1]} features')
    return x_train, x_test, y_train, y_test

def train_and_evaluate(cross_validate, cv_metric=None):
    """Train and evaluate model
    Args:
      cross_validate (boolean): execute hyper-parameter tuning via grid search
      cv_metric (str): metric to optimize/minimize during cross-valiation
    Returns:
      trained pipeline
    """
    labeled_data = preprocess(LABELED_DATA)
    x_train, x_test, y_train, y_test = create_train_test_datasets(
        labeled_data, test_size=0.2)

    if cross_validate:
        params = tune_hyperparameters(x_train,
                                      x_test,
                                      y_train,
                                      y_test,
                                      metric=cv_metric)
    else:
        # Parameters learned from previous cross-validation run
        params = {'C': 10, 'gamma': 0.001, 'kernel': 'rbf', 'tol': 0.01}

    pipeline = Pipeline([('feature_interactions',
                          PolynomialFeatures(interaction_only=True)),
                         ('classifier',
                          SVC(kernel=params['kernel'],
                              gamma=params['gamma'],
                              C=params['C'],
                              tol=params['tol'],
                              probability=True))])

    logger.info('Fitting pipeline on training data set.........')
    pipeline.fit(x_train, y_train)
    predicted = pipeline.predict(x_test)
    calculate_metrics(predicted, y_test)

    training_columns = x_train.columns
    return pipeline, training_columns

def tune_hyperparameters(x_train, x_test, y_train, y_test, metric):
    """Use cross-validation and grid search to find optimal
       hyper-parameters to find the best parameters on
       the training set.
     Args:
       metric (str): evaluation metric to optimize for
    """
    pipeline = Pipeline([
        # NOTE: CV takes a while when PolynomialFeatures are included
        # ('features', PolynomialFeatures()),
        ('svc', SVC())
    ])

    parameters = {
        # 'features__degree': [2],
        # 'features__include_bias': (True, False),
        # 'features__interaction_only': (True, False),
        'svc__kernel': ['linear', 'rbf'],
        'svc__gamma': [1e-3, 1e-4],
        'svc__C': [1, 10, 100, 1000],
        'svc__tol': [1e-2, 1e-3, 1e-4]
    }

    logger.info(f"Tuning hyper-parameters for {metric}")
    clf = GridSearchCV(pipeline, parameters, cv=10, scoring=f'{metric}_macro')
    clf.fit(x_train, y_train)

    best_params = clf.best_params_

    logger.info("Grid scores on training set:")
    means = clf.cv_results_['mean_test_score']
    stds = clf.cv_results_['std_test_score']
    for mean, std, params in zip(means, stds, clf.cv_results_['params']):
        logger.info(f"{mean:.3} (+/-%{std*2:.3}) for {params}")
    logger.info("Classification report:")
    y_true, y_pred = y_test, clf.predict(x_test)
    logger.info(classification_report(y_true, y_pred))
    best_params = {
        param.split('__')[1]: value
        for param, value in best_params.items()
    }
    logger.info("Best parameters:")
    for param, value in best_params.items():
        logger.info(f"{param}: {value}")
    return best_params
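
# The returned dict maps bare SVC argument names to the selected values, so it
# can be unpacked straight into the classifier (illustrative shape only,
# mirroring the fallback values in train_and_evaluate(), not results from a
# real tuning run):
#
#   best_params = {'C': 10, 'gamma': 0.001, 'kernel': 'rbf', 'tol': 0.01}
#   SVC(**best_params, probability=True)
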
def transform(df):
    """Transform categorical features into numerical representations."""
    logger.info('Converting categorical variables into dummy variables')
    return pd.get_dummies(df, columns=FEATURES)
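
# Illustrative sketch (assumed toy data, commented out): pd.get_dummies() only
# creates columns for the categories it actually sees, which is why
# make_predictions() reindexes the encoded unlabeled data against the training
# columns with fill_value=0: categories absent at inference time would
# otherwise leave the pipeline missing expected dummy columns.
#
#   train = pd.get_dummies(pd.DataFrame({'city': ['NY', 'LA']}), columns=['city'])
#   new = pd.get_dummies(pd.DataFrame({'city': ['NY']}), columns=['city'])
#   new = new.reindex(columns=train.columns, fill_value=0)  # adds city_LA = 0
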
def save_model(pipeline, model_type='svm', date=None):
    """Serialize the fitted pipeline to MODEL_DIR with joblib."""
    # Compute the date inside the function: a datetime.now() default in the
    # signature would be evaluated only once, at import time.
    if date is None:
        date = datetime.now().strftime("%m-%d-%Y")
    model_name = f'{model_type}-model-{date}.joblib'
    logger.info(f'Writing {model_name} to {MODEL_DIR}')
    joblib.dump(pipeline, os.path.join(MODEL_DIR, model_name), compress=1)
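
# A reload sketch (assumed file name, commented out): the persisted pipeline
# can be restored with joblib and reused for scoring without retraining.
#
#   pipeline = joblib.load(os.path.join(MODEL_DIR, 'svm-model-01-01-2024.joblib'))
#   predicted = pipeline.predict(x_new)  # x_new: dummy-encoded features aligned to the training columns
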
def calculate_metrics(y_pred, y_true):
    """Calculate and log performance metrics on the test set.
    Args:
      y_pred (np.ndarray): predicted labels
      y_true (pd.Series): true labels
    """
    log_msg = 'Test Set Evaluation Metrics'
    logger.info(len(log_msg) * '-')
    logger.info(log_msg)
    logger.info(len(log_msg) * '-')

    # Share of positive (attendance = YES) labels in the test set, i.e. the
    # accuracy of always predicting attendance.
    baseline_accuracy = y_true.values.mean()
    logger.info(f'Baseline Accuracy: {baseline_accuracy:.2f}')

    metrics = {
        'Accuracy': accuracy_score,
        'Precision': precision_score,
        'Recall': recall_score
    }

    for metric_name, metric_fn in metrics.items():
        score = metric_fn(y_true, y_pred)
        logger.info(f'{metric_name}: {score:.2f}')

    # Note: the ROC curve here is built from the hard 0/1 predictions; scoring
    # with predicted probabilities would give a finer-grained curve.
    fpr, tpr, thresholds = roc_curve(y_true.values, y_pred, pos_label=1)
    auc_score = auc(fpr, tpr)
    logger.info(f'AUC: {auc_score:.2f}')
    logger.info(len(log_msg) * '-')