Example #1
def add_flags_to_predictions_df(df):
    '''
        Given a data frame with the following format:
            - Indexes: parcel_id, inspection_date
            - Columns: prediction, viol_outcome

        Attach columns with various metrics and flags
    '''
    # Copy df to avoid overwriting
    df = deepcopy(df)

    # Calculate precisions at 1 and 10 percent
    prec1, cutoff1 = precision_at(df.viol_outcome, df.prediction, percent=0.01)
    prec10, cutoff10 = precision_at(df.viol_outcome,
                                    df.prediction,
                                    percent=0.1)

    # Add columns
    df['top_1'] = df.prediction.map(lambda x: x >= cutoff1)
    df['top_10'] = df.prediction.map(lambda x: x >= cutoff10)
    df['tp_top_1'] = df.top_1 & df.viol_outcome
    df['tp_top_10'] = df.top_10 & df.viol_outcome
    df['fp_top_1'] = df.top_1 & ~df.viol_outcome
    df['fp_top_10'] = df.top_10 & ~df.viol_outcome
    return df
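For context, a minimal usage sketch. The toy frame below is hypothetical (parcel ids, dates, and scores invented for illustration), and it assumes `from copy import deepcopy` and a `precision_at` implementation are in scope, as the snippet above requires:

import pandas as pd

# Hypothetical toy input matching the documented index/columns
toy = pd.DataFrame(
    {'prediction': [0.9, 0.7, 0.2, 0.1],
     'viol_outcome': [True, True, False, False]},
    index=pd.MultiIndex.from_tuples(
        [('p1', '2016-01-01'), ('p2', '2016-01-02'),
         ('p3', '2016-01-03'), ('p4', '2016-01-04')],
        names=['parcel_id', 'inspection_date']))
flagged = add_flags_to_predictions_df(toy)
# flagged now carries top_1/top_10 membership flags plus tp_*/fp_* breakdowns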
Example #2
def test_baseline_precision_with_nas(self):
    labels = np.array([nan, 1, nan, 1, 1, nan, nan, 0, 0, 0])
    scores = np.array([100, 90, 80, 70, 60, 50, 40, 30, 20, 10])
    prec, cutoff = precision_at(
        labels, scores, proportion=1.0, ignore_nas=True)
    self.assertEqual(prec, 0.5)
    self.assertEqual(cutoff, 10)
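None of the snippets show `precision_at` itself, and its third parameter appears under three names across the examples (`percent`, `proportion`, and `top_proportion`, apparently reflecting different versions of the library). A minimal sketch consistent with the test cases here, using the newest name, might look like this; it is a reconstruction inferred from the tests, not the library's actual code:

import numpy as np

def precision_at(labels, scores, top_proportion, ignore_nas=False):
    # Hypothetical reconstruction inferred from the tests above and below
    labels = np.asarray(labels, dtype=float)
    scores = np.asarray(scores, dtype=float)
    if ignore_nas:
        # Drop observations whose label is missing before ranking
        keep = ~np.isnan(labels)
        labels, scores = labels[keep], scores[keep]
    order = np.argsort(scores)[::-1]                   # highest scores first
    n_top = int(np.ceil(top_proportion * len(labels)))
    top = order[:n_top]
    precision = labels[top].mean()    # fraction of positives in the top slice
    cutoff = scores[top].min()        # lowest score still included
    return precision, cutoff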
Example #3
def log_results(model, config, test, predictions, feature_importances,
                imputer, scaler):
    '''
        Log results to a MongoDB database
    '''
    # Instantiate logger
    logger_uri = cfg_main['logger']['uri']
    logger_db = cfg_main['logger']['db']
    logger_collection = cfg_main['logger']['collection']
    mongo_logger = Logger(logger_uri, logger_db, logger_collection)
    # Compute some statistics to log
    prec_at_1, cutoff_at_1 = precision_at(test.y, predictions, 0.01)
    prec_at_10, cutoff_at_10 = precision_at(test.y, predictions, 0.1)
    # Add the name of the experiment if available
    experiment_name = (config["experiment_name"] if config["experiment_name"]
                       else None)
    # Sending model will log model name, parameters and datetime
    # Also log other important things by sending named parameters
    mongo_id = mongo_logger.log_model(model,
                                      features=list(test.feature_names),
                                      feature_importances=list(feature_importances),
                                      config=config,
                                      prec_at_1=prec_at_1,
                                      prec_at_10=prec_at_10,
                                      cutoff_at_1=cutoff_at_1,
                                      cutoff_at_10=cutoff_at_10,
                                      experiment_name=experiment_name,
                                      feature_mapping=test.feature_mapping)

    # Dump test_labels, test_predictions and test_parcels to a csv file
    parcel_id = [record[0] for record in test.parcels]
    inspection_date = [record[1] for record in test.parcels]
    dump = pd.DataFrame({'parcel_id': parcel_id,
                         'inspection_date': inspection_date,
                         'viol_outcome': test.y,
                         'prediction': predictions})
    # Dump predictions to CSV
    dump.to_csv(os.path.join(path_to_predictions, mongo_id))
    # Pickle model
    if args.pickle:
        path_to_file = os.path.join(path_to_pickled_models, mongo_id)
        logger.info('Pickling model: {}'.format(path_to_file))
        joblib.dump(model, path_to_file)

        path_to_file = os.path.join(path_to_pickled_imputers, mongo_id)
        logger.info('Pickling imputer: {}'.format(path_to_file))
        joblib.dump(imputer, path_to_file)

        path_to_file = os.path.join(path_to_pickled_scalers, mongo_id)
        logger.info('Pickling scaler: {}'.format(path_to_file))
        joblib.dump(scaler, path_to_file)
Example #4
def output_evaluation_statistics(test, predictions):
    logger.info("Statistics with probability cutoff at 0.5")
    # binary predictions with some cutoff for these evaluations
    cutoff = 0.5
    predictions_binary = np.copy(predictions)
    predictions_binary[predictions_binary >= cutoff] = 1
    predictions_binary[predictions_binary < cutoff] = 0

    evaluation.print_model_statistics(test.y, predictions_binary)
    evaluation.print_confusion_matrix(test.y, predictions_binary)

    precision1 = precision_at(test.y, predictions, 0.01)
    logger.debug("Precision at 1%: {} (probability cutoff {})".format(
                 round(precision1[0], 2), precision1[1]))
    precision10 = precision_at(test.y, predictions, 0.1)
    logger.debug("Precision at 10%: {} (probability cutoff {})".format(
                 round(precision10[0], 2), precision10[1]))
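As an aside, the two masked assignments above can be collapsed into a single vectorized expression; a behavior-equivalent sketch, assuming `predictions` is a NumPy float array:

# Same thresholding in one step: boolean mask cast back to the original dtype
predictions_binary = (predictions >= cutoff).astype(predictions.dtype)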
Example #5
def test_perfect_precision_with_nas(self):
    labels = np.array([1, nan, 1, 1, 1, nan, 0, 0, 0, 0])
    scores = np.array([100, 90, 80, 70, 60, 50, 40, 30, 20, 10])
    prec, cutoff = precision_at(labels,
                                scores,
                                top_proportion=0.10,
                                ignore_nas=True)
    self.assertEqual(prec, 1.0)
    self.assertEqual(cutoff, 100)
Example #6
def test_baseline_precision(self):
    labels = np.array([1, 1, 1, 1, 1, 0, 0, 0, 0, 0])
    scores = np.array([100, 90, 80, 70, 60, 50, 40, 30, 20, 10])
    prec, cutoff = precision_at(labels, scores, proportion=1.0)
    self.assertEqual(prec, 0.5)
    self.assertEqual(cutoff, 10)
Example #7
def test_baseline_precision(self):
    labels = np.array([1, 1, 1, 1, 1, 0, 0, 0, 0, 0])
    scores = np.array([100, 90, 80, 70, 60, 50, 40, 30, 20, 10])
    prec, cutoff = precision_at(labels, scores, top_proportion=1.0)
    self.assertEqual(prec, 0.5)
    self.assertEqual(cutoff, 10)
Example #8
def log_results(model, config, test, predictions, feature_importances,
                imputer, scaler):
    '''
        Log results to a MongoDB database
    '''
    # Instantiate logger
    logger_uri = cfg_main['logger']['uri']
    logger_db = cfg_main['logger']['db']
    logger_collection = cfg_main['logger']['collection']
    mongo_logger = Logger(logger_uri, logger_db, logger_collection)
    # Compute some statistics to log
    prec_at_1, cutoff_at_1 = precision_at(test.y, predictions, 0.01)
    prec_at_5, cutoff_at_5 = precision_at(test.y, predictions, 0.05)
    prec_at_10, cutoff_at_10 = precision_at(test.y, predictions, 0.1)
    prec_at_20, cutoff_at_20 = precision_at(test.y, predictions, 0.2)

    # Add the name of the experiment if available
    experiment_name = (config["experiment_name"] if config["experiment_name"]
                       else None)
    # Sending model will log model name, parameters and datetime
    # Also log other important things by sending named parameters

    ft_imp = list(feature_importances)
    ft_map = test.feature_mapping

    mongo_id = mongo_logger.log_model(model,
                                      features=list(test.feature_names),
                                      feature_importances=ft_imp,
                                      config=config,
                                      prec_at_1=prec_at_1,
                                      cutoff_at_1=cutoff_at_1,
                                      prec_at_5=prec_at_5,
                                      cutoff_at_5=cutoff_at_5,
                                      prec_at_10=prec_at_10,
                                      cutoff_at_10=cutoff_at_10,
                                      prec_at_20=prec_at_20,
                                      cutoff_at_20=cutoff_at_20,
                                      experiment_name=experiment_name,
                                      feature_mapping=ft_map)

    # Dump test_labels, test_predictions and test_parcels to a csv file
    parcel_id = [record[0] for record in test.parcels]
    inspection_date = [record[1] for record in test.parcels]
    dump = pd.DataFrame({'parcel_id': parcel_id,
                         'inspection_date': inspection_date,
                         'viol_outcome': test.y,
                         'prediction': predictions})
    # Dump predictions to CSV
    dump.to_csv(os.path.join(path_to_predictions, mongo_id))
    # Pickle model
    if args.pickle:
        path_to_file = os.path.join(path_to_pickled_models, mongo_id)
        logger.info('Pickling model: {}'.format(path_to_file))
        joblib.dump(model, path_to_file)

        path_to_file = os.path.join(path_to_pickled_imputers, mongo_id)
        logger.info('Pickling imputer: {}'.format(path_to_file))
        joblib.dump(imputer, path_to_file)

        path_to_file = os.path.join(path_to_pickled_scalers, mongo_id)
        logger.info('Pickling scaler: {}'.format(path_to_file))
        joblib.dump(scaler, path_to_file)
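The `Logger` class is not shown in any snippet. Judging only from its usage (a constructor taking a URI, database, and collection, and a `log_model` method that accepts named values and returns an id usable as a filename), a minimal pymongo-backed sketch might be the following; the class body is hypothetical, and `model.get_params()` assumes a scikit-learn-style estimator:

import datetime
from pymongo import MongoClient

class Logger:
    # Hypothetical reconstruction; the project's real Logger may differ
    def __init__(self, uri, db, collection):
        self.collection = MongoClient(uri)[db][collection]

    def log_model(self, model, **kwargs):
        # Record the model's class name, its parameters, and a timestamp,
        # then attach whatever named values the caller sends (metrics,
        # config, feature lists, ...)
        doc = {'name': type(model).__name__,
               'parameters': model.get_params(),  # assumes a scikit-learn estimator
               'datetime': datetime.datetime.utcnow()}
        doc.update(kwargs)
        return str(self.collection.insert_one(doc).inserted_id)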
Example #9
def test_perfect_precision(self):
    labels = np.array([1, 1, 1, 1, 1, 0, 0, 0, 0, 0])
    scores = np.array([100, 90, 80, 70, 60, 50, 40, 30, 20, 10])
    prec, cutoff = precision_at(labels, scores, percent=0.10)
    self.assertEqual(prec, 1.0)
    self.assertEqual(cutoff, 100)
Example #10
def test_perfect_precision_with_nas(self):
    labels = np.array([1, nan, 1, 1, 1, nan, 0, 0, 0, 0])
    scores = np.array([100, 90, 80, 70, 60, 50, 40, 30, 20, 10])
    prec, cutoff = precision_at(labels, scores, percent=0.10, ignore_nas=True)
    self.assertEqual(prec, 1.0)
    self.assertEqual(cutoff, 100)
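The test methods in these examples are excerpts; they would normally sit inside a unittest.TestCase subclass. A minimal scaffold (class name hypothetical, one method reproduced for illustration, `precision_at` assumed importable):

import unittest
import numpy as np
from numpy import nan  # the bare `nan` used in the test bodies

class TestPrecisionAt(unittest.TestCase):
    # Hypothetical container class for the test methods shown above
    def test_baseline_precision(self):
        labels = np.array([1, 1, 1, 1, 1, 0, 0, 0, 0, 0])
        scores = np.array([100, 90, 80, 70, 60, 50, 40, 30, 20, 10])
        prec, cutoff = precision_at(labels, scores, top_proportion=1.0)
        self.assertEqual(prec, 0.5)
        self.assertEqual(cutoff, 10)

if __name__ == '__main__':
    unittest.main()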