def add_flags_to_predictions_df(df):
    '''
    Given a data frame with the following format:
        - Indexes: parcel_id, inspection_date
        - Columns: prediction, viol_outcome

    Attach columns with various metrics and flags
    '''
    # Copy df to avoid overwriting
    df = deepcopy(df)

    # Calculate precisions at 1 and 10 percent
    prec1, cutoff1 = precision_at(df.viol_outcome, df.prediction, percent=0.01)
    prec10, cutoff10 = precision_at(df.viol_outcome, df.prediction, percent=0.1)

    # Add columns
    df['top_1'] = df.prediction.map(lambda x: x >= cutoff1)
    df['top_10'] = df.prediction.map(lambda x: x >= cutoff10)
    df['tp_top_1'] = df.top_1 & df.viol_outcome
    df['tp_top_10'] = df.top_10 & df.viol_outcome
    df['fp_top_1'] = df.top_1 & ~df.viol_outcome
    df['fp_top_10'] = df.top_10 & ~df.viol_outcome

    return df
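# A minimal usage sketch for add_flags_to_predictions_df, not part of the original
# module: it assumes pandas is imported as pd and that deepcopy and precision_at are
# in scope; the parcel ids, dates, and scores below are made-up placeholder values.
toy = pd.DataFrame({'parcel_id': ['p1', 'p2', 'p3', 'p4'],
                    'inspection_date': ['2015-06-01'] * 4,
                    'prediction': [0.92, 0.75, 0.40, 0.05],
                    'viol_outcome': [True, False, True, False]})
toy = toy.set_index(['parcel_id', 'inspection_date'])

flagged = add_flags_to_predictions_df(toy)
print(flagged[['top_1', 'top_10', 'tp_top_10', 'fp_top_10']])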
def test_baseline_precision_with_nas(self):
    labels = np.array([nan, 1, nan, 1, 1, nan, nan, 0, 0, 0])
    scores = np.array([100, 90, 80, 70, 60, 50, 40, 30, 20, 10])
    prec, cutoff = precision_at(labels, scores, proportion=1.0,
                                ignore_nas=True)
    self.assertEqual(prec, 0.5)
    self.assertEqual(cutoff, 10)
def log_results(model, config, test, predictions, feature_importances,
                imputer, scaler):
    '''
    Log results to a MongoDB database
    '''
    # Instantiate logger
    logger_uri = cfg_main['logger']['uri']
    logger_db = cfg_main['logger']['db']
    logger_collection = cfg_main['logger']['collection']
    mongo_logger = Logger(logger_uri, logger_db, logger_collection)

    # Compute some statistics to log
    prec_at_1, cutoff_at_1 = precision_at(test.y, predictions, 0.01)
    prec_at_10, cutoff_at_10 = precision_at(test.y, predictions, 0.1)

    # Add the name of the experiment if available
    experiment_name = config["experiment_name"] if config["experiment_name"] else None

    # Sending model will log model name, parameters and datetime
    # Also log other important things by sending named parameters
    mongo_id = mongo_logger.log_model(model,
                                      features=list(test.feature_names),
                                      feature_importances=list(feature_importances),
                                      config=config,
                                      prec_at_1=prec_at_1,
                                      prec_at_10=prec_at_10,
                                      cutoff_at_1=cutoff_at_1,
                                      cutoff_at_10=cutoff_at_10,
                                      experiment_name=experiment_name,
                                      feature_mapping=test.feature_mapping)

    # Dump test_labels, test_predictions and test_parcels to a csv file
    parcel_id = [record[0] for record in test.parcels]
    inspection_date = [record[1] for record in test.parcels]
    dump = pd.DataFrame({'parcel_id': parcel_id,
                         'inspection_date': inspection_date,
                         'viol_outcome': test.y,
                         'prediction': predictions})

    # Dump predictions to CSV
    dump.to_csv(os.path.join(path_to_predictions, mongo_id))

    # Pickle model
    if args.pickle:
        path_to_file = os.path.join(path_to_pickled_models, mongo_id)
        logger.info('Pickling model: {}'.format(path_to_file))
        joblib.dump(model, path_to_file)

        path_to_file = os.path.join(path_to_pickled_imputers, mongo_id)
        logger.info('Pickling imputer: {}'.format(path_to_file))
        joblib.dump(imputer, path_to_file)

        path_to_file = os.path.join(path_to_pickled_scalers, mongo_id)
        logger.info('Pickling scaler: {}'.format(path_to_file))
        joblib.dump(scaler, path_to_file)
def output_evaluation_statistics(test, predictions):
    logger.info("Statistics with probability cutoff at 0.5")
    # binary predictions with some cutoff for these evaluations
    cutoff = 0.5
    predictions_binary = np.copy(predictions)
    predictions_binary[predictions_binary >= cutoff] = 1
    predictions_binary[predictions_binary < cutoff] = 0

    evaluation.print_model_statistics(test.y, predictions_binary)
    evaluation.print_confusion_matrix(test.y, predictions_binary)

    precision1 = precision_at(test.y, predictions, 0.01)
    logger.debug("Precision at 1%: {} (probability cutoff {})".format(
        round(precision1[0], 2), precision1[1]))
    precision10 = precision_at(test.y, predictions, 0.1)
    logger.debug("Precision at 10%: {} (probability cutoff {})".format(
        round(precision10[0], 2), precision10[1]))
def test_perfect_precision_with_nas(self):
    labels = np.array([1, nan, 1, 1, 1, nan, 0, 0, 0, 0])
    scores = np.array([100, 90, 80, 70, 60, 50, 40, 30, 20, 10])
    prec, cutoff = precision_at(labels, scores, top_proportion=0.10,
                                ignore_nas=True)
    self.assertEqual(prec, 1.0)
    self.assertEqual(cutoff, 100)
def test_baseline_precision(self):
    labels = np.array([1, 1, 1, 1, 1, 0, 0, 0, 0, 0])
    scores = np.array([100, 90, 80, 70, 60, 50, 40, 30, 20, 10])
    prec, cutoff = precision_at(labels, scores, proportion=1.0)
    self.assertEqual(prec, 0.5)
    self.assertEqual(cutoff, 10)
def test_baseline_precision(self):
    labels = np.array([1, 1, 1, 1, 1, 0, 0, 0, 0, 0])
    scores = np.array([100, 90, 80, 70, 60, 50, 40, 30, 20, 10])
    prec, cutoff = precision_at(labels, scores, top_proportion=1.0)
    self.assertEqual(prec, 0.5)
    self.assertEqual(cutoff, 10)
def log_results(model, config, test, predictions, feature_importances,
                imputer, scaler):
    '''
    Log results to a MongoDB database
    '''
    # Instantiate logger
    logger_uri = cfg_main['logger']['uri']
    logger_db = cfg_main['logger']['db']
    logger_collection = cfg_main['logger']['collection']
    mongo_logger = Logger(logger_uri, logger_db, logger_collection)

    # Compute some statistics to log
    prec_at_1, cutoff_at_1 = precision_at(test.y, predictions, 0.01)
    prec_at_5, cutoff_at_5 = precision_at(test.y, predictions, 0.05)
    prec_at_10, cutoff_at_10 = precision_at(test.y, predictions, 0.1)
    prec_at_20, cutoff_at_20 = precision_at(test.y, predictions, 0.2)

    # Add the name of the experiment if available
    experiment_name = (config["experiment_name"]
                       if config["experiment_name"] else None)

    # Sending model will log model name, parameters and datetime
    # Also log other important things by sending named parameters
    ft_imp = list(feature_importances)
    ft_map = test.feature_mapping
    mongo_id = mongo_logger.log_model(model,
                                      features=list(test.feature_names),
                                      feature_importances=ft_imp,
                                      config=config,
                                      prec_at_1=prec_at_1,
                                      cutoff_at_1=cutoff_at_1,
                                      prec_at_5=prec_at_5,
                                      cutoff_at_5=cutoff_at_5,
                                      prec_at_10=prec_at_10,
                                      cutoff_at_10=cutoff_at_10,
                                      prec_at_20=prec_at_20,
                                      cutoff_at_20=cutoff_at_20,
                                      experiment_name=experiment_name,
                                      feature_mapping=ft_map)

    # Dump test_labels, test_predictions and test_parcels to a csv file
    parcel_id = [record[0] for record in test.parcels]
    inspection_date = [record[1] for record in test.parcels]
    dump = pd.DataFrame({'parcel_id': parcel_id,
                         'inspection_date': inspection_date,
                         'viol_outcome': test.y,
                         'prediction': predictions})

    # Dump predictions to CSV
    dump.to_csv(os.path.join(path_to_predictions, mongo_id))

    # Pickle model
    if args.pickle:
        path_to_file = os.path.join(path_to_pickled_models, mongo_id)
        logger.info('Pickling model: {}'.format(path_to_file))
        joblib.dump(model, path_to_file)

        path_to_file = os.path.join(path_to_pickled_imputers, mongo_id)
        logger.info('Pickling imputer: {}'.format(path_to_file))
        joblib.dump(imputer, path_to_file)

        path_to_file = os.path.join(path_to_pickled_scalers, mongo_id)
        logger.info('Pickling scaler: {}'.format(path_to_file))
        joblib.dump(scaler, path_to_file)
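# A hedged sketch, not part of the original module, of reloading a pickled run for
# later scoring. It assumes joblib is importable as a standalone package and that
# mongo_id and the path_to_pickled_* directories match the ones used by log_results
# above; all of these names are taken from that function, not newly defined here.
import os
import joblib

def load_run(mongo_id, path_to_pickled_models, path_to_pickled_imputers,
             path_to_pickled_scalers):
    # Each artifact was dumped under the MongoDB document id as its filename
    model = joblib.load(os.path.join(path_to_pickled_models, mongo_id))
    imputer = joblib.load(os.path.join(path_to_pickled_imputers, mongo_id))
    scaler = joblib.load(os.path.join(path_to_pickled_scalers, mongo_id))
    return model, imputer, scaler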
def test_perfect_precision(self):
    labels = np.array([1, 1, 1, 1, 1, 0, 0, 0, 0, 0])
    scores = np.array([100, 90, 80, 70, 60, 50, 40, 30, 20, 10])
    prec, cutoff = precision_at(labels, scores, percent=0.10)
    self.assertEqual(prec, 1.0)
    self.assertEqual(cutoff, 100)
def test_perfect_precision_with_nas(self):
    labels = np.array([1, nan, 1, 1, 1, nan, 0, 0, 0, 0])
    scores = np.array([100, 90, 80, 70, 60, 50, 40, 30, 20, 10])
    prec, cutoff = precision_at(labels, scores, percent=0.10, ignore_nas=True)
    self.assertEqual(prec, 1.0)
    self.assertEqual(cutoff, 100)
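# The tests above exercise a precision_at helper whose implementation is not shown
# in this section, and whose keyword for the top fraction varies across snippets
# (percent, proportion, top_proportion). The sketch below is an assumed reference
# implementation consistent with the asserted behaviour, not the project's actual
# code: it returns the precision among the top-scored fraction of observations and
# the lowest score that still falls inside that fraction.
import numpy as np

def precision_at(labels, scores, top_proportion, ignore_nas=False):
    '''Precision among the top_proportion highest-scored observations.

    Returns (precision, cutoff), where cutoff is the smallest score included
    in the selected top fraction.
    '''
    labels = np.asarray(labels, dtype=float)
    scores = np.asarray(scores, dtype=float)

    if ignore_nas:
        # Drop observations with unknown outcome before selecting the top fraction
        keep = ~np.isnan(labels)
        labels, scores = labels[keep], scores[keep]

    # Sort by score, highest first, and keep at least one observation
    order = np.argsort(scores)[::-1]
    n_top = max(1, int(np.ceil(top_proportion * len(scores))))
    top = order[:n_top]

    precision = labels[top].mean()
    cutoff = scores[top].min()
    return precision, cutoff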