'latitude', 'longitude', 'out_of_state', 'luxury_make', 'domestic_make' ] if __name__ == '__main__': use_project_path() logger = Logger('Data_Scratch/preprocess.txt') raw_record_count = -1 processed_records = 0 missing_data = {} for column in column_names: missing_data[column] = 0 logger.time_log('Starting Data Pre-Processing...') with open('data_scratch/Parking_Citations.csv') as input_file: with open('data_scratch/preprocessed_citations.csv', 'w', newline='') as output_file: reader = csv.reader(input_file) writer = csv.writer(output_file) writer.writerow(column_names) for i, line in enumerate(reader): raw_record_count += 1 if raw_record_count == 0: continue violation_code = line[14]
from utility import use_project_path, Logger from data import column_names if __name__ == "__main__": use_project_path() logger = Logger('data_scratch/sample_cleaned.txt') sample_seed = 1029 sample_rate = 0.10 raw_record_count = -1 processed_records = 0 random.seed(sample_seed) logger.time_log('Starting Data Sampling.') with open('data_scratch/cleaned_crimes.csv') as input_file: with open('data_scratch/sampled_cleaned_crimes.csv', 'w', newline='') as output_file: reader = csv.reader(input_file) writer = csv.writer(output_file) writer.writerow(column_names) for i, line in enumerate(reader): raw_record_count += 1 if raw_record_count == 0: continue if random.random() < sample_rate:
# Script entry point: scan the raw Chicago crimes extract and collect the
# distinct categorical code values seen in the file.
if __name__ == '__main__':
    use_project_path()
    logger = Logger('Data_Scratch/categories.txt')
    # Starts at -1 so the header row bumps it to 0 and is skipped below.
    raw_record_count = -1
    # Distinct-value collectors; dicts used as sets (value is always 1).
    iucr_codes = {}
    type_codes = {}
    description_codes = {}
    location_codes = {}
    fbi_codes = {}
    logger.time_log('Starting Category Transformation.')
    with open('data_scratch/Crimes_-_2001_to_present.csv') as input_file:
        reader = csv.reader(input_file)
        for i, line in enumerate(reader):
            raw_record_count += 1
            if raw_record_count == 0:
                # Skip the CSV header row.
                continue
            # Fixed column positions in the raw extract — presumably
            # IUCR / primary type / description / location / FBI code;
            # TODO confirm against the source CSV schema.
            iucr = line[4]
            type = line[5]  # NOTE(review): shadows the builtin `type`
            description = line[6]
            location = line[7]
            fbi_code = line[14]
            iucr_codes[iucr] = 1
            # NOTE(review): this chunk appears truncated — the other code
            # dicts declared above are never populated in the visible span.
# Top-level pre-processing script body: reads the raw crimes extract and
# writes rows re-encoded against the columns declared in `column_names`.
logger = Logger('Data_Scratch/preprocess.txt')
# Category -> code lookup tables, presumably produced by the category-scan
# script elsewhere in this project — verify the JSON files exist first.
iucr_codes = load_codes('data_scratch/iucr_codes.json')
type_codes = load_codes('data_scratch/type_codes.json')
description_codes = load_codes('data_scratch/description_codes.json')
location_codes = load_codes('data_scratch/location_codes.json')
fbi_codes = load_codes('data_scratch/fbi_codes.json')
# Per-column counter, initialized to zero for every output column.
missing_columns = {}
for column in column_names:
    missing_columns[column] = 0
# Starts at -1 so the header row bumps it to 0 and is skipped below.
raw_record_count = -1
processed_records = 0
logger.time_log('Starting Pre-Processing.')
with open('data_scratch/Crimes_-_2001_to_present.csv') as input_file:
    with open('data_scratch/preprocessed_crimes.csv', 'w', newline='') as output_file:
        reader = csv.reader(input_file)
        writer = csv.writer(output_file)
        # Emit the header for the pre-processed output file.
        writer.writerow(column_names)
        for i, line in enumerate(reader):
            raw_record_count += 1
            if raw_record_count == 0:
                # Skip the CSV header row.
                continue
            # Fixed raw-extract column positions (arrest flag, date).
            arrest = line[8]
            date = line[2]
            # NOTE(review): this chunk appears truncated here — the rest of
            # the per-row transformation is not visible in this span.
def run_classification_search_experiment(
        self, scoring, sample=None, random_state=None, test_size=0.25,
        n_jobs=-1, n_iter=2, cv=5, verbose=3, multiclass=False,
        record_predict_proba=False):
    """
    Run a Bayesian hyper-parameter search over this experiment's
    estimator, evaluate the best model on both partitions, and persist
    the winning parameters, predictions, and fitted estimator.
    """
    use_project_path()
    logger = Logger('%s.txt' % self.name)

    # Bayesian optimization over the configured search space.
    searcher = BayesSearchCV(
        self.estimator,
        self.hyper_parameters.search_space,
        n_jobs=n_jobs,
        n_iter=n_iter,
        cv=cv,
        verbose=verbose,
        scoring=scoring,
        return_train_score=True
    )

    # Optionally down-sample before splitting into train/holdout.
    frame = self.df
    if sample is not None:
        frame = frame.sample(n=sample, random_state=random_state)
    x_train, x_test, y_train, y_test = train_test_split(
        frame, frame[self.target], test_size=test_size)

    logger.time_log('Starting HyperParameter Search...')
    fitted = searcher.fit(x_train, y_train)
    logger.time_log('Search Complete.\n')

    logger.time_log('Testing Training Partition...')
    train_predictions = batch_predict(fitted.best_estimator_, x_train)
    logger.time_log('Testing Complete.\n')
    train_frame = EvaluationFrame(y_train, train_predictions)

    logger.time_log('Testing Holdout Partition...')
    test_predictions = batch_predict(fitted.best_estimator_, x_test)
    logger.time_log('Testing Complete.\n')
    test_frame = EvaluationFrame(y_test, test_predictions)
    test_frame.save('%s_predict.p' % self.name)

    # Probability predictions are recorded only on request.
    proba_frame = None
    if record_predict_proba:
        logger.time_log('Testing Holdout Partition (probability)...')
        test_probabilities = batch_predict_proba(fitted.best_estimator_, x_test)
        proba_frame = EvaluationFrame(y_test, test_probabilities)
        proba_frame.save('%s_predict_proba.p' % self.name)
        logger.time_log('Testing Complete.\n')

    evaluator = Evaluator(logger)
    evaluator.evaluate_classifier_result(
        fitted,
        test_frame,
        train=train_frame,
        test_proba=proba_frame,
        multiclass=multiclass
    )
    logger.close()

    # Keep the winning configuration and model on the experiment.
    self.hyper_parameters.params = fitted.best_params_
    self.hyper_parameters.save('%s_params.p' % self.name)
    self.trained_estimator = fitted.best_estimator_
def run_classification_experiment(
        self, sample=None, random_state=None, test_size=0.25,
        multiclass=False, record_predict_proba=False):
    """
    Run a single train-and-evaluate cycle for this experiment's
    estimator. Used when only a single model run and fit is necessary
    (no hyper-parameter search).

    :param sample: optional row count to down-sample self.df to
    :param random_state: seed used for the optional down-sampling
    :param test_size: fraction of rows held out for testing
    :param multiclass: forwarded to the evaluator for multi-class metrics
    :param record_predict_proba: also record holdout class probabilities
    """
    use_project_path()
    logger = Logger('%s.txt' % self.name)

    data_frame = self.df
    if sample is not None:
        data_frame = data_frame.sample(n=sample, random_state=random_state)
    x_train, x_test, y_train, y_test = train_test_split(
        data_frame, data_frame[self.target], test_size=test_size)

    # Apply previously selected/saved hyper-parameters when available.
    if self.hyper_parameters is not None:
        self.estimator.set_params(**self.hyper_parameters.params)

    logger.time_log('Training Model...')
    self.estimator.fit(x_train, y_train)
    logger.time_log('Training Complete.\n')

    logger.time_log('Testing Training Partition...')
    y_train_predict = batch_predict(self.estimator, x_train)
    logger.time_log('Testing Complete.\n')
    train_evaluation_frame = EvaluationFrame(y_train, y_train_predict)

    logger.time_log('Testing Holdout Partition...')
    y_test_predict = batch_predict(self.estimator, x_test)
    logger.time_log('Testing Complete.\n')
    test_evaluation_frame = EvaluationFrame(y_test, y_test_predict)
    test_evaluation_frame.save('%s_predict.p' % self.name)

    test_proba_evaluation_frame = None
    if record_predict_proba:
        logger.time_log('Testing Holdout Partition (probability)...')
        y_test_predict_proba = batch_predict_proba(self.estimator, x_test)
        test_proba_evaluation_frame = EvaluationFrame(y_test, y_test_predict_proba)
        test_proba_evaluation_frame.save('%s_predict_proba.p' % self.name)
        logger.time_log('Testing Complete.\n')

    evaluator = Evaluator(logger)
    evaluator.evaluate_classifier_result(
        self.estimator,
        test_evaluation_frame,
        train=train_evaluation_frame,
        test_proba=test_proba_evaluation_frame,
        multiclass=multiclass
    )
    logger.close()

    if self.hyper_parameters is not None:
        self.hyper_parameters.save('%s_params.p' % self.name)
    # Consistency fix: store the fitted model on the experiment, matching
    # the search/CV variants of this method elsewhere in the project.
    self.trained_estimator = self.estimator
def run_classification_experiment(
        self, sample=None, random_state=None, test_size=0.20,
        multiclass=False, record_predict_proba=False, sampling=None,
        cv=5, verbose=True, transformer=None, fit_increment=None,
        warm_start=False, max_iters=None, n_jobs=-1):
    """
    Train and evaluate this experiment's estimator, with optional feature
    transformation, training-set re-sampling, stratified cross
    validation, and incremental (batched) fitting.

    :param sample: optional row count to down-sample self.df to
    :param random_state: seed for sampling, shuffling, and fold splitting
    :param test_size: fraction of rows held out for final testing
    :param multiclass: forwarded to the evaluator for multi-class metrics
    :param record_predict_proba: also record holdout class probabilities
    :param sampling: optional re-sampler exposing fit_resample
        (imbalanced-learn style) applied to the training partition only
    :param cv: number of stratified folds; None disables cross validation
    :param verbose: forwarded to the batch fit/predict helpers
    :param transformer: optional feature transformer, fit on train only
    :param fit_increment: rows per batch when fitting incrementally;
        None fits in a single call
    :param warm_start: forwarded to the cross-fold worker
    :param max_iters: number of shuffled passes when fitting incrementally
    :param n_jobs: parallelism for the cross-validation folds
    """
    use_project_path()
    logger = Logger('%s.txt' % self.name)
    evaluator = Evaluator(logger)

    data_frame = self.df
    if sample is not None:
        data_frame = data_frame.sample(n=sample, random_state=random_state)
    x_train, x_test, y_train, y_test = train_test_split(
        data_frame, data_frame[self.target], test_size=test_size)

    # Fit the transformer on the training partition only (no leakage).
    if transformer is not None:
        logger.time_log('Fitting Transformer...')
        transformer.fit(x_train)
        logger.time_log('Transformer Fit Complete.\n')

    if sampling is not None:
        logger.time_log('Starting Data Re-Sampling...')
        logger.log('Original Training Shape is %s' % Counter(y_train))
        x_new, y_new = sampling.fit_resample(x_train, y_train)
        logger.log('Balanced Training Shape is %s' % Counter(y_new))
        # fit_resample may return a bare ndarray; restore column labels.
        if hasattr(x_train, 'columns'):
            x_new = pd.DataFrame(x_new, columns=x_train.columns)
        x_train, y_train = x_new, y_new
        logger.time_log('Re-Sampling Complete.\n')
        logger.time_log('Shuffling Re-Sampled Data.\n')
        x_train, y_train = shuffle(x_train, y_train, random_state=random_state)
        logger.time_log('Shuffling Complete.\n')

    if self.hyper_parameters is not None:
        self.estimator.set_params(**self.hyper_parameters.params)

    if cv is not None:
        # BUG FIX: passing random_state with the default shuffle=False makes
        # StratifiedKFold raise a ValueError (scikit-learn >= 0.24). Shuffle
        # only when a seed is supplied so the unseeded behavior is unchanged.
        kfold = StratifiedKFold(
            n_splits=cv,
            shuffle=random_state is not None,
            random_state=random_state)
        logger.time_log('Cross Validating Model...')
        fold_scores = Parallel(n_jobs=n_jobs, verbose=3)(
            delayed(crossfold_classifier)(
                clone(self.estimator), transformer, x_train, y_train,
                train_index, test_index, record_predict_proba, verbose,
                fit_increment, warm_start, max_iters, random_state)
            for train_index, test_index in kfold.split(x_train, y_train))
        logger.time_log('Cross Validation Complete.\n')

    logger.time_log('Training Model...')
    if fit_increment is not None:
        if max_iters is not None:
            # NOTE(review): the same random_state is reused for every pass,
            # so each epoch sees an identical shuffle order — confirm this
            # is intended before relying on multi-epoch training.
            for iteration in range(max_iters):  # renamed from `iter` (shadowed builtin)
                x_iter_train, y_iter_train = shuffle(
                    x_train, y_train, random_state=random_state)
                batch_fit_classifier(
                    self.estimator, x_iter_train, y_iter_train,
                    transformer=transformer, increment=fit_increment,
                    verbose=verbose)
        else:
            batch_fit_classifier(
                self.estimator, x_train, y_train, transformer=transformer,
                increment=fit_increment, verbose=verbose)
    else:
        if transformer is not None:
            x_train_transformed = transformer.transform(x_train)
            self.estimator.fit(x_train_transformed, y_train)
        else:
            self.estimator.fit(x_train, y_train)
    logger.time_log('Training Complete.\n')

    logger.time_log('Testing Training Partition...')
    y_train_predict = batch_predict(
        self.estimator, x_train, transformer=transformer, verbose=verbose)
    logger.time_log('Testing Complete.\n')
    train_evaluation_frame = EvaluationFrame(y_train, y_train_predict)

    logger.time_log('Testing Holdout Partition...')
    y_test_predict = batch_predict(
        self.estimator, x_test, transformer=transformer, verbose=verbose)
    logger.time_log('Testing Complete.\n')
    test_evaluation_frame = EvaluationFrame(y_test, y_test_predict)
    test_evaluation_frame.save('%s_predict.p' % self.name)

    test_proba_evaluation_frame = None
    if record_predict_proba:
        logger.time_log('Testing Holdout Partition (probability)...')
        y_test_predict_proba = batch_predict_proba(
            self.estimator, x_test, transformer=transformer, verbose=verbose)
        test_proba_evaluation_frame = EvaluationFrame(
            y_test, y_test_predict_proba)
        test_proba_evaluation_frame.save('%s_predict_proba.p' % self.name)
        logger.time_log('Testing Complete.\n')

    if cv is not None:
        evaluator.evaluate_fold_scores(fold_scores)
    evaluator.evaluate_classifier_result(
        self.estimator,
        test_evaluation_frame,
        train=train_evaluation_frame,
        test_proba=test_proba_evaluation_frame,
        multiclass=multiclass)
    logger.close()

    if self.hyper_parameters is not None:
        self.hyper_parameters.save('%s_params.p' % self.name)
    self.trained_estimator = self.estimator
def run_classification_search_experiment(
        self, scoring, sample=None, random_state=None, test_size=0.20,
        n_jobs=-1, n_iter=2, cv=5, verbose=3, multiclass=False,
        record_predict_proba=False, sampling=None):
    """
    Bayesian hyper-parameter search with optional training-set
    re-sampling. Evaluates the best estimator on both partitions and
    persists its parameters, predictions, and the fitted model.
    """
    use_project_path()
    logger = Logger('%s.txt' % self.name)
    searcher = BayesSearchCV(
        self.estimator,
        self.hyper_parameters.search_space,
        n_jobs=n_jobs,
        n_iter=n_iter,
        cv=cv,
        verbose=verbose,
        scoring=scoring,
        return_train_score=True)

    # Optionally down-sample before splitting off the holdout set.
    frame = self.df
    if sample is not None:
        frame = frame.sample(n=sample, random_state=random_state)
    x_train, x_test, y_train, y_test = train_test_split(
        frame, frame[self.target], test_size=test_size)

    # Re-balance the training partition only; the holdout stays untouched.
    if sampling is not None:
        logger.time_log('Starting Data Re-Sampling...')
        logger.log('Original Training Shape is %s' % Counter(y_train))
        resampled_x, resampled_y = sampling.fit_resample(x_train, y_train)
        logger.log('Balanced Training Shape is %s' % Counter(resampled_y))
        # fit_resample may drop the frame type; restore column labels.
        if hasattr(x_train, 'columns'):
            resampled_x = pd.DataFrame(resampled_x, columns=x_train.columns)
        x_train, y_train = resampled_x, resampled_y
        logger.time_log('Re-Sampling Complete.\n')
        logger.time_log('Shuffling Re-Sampled Data.\n')
        x_train, y_train = shuffle(x_train, y_train, random_state=random_state)
        logger.time_log('Shuffling Complete.\n')

    logger.time_log('Starting HyperParameter Search...')
    fitted = searcher.fit(x_train, y_train)
    logger.time_log('Search Complete.\n')

    logger.time_log('Testing Training Partition...')
    predictions_train = batch_predict(fitted.best_estimator_, x_train)
    logger.time_log('Testing Complete.\n')
    train_eval = EvaluationFrame(y_train, predictions_train)

    logger.time_log('Testing Holdout Partition...')
    predictions_test = batch_predict(fitted.best_estimator_, x_test)
    logger.time_log('Testing Complete.\n')
    test_eval = EvaluationFrame(y_test, predictions_test)
    test_eval.save('%s_predict.p' % self.name)

    proba_eval = None
    if record_predict_proba:
        logger.time_log('Testing Holdout Partition (probability)...')
        probabilities_test = batch_predict_proba(fitted.best_estimator_, x_test)
        proba_eval = EvaluationFrame(y_test, probabilities_test)
        proba_eval.save('%s_predict_proba.p' % self.name)
        logger.time_log('Testing Complete.\n')

    evaluator = Evaluator(logger)
    evaluator.evaluate_classifier_result(
        fitted,
        test_eval,
        train=train_eval,
        test_proba=proba_eval,
        multiclass=multiclass)
    logger.close()

    # Persist the winning configuration and keep the model in memory.
    self.hyper_parameters.params = fitted.best_params_
    self.hyper_parameters.save('%s_params.p' % self.name)
    self.trained_estimator = fitted.best_estimator_