예제 #1
0
    'latitude', 'longitude', 'out_of_state', 'luxury_make', 'domestic_make'
]

if __name__ == '__main__':
    use_project_path()

    logger = Logger('Data_Scratch/preprocess.txt')

    raw_record_count = -1
    processed_records = 0

    missing_data = {}
    for column in column_names:
        missing_data[column] = 0

    logger.time_log('Starting Data Pre-Processing...')

    with open('data_scratch/Parking_Citations.csv') as input_file:
        with open('data_scratch/preprocessed_citations.csv', 'w',
                  newline='') as output_file:
            reader = csv.reader(input_file)
            writer = csv.writer(output_file)

            writer.writerow(column_names)

            for i, line in enumerate(reader):
                raw_record_count += 1
                if raw_record_count == 0:
                    continue

                violation_code = line[14]
from utility import use_project_path, Logger
from data import column_names

if __name__ == "__main__":
    use_project_path()

    logger = Logger('data_scratch/sample_cleaned.txt')

    sample_seed = 1029
    sample_rate = 0.10
    raw_record_count = -1
    processed_records = 0

    random.seed(sample_seed)

    logger.time_log('Starting Data Sampling.')

    with open('data_scratch/cleaned_crimes.csv') as input_file:
        with open('data_scratch/sampled_cleaned_crimes.csv', 'w',
                  newline='') as output_file:
            reader = csv.reader(input_file)
            writer = csv.writer(output_file)

            writer.writerow(column_names)

            for i, line in enumerate(reader):
                raw_record_count += 1
                if raw_record_count == 0:
                    continue

                if random.random() < sample_rate:

if __name__ == '__main__':
    use_project_path()

    logger = Logger('Data_Scratch/categories.txt')

    raw_record_count = -1

    iucr_codes = {}
    type_codes = {}
    description_codes = {}
    location_codes = {}
    fbi_codes = {}

    logger.time_log('Starting Category Transformation.')
    with open('data_scratch/Crimes_-_2001_to_present.csv') as input_file:
        reader = csv.reader(input_file)

        for i, line in enumerate(reader):
            raw_record_count += 1
            if raw_record_count == 0:
                continue

            iucr = line[4]
            type = line[5]
            description = line[6]
            location = line[7]
            fbi_code = line[14]

            iucr_codes[iucr] = 1
    logger = Logger('Data_Scratch/preprocess.txt')
    iucr_codes = load_codes('data_scratch/iucr_codes.json')
    type_codes = load_codes('data_scratch/type_codes.json')
    description_codes = load_codes('data_scratch/description_codes.json')
    location_codes = load_codes('data_scratch/location_codes.json')
    fbi_codes = load_codes('data_scratch/fbi_codes.json')

    missing_columns = {}
    for column in column_names:
        missing_columns[column] = 0

    raw_record_count = -1
    processed_records = 0

    logger.time_log('Starting Pre-Processing.')
    with open('data_scratch/Crimes_-_2001_to_present.csv') as input_file:
        with open('data_scratch/preprocessed_crimes.csv', 'w',
                  newline='') as output_file:
            reader = csv.reader(input_file)
            writer = csv.writer(output_file)

            writer.writerow(column_names)

            for i, line in enumerate(reader):
                raw_record_count += 1
                if raw_record_count == 0:
                    continue

                arrest = line[8]
                date = line[2]
예제 #5
0
    def run_classification_search_experiment(
            self,
            scoring,
            sample=None,
            random_state=None,
            test_size=0.25,
            n_jobs=-1,
            n_iter=2,
            cv=5,
            verbose=3,
            multiclass=False,
            record_predict_proba=False):
        """
        Run a Bayesian hyper-parameter search (BayesSearchCV) over the
        estimator's search space, then evaluate the best estimator on the
        training and holdout partitions.

        :param scoring: scoring metric (name or callable) for the search.
        :param sample: optional number of rows to sub-sample before splitting.
        :param random_state: seed used for sub-sampling and the train/test
            split, making runs reproducible when set.
        :param test_size: fraction of rows held out for the test partition.
        :param n_jobs: parallel workers used by the search.
        :param n_iter: number of parameter settings sampled by the search.
        :param cv: number of cross-validation folds inside the search.
        :param verbose: verbosity level of the search.
        :param multiclass: forwarded to the evaluator for multi-class metrics.
        :param record_predict_proba: when True, also record and persist class
            probabilities for the holdout partition.

        Side effects: writes '<name>.txt' log, '<name>_predict.p',
        optionally '<name>_predict_proba.p', and '<name>_params.p'; stores
        the fitted best estimator on ``self.trained_estimator``.
        """
        use_project_path()

        logger = Logger('%s.txt' % self.name)

        search = BayesSearchCV(
            self.estimator,
            self.hyper_parameters.search_space,
            n_jobs=n_jobs,
            n_iter=n_iter,
            cv=cv,
            verbose=verbose,
            scoring=scoring,
            return_train_score=True
        )

        data_frame = self.df

        if sample is not None:
            data_frame = data_frame.sample(n=sample, random_state=random_state)

        # BUG FIX: previously the split ignored random_state, so experiments
        # were not reproducible even when a seed was supplied.
        # NOTE(review): the full frame (including the target column) is passed
        # as X, matching the project's other experiment methods — presumably
        # downstream estimators drop the target; verify against the pipeline.
        x_train, x_test, y_train, y_test = train_test_split(
            data_frame, data_frame[self.target],
            test_size=test_size, random_state=random_state)

        logger.time_log('Starting HyperParameter Search...')
        results = search.fit(x_train, y_train)
        logger.time_log('Search Complete.\n')

        logger.time_log('Testing Training Partition...')
        y_train_predict = batch_predict(results.best_estimator_, x_train)
        logger.time_log('Testing Complete.\n')

        train_evaluation_frame = EvaluationFrame(y_train, y_train_predict)

        logger.time_log('Testing Holdout Partition...')
        y_test_predict = batch_predict(results.best_estimator_, x_test)
        logger.time_log('Testing Complete.\n')

        test_evaluation_frame = EvaluationFrame(y_test, y_test_predict)
        test_evaluation_frame.save('%s_predict.p' % self.name)

        test_proba_evaluation_frame = None
        if record_predict_proba:
            logger.time_log('Testing Holdout Partition (probability)...')
            y_test_predict_proba = batch_predict_proba(results.best_estimator_, x_test)
            test_proba_evaluation_frame = EvaluationFrame(y_test, y_test_predict_proba)
            test_proba_evaluation_frame.save('%s_predict_proba.p' % self.name)
            logger.time_log('Testing Complete.\n')

        evaluator = Evaluator(logger)
        evaluator.evaluate_classifier_result(
            results,
            test_evaluation_frame,
            train=train_evaluation_frame,
            test_proba=test_proba_evaluation_frame,
            multiclass=multiclass
        )

        logger.close()

        # Persist the winning hyper-parameters so later single-fit runs
        # (run_classification_experiment) can reuse them.
        self.hyper_parameters.params = results.best_params_
        self.hyper_parameters.save('%s_params.p' % self.name)

        self.trained_estimator = results.best_estimator_
예제 #6
0
    def run_classification_experiment(
            self,
            sample=None,
            random_state=None,
            test_size=0.25,
            multiclass=False,
            record_predict_proba=False):
        """
        Run a single train/evaluate cycle for the configured estimator
        (no hyper-parameter search) and persist the evaluation artifacts.

        :param sample: optional number of rows to sub-sample before splitting.
        :param random_state: seed used for sub-sampling and the train/test
            split, making runs reproducible when set.
        :param test_size: fraction of rows held out for the test partition.
        :param multiclass: forwarded to the evaluator for multi-class metrics.
        :param record_predict_proba: when True, also record and persist class
            probabilities for the holdout partition.

        Side effects: writes '<name>.txt' log, '<name>_predict.p',
        optionally '<name>_predict_proba.p' and '<name>_params.p'.
        """
        use_project_path()

        logger = Logger('%s.txt' % self.name)

        data_frame = self.df

        if sample is not None:
            data_frame = data_frame.sample(n=sample, random_state=random_state)

        # BUG FIX: previously the split ignored random_state, so experiments
        # were not reproducible even when a seed was supplied.
        x_train, x_test, y_train, y_test = train_test_split(
            data_frame, data_frame[self.target],
            test_size=test_size, random_state=random_state)

        # Apply previously-saved hyper-parameters (e.g. from a search run)
        # before fitting, when available.
        if self.hyper_parameters is not None:
            self.estimator.set_params(**self.hyper_parameters.params)

        logger.time_log('Training Model...')
        self.estimator.fit(x_train, y_train)
        logger.time_log('Training Complete.\n')

        logger.time_log('Testing Training Partition...')
        y_train_predict = batch_predict(self.estimator, x_train)
        logger.time_log('Testing Complete.\n')

        train_evaluation_frame = EvaluationFrame(y_train, y_train_predict)

        logger.time_log('Testing Holdout Partition...')
        y_test_predict = batch_predict(self.estimator, x_test)
        logger.time_log('Testing Complete.\n')

        test_evaluation_frame = EvaluationFrame(y_test, y_test_predict)
        test_evaluation_frame.save('%s_predict.p' % self.name)

        test_proba_evaluation_frame = None
        if record_predict_proba:
            logger.time_log('Testing Holdout Partition (probability)...')
            y_test_predict_proba = batch_predict_proba(self.estimator, x_test)
            test_proba_evaluation_frame = EvaluationFrame(y_test, y_test_predict_proba)
            test_proba_evaluation_frame.save('%s_predict_proba.p' % self.name)
            logger.time_log('Testing Complete.\n')

        evaluator = Evaluator(logger)
        evaluator.evaluate_classifier_result(
            self.estimator,
            test_evaluation_frame,
            train=train_evaluation_frame,
            test_proba=test_proba_evaluation_frame,
            multiclass=multiclass
        )

        logger.close()

        if self.hyper_parameters is not None:
            self.hyper_parameters.save('%s_params.p' % self.name)
예제 #7
0
    def run_classification_experiment(self,
                                      sample=None,
                                      random_state=None,
                                      test_size=0.20,
                                      multiclass=False,
                                      record_predict_proba=False,
                                      sampling=None,
                                      cv=5,
                                      verbose=True,
                                      transformer=None,
                                      fit_increment=None,
                                      warm_start=False,
                                      max_iters=None,
                                      n_jobs=-1):
        """
        Run a full classification experiment: optional re-sampling (e.g.
        SMOTE-style balancing), optional feature transformer, optional
        stratified cross-validation, then a final fit (incremental or whole)
        followed by evaluation on the training and holdout partitions.

        :param sample: optional number of rows to sub-sample before splitting.
        :param random_state: seed used for sub-sampling, the train/test split,
            post-resampling shuffles, and the CV fold generator.
        :param test_size: fraction of rows held out for the test partition.
        :param multiclass: forwarded to the evaluator for multi-class metrics.
        :param record_predict_proba: also record holdout class probabilities.
        :param sampling: optional re-sampler exposing ``fit_resample``.
        :param cv: number of stratified CV folds; None disables CV.
        :param verbose: verbosity flag forwarded to the batch helpers.
        :param transformer: optional transformer fit on the training partition
            and applied before every fit/predict.
        :param fit_increment: batch size for incremental fitting; None fits
            the estimator on the whole training partition in one call.
        :param warm_start: forwarded to the fold-level fitting helper.
        :param max_iters: number of shuffled passes when incrementally fitting.
        :param n_jobs: parallel workers for cross-validation.

        Side effects: writes '<name>.txt' log, '<name>_predict.p', optionally
        '<name>_predict_proba.p' and '<name>_params.p'; stores the fitted
        estimator on ``self.trained_estimator``.
        """
        use_project_path()

        logger = Logger('%s.txt' % self.name)
        evaluator = Evaluator(logger)

        data_frame = self.df

        if sample is not None:
            data_frame = data_frame.sample(n=sample, random_state=random_state)

        # BUG FIX: previously the split ignored random_state, so experiments
        # were not reproducible even when a seed was supplied.
        x_train, x_test, y_train, y_test = train_test_split(
            data_frame, data_frame[self.target],
            test_size=test_size, random_state=random_state)

        if transformer is not None:
            # Fit only on the training partition to avoid holdout leakage.
            logger.time_log('Fitting Transformer...')
            transformer.fit(x_train)
            logger.time_log('Transformer Fit Complete.\n')

        if sampling is not None:
            logger.time_log('Starting Data Re-Sampling...')
            logger.log('Original Training Shape is %s' % Counter(y_train))
            x_new, y_new = sampling.fit_resample(x_train, y_train)
            logger.log('Balanced Training Shape is %s' % Counter(y_new))
            # Re-samplers may return bare arrays; restore the DataFrame
            # column labels when the input had them.
            if hasattr(x_train, 'columns'):
                x_new = pd.DataFrame(x_new, columns=x_train.columns)
            x_train, y_train = x_new, y_new
            logger.time_log('Re-Sampling Complete.\n')
            logger.time_log('Shuffling Re-Sampled Data.\n')
            x_train, y_train = shuffle(x_train,
                                       y_train,
                                       random_state=random_state)
            logger.time_log('Shuffling Complete.\n')

        if self.hyper_parameters is not None:
            self.estimator.set_params(**self.hyper_parameters.params)

        if cv is not None:
            # BUG FIX: StratifiedKFold raises ValueError in modern
            # scikit-learn when random_state is set with shuffle=False;
            # shuffle only when a seed is supplied so the default behavior
            # (random_state=None) is unchanged.
            kfold = StratifiedKFold(n_splits=cv,
                                    shuffle=random_state is not None,
                                    random_state=random_state)
            logger.time_log('Cross Validating Model...')
            fold_scores = Parallel(n_jobs=n_jobs, verbose=3)(
                delayed(crossfold_classifier)
                (clone(self.estimator), transformer, x_train, y_train,
                 train_index, test_index, record_predict_proba, verbose,
                 fit_increment, warm_start, max_iters, random_state)
                for train_index, test_index in kfold.split(x_train, y_train))
            logger.time_log('Cross Validation Complete.\n')

        logger.time_log('Training Model...')
        if fit_increment is not None:
            if max_iters is not None:
                # Multiple shuffled passes over the data in mini-batches.
                # (Renamed loop variable: 'iter' shadowed the builtin.)
                for iteration in range(max_iters):
                    x_iter_train, y_iter_train = shuffle(
                        x_train, y_train, random_state=random_state)
                    batch_fit_classifier(self.estimator,
                                         x_iter_train,
                                         y_iter_train,
                                         transformer=transformer,
                                         increment=fit_increment,
                                         verbose=verbose)
            else:
                batch_fit_classifier(self.estimator,
                                     x_train,
                                     y_train,
                                     transformer=transformer,
                                     increment=fit_increment,
                                     verbose=verbose)
        else:
            if transformer is not None:
                x_train_transformed = transformer.transform(x_train)
                self.estimator.fit(x_train_transformed, y_train)
            else:
                self.estimator.fit(x_train, y_train)
        logger.time_log('Training Complete.\n')

        logger.time_log('Testing Training Partition...')
        y_train_predict = batch_predict(self.estimator,
                                        x_train,
                                        transformer=transformer,
                                        verbose=verbose)
        logger.time_log('Testing Complete.\n')

        train_evaluation_frame = EvaluationFrame(y_train, y_train_predict)

        logger.time_log('Testing Holdout Partition...')
        y_test_predict = batch_predict(self.estimator,
                                       x_test,
                                       transformer=transformer,
                                       verbose=verbose)
        logger.time_log('Testing Complete.\n')

        test_evaluation_frame = EvaluationFrame(y_test, y_test_predict)
        test_evaluation_frame.save('%s_predict.p' % self.name)

        test_proba_evaluation_frame = None
        if record_predict_proba:
            logger.time_log('Testing Holdout Partition (probability)...')
            y_test_predict_proba = batch_predict_proba(self.estimator,
                                                       x_test,
                                                       transformer=transformer,
                                                       verbose=verbose)
            test_proba_evaluation_frame = EvaluationFrame(
                y_test, y_test_predict_proba)
            test_proba_evaluation_frame.save('%s_predict_proba.p' % self.name)
            logger.time_log('Testing Complete.\n')

        if cv is not None:
            evaluator.evaluate_fold_scores(fold_scores)

        evaluator.evaluate_classifier_result(
            self.estimator,
            test_evaluation_frame,
            train=train_evaluation_frame,
            test_proba=test_proba_evaluation_frame,
            multiclass=multiclass)

        logger.close()

        if self.hyper_parameters is not None:
            self.hyper_parameters.save('%s_params.p' % self.name)

        self.trained_estimator = self.estimator
예제 #8
0
    def run_classification_search_experiment(self,
                                             scoring,
                                             sample=None,
                                             random_state=None,
                                             test_size=0.20,
                                             n_jobs=-1,
                                             n_iter=2,
                                             cv=5,
                                             verbose=3,
                                             multiclass=False,
                                             record_predict_proba=False,
                                             sampling=None):
        """
        Run a Bayesian hyper-parameter search (BayesSearchCV), with optional
        training-set re-sampling, then evaluate the best estimator on the
        training and holdout partitions.

        :param scoring: scoring metric (name or callable) for the search.
        :param sample: optional number of rows to sub-sample before splitting.
        :param random_state: seed used for sub-sampling, the train/test split,
            and the post-resampling shuffle.
        :param test_size: fraction of rows held out for the test partition.
        :param n_jobs: parallel workers used by the search.
        :param n_iter: number of parameter settings sampled by the search.
        :param cv: number of cross-validation folds inside the search.
        :param verbose: verbosity level of the search.
        :param multiclass: forwarded to the evaluator for multi-class metrics.
        :param record_predict_proba: also record holdout class probabilities.
        :param sampling: optional re-sampler exposing ``fit_resample``,
            applied to the training partition only.

        Side effects: writes '<name>.txt' log, '<name>_predict.p', optionally
        '<name>_predict_proba.p', and '<name>_params.p'; stores the fitted
        best estimator on ``self.trained_estimator``.
        """
        use_project_path()

        logger = Logger('%s.txt' % self.name)

        search = BayesSearchCV(self.estimator,
                               self.hyper_parameters.search_space,
                               n_jobs=n_jobs,
                               n_iter=n_iter,
                               cv=cv,
                               verbose=verbose,
                               scoring=scoring,
                               return_train_score=True)

        data_frame = self.df

        if sample is not None:
            data_frame = data_frame.sample(n=sample, random_state=random_state)

        # BUG FIX: previously the split ignored random_state, so experiments
        # were not reproducible even when a seed was supplied.
        x_train, x_test, y_train, y_test = train_test_split(
            data_frame, data_frame[self.target],
            test_size=test_size, random_state=random_state)

        if sampling is not None:
            # Re-sample only the training partition so the holdout keeps the
            # original class distribution.
            logger.time_log('Starting Data Re-Sampling...')
            logger.log('Original Training Shape is %s' % Counter(y_train))
            x_new, y_new = sampling.fit_resample(x_train, y_train)
            logger.log('Balanced Training Shape is %s' % Counter(y_new))
            # Re-samplers may return bare arrays; restore DataFrame columns.
            if hasattr(x_train, 'columns'):
                x_new = pd.DataFrame(x_new, columns=x_train.columns)
            x_train, y_train = x_new, y_new
            logger.time_log('Re-Sampling Complete.\n')
            logger.time_log('Shuffling Re-Sampled Data.\n')
            x_train, y_train = shuffle(x_train,
                                       y_train,
                                       random_state=random_state)
            logger.time_log('Shuffling Complete.\n')

        logger.time_log('Starting HyperParameter Search...')
        results = search.fit(x_train, y_train)
        logger.time_log('Search Complete.\n')

        logger.time_log('Testing Training Partition...')
        y_train_predict = batch_predict(results.best_estimator_, x_train)
        logger.time_log('Testing Complete.\n')

        train_evaluation_frame = EvaluationFrame(y_train, y_train_predict)

        logger.time_log('Testing Holdout Partition...')
        y_test_predict = batch_predict(results.best_estimator_, x_test)
        logger.time_log('Testing Complete.\n')

        test_evaluation_frame = EvaluationFrame(y_test, y_test_predict)
        test_evaluation_frame.save('%s_predict.p' % self.name)

        test_proba_evaluation_frame = None
        if record_predict_proba:
            logger.time_log('Testing Holdout Partition (probability)...')
            y_test_predict_proba = batch_predict_proba(results.best_estimator_,
                                                       x_test)
            test_proba_evaluation_frame = EvaluationFrame(
                y_test, y_test_predict_proba)
            test_proba_evaluation_frame.save('%s_predict_proba.p' % self.name)
            logger.time_log('Testing Complete.\n')

        evaluator = Evaluator(logger)
        evaluator.evaluate_classifier_result(
            results,
            test_evaluation_frame,
            train=train_evaluation_frame,
            test_proba=test_proba_evaluation_frame,
            multiclass=multiclass)

        logger.close()

        # Persist the winning hyper-parameters so later single-fit runs
        # can reuse them.
        self.hyper_parameters.params = results.best_params_
        self.hyper_parameters.save('%s_params.p' % self.name)

        self.trained_estimator = results.best_estimator_