Exemplo n.º 1
0
def main():
    """Parse CLI arguments, load a dataset (plus an optional JSON dtype map),
    and run 10-fold cross validation on a neural network built from a
    configuration file.

    Side effects: reads files under DATA_PATH, prints progress to stdout.
    """
    parser = argparse.ArgumentParser(
        description='Multilayer neural network parser')
    parser.add_argument('-d',
                        '--dataset',
                        help='The name (without extension) of the dataset',
                        required=True)
    parser.add_argument('-n',
                        '--network',
                        help='The filename of the network configuration',
                        required=False)
    args = parser.parse_args()

    # Optional per-column dtype map stored next to the dataset.
    # Catch only the expected failures (missing/unreadable file, bad JSON)
    # instead of a bare except, which would also hide real bugs.
    try:
        with open(DATA_PATH + args.dataset + '.json', 'r') as filetypes:
            types = json.load(filetypes)
    except (OSError, json.JSONDecodeError):
        print('Dataset types not found, automatic types will be used.')
        types = {}

    df = preprocess(
        pd.read_csv(DATA_PATH + args.dataset + '.tsv', sep='\t', dtype=types),
        types)
    # Use a context manager so the network config file is always closed
    # (the original leaked the handle). Cross validation runs inside the
    # block in case NeuralNetwork reads the file lazily.
    with open(args.network, 'r') as network_file:
        cv = CrossValidator(
            NeuralNetwork(network_file=network_file,
                          target_attribute='target',
                          data_instance=df.iloc[0]))
        cv.cross_validate(df, 10, 1)
Exemplo n.º 2
0
	def setUpClass(self):
		"""One-time fixture setup shared by all tests in the class.

		NOTE(review): by convention the first parameter of setUpClass is
		named cls (it receives the class object, not an instance).
		"""
		self.DEBUG = False
		self.METRICS = False

		# shared helper instances used by the tests below
		self.data_api_impl = DataApi('../../../data/')
		self.cross_validator_impl = CrossValidator()
		self.preprocessor_impl = Preprocessor()
    def __init__(self):
        """Wire up the collaborators needed to run a demo experiment."""
        # logger instance - VERBOSE level is highest (most verbose) level for logging
        self.logger = Logger('DEMO')  # configure log level here

        # datalayer instance - read csv data files and convert into raw data frames
        self.datalayer = DataApi('../../data/')
        # preprocessor instance - everything for preprocessing data frames
        self.preprocessor = Preprocessor()
        # cross_validator instance - setup cross validation partitions
        self.cross_validator = CrossValidator()
        # utils instance - random things
        self.utils = Utils()
Exemplo n.º 4
0
def main():
    """Parse CLI arguments and either run the benchmark suites or cross
    validate a random forest on a user-supplied dataset.

    Side effects: reads files under DATA_PATH, prints results to stdout.
    """
    parser = argparse.ArgumentParser(description='Random Forest parser')
    parser.add_argument('--opt', help='test-benchmark or test-dataset.', required=True)
    parser.add_argument('--dataset', help='The dataset filename.', default='', required=False)
    parser.add_argument('--target_attribute', help='Target attribute to be predicted.', default='', required=False)
    parser.add_argument('--n_trees', help='The number of trees. The default is 5.', default=5, type=int, required=False)
    parser.add_argument('--n_attributes', help='The number of attributes. The default is the squared root of total attributes.', default=-1, type=int, required=False)
    parser.add_argument('--k_folds', help='The number of folds for cross validation. The default is 5', default=5, type=int, required=False)
    parser.add_argument('--r', help='The number of repetitions for repeated cross validation. The default is 1', default=1, type=int, required=False)
    args = parser.parse_args()

    if args.opt == 'test-benchmark':
        test_benchmark_categorical()
        test_benchmark_numerical()

    if args.opt == 'test-dataset':
        if args.dataset == '' or not os.path.isfile(DATA_PATH + args.dataset):
            print('Dataset not found.')
            return

        # Optional dtype map stored next to the dataset ('foo.csv' -> 'foo.json').
        # Catch only the expected failures instead of a bare except.
        try:
            with open(DATA_PATH + args.dataset[:-3] + 'json', 'r') as filetypes:
                types = json.load(filetypes)
        except (OSError, json.JSONDecodeError):
            print('Dataset types not found, automatic types will be used.')
            types = {}

        data = pd.read_csv(
            DATA_PATH + args.dataset,
            delimiter='\t' if args.dataset[-3:] == 'tsv' else ',',
            dtype=types
        )

        if args.target_attribute not in data.columns:
            print("Target attribute doesn't exist on dataset.")
            return

        n_trees = args.n_trees
        n_random_attributes = args.n_attributes
        if n_random_attributes == -1:
            # fix: '** 1/2' parses as (x ** 1) / 2, i.e. HALF the attribute
            # count — use ** 0.5 to get the intended square root
            n_random_attributes = int((len(data.columns) - 1) ** 0.5)

        cv = CrossValidator(
            RandomForest(n_trees, args.target_attribute, n_random_attributes)
        )
        cv.cross_validate(data, args.k_folds, args.r)
        print('\nGlobal accuracy: %.3f (%.3f)' % (cv.accuracy, cv.accuracy_std))
Exemplo n.º 5
0
    def __init__(self):
        """Create all algorithm and infrastructure collaborators for an
        experiment run."""
        self.DEBUG = False

        # get instances of all the classes needed to run an experiment
        self.data_api_impl = DataApi('../../data/')
        self.preprocessor_impl = Preprocessor()
        self.cross_validator_impl = CrossValidator()
        self.parameter_tuner_impl = ParameterTuner()

        # algorithm implementations
        self.knn_impl = KNN()
        self.enn_impl = EditedKNN()
        self.cnn_impl = CondensedKNN()
        self.kmeans_knn_impl = KMeansClustering()
        self.k_medoids_clustering_impl = KMedoidsClustering()

        self.results_processor_impl = Results()

        # experiment-type flags — presumably toggled elsewhere before use; verify
        self.CLASSIFICATION = False
        self.REGRESSION = False
Exemplo n.º 6
0
def test():
    """End-to-end smoke test: load the Ford time series, denoise it with a
    wavelet transform, train through cross validation, and report the total
    prediction error converted to dollars."""
    decomposition_level = 1
    wavelet = 'db4'  # alternative: 'Haar'
    ts_file_name = 'ford_ts.csv'
    last_days = 1200
    time_frame = 60
    time_bias = 1

    data_loader = DataLoader(ts_file_name, last_days, debug=True)

    # denoise the raw matrix before building the training dataset
    ts_data = denoise(data_loader.as_matrix(), decomposition_level, wavelet)

    daily_features = np.shape(ts_data)[0]
    dataset = data_loader.prepare_dataset_sae(ts_data, time_frame, time_bias)

    runner = Runner(daily_features,
                    lstm_layers=1,
                    gamma=0.005,
                    delay=4,
                    sae_lr=0.01,
                    beta=0,
                    hidden_nodes_activation_rate=0.9,
                    hidden_layers_sizes=[8],
                    debug=True)

    pred_target = CrossValidator().run_validation(runner,
                                                  dataset,
                                                  sae_epoch=1,
                                                  lstm_epoch=1)

    # convert each (prediction, target) pair to dollars and accumulate the
    # absolute error
    dollars_loss = sum(
        abs(data_loader.to_dolar(pred) - data_loader.to_dolar(target))
        for pred, target in pred_target)
    print("[RUNNER] Dollars lost={}".format(dollars_loss))
Exemplo n.º 7
0
                print(number_of_edits_previous)
            loopcounter += 1
            # NOTE(review): only the label is printed here — the value of
            # loopcounter is probably meant to be part of the output; confirm.
            print("Number of While Loops: ")

        return edited_train_set.reset_index(drop=True)


# EXECUTE SCRIPT

if __name__ == '__main__':

    print('running edited knn...')
    edited_knn = EditedKNN()

    data_api_impl = DataApi('../../data/')
    cross_validator_impl = CrossValidator()
    preprocessor_impl = Preprocessor()

    # NOTE: variable names say "wine" but the segmentation data set is loaded
    wine_data = data_api_impl.get_raw_data_frame('segmentation')
    prep_wine_data = preprocessor_impl.preprocess_raw_data_frame(
        wine_data, 'segmentation')

    # fix: test_set_number, indexes_list and k were previously undefined,
    # so the script crashed with NameError before reaching enn()
    test_set_number = 3
    indexes_list = cross_validator_impl.get_indexes_list(prep_wine_data)
    k = 5  # neighborhood size for ENN

    wine_data_train_set = cross_validator_impl.get_training_set(
        prep_wine_data, test_set_number=test_set_number)
    print('wine_data_train_set.shape: ' + str(wine_data_train_set.shape))

    wine_data_test_set = cross_validator_impl.get_test_set(
        prep_wine_data, test_set_number, indexes_list)

    edited_knn.enn(wine_data_train_set, wine_data_test_set, prep_wine_data, k)
Exemplo n.º 8
0
class CrossValidatorTests(unittest.TestCase):
	"""Unit tests for CrossValidator partitioning behaviour."""

	# SETUP


	@classmethod
	def setUpClass(cls):
		# idiom fix: setUpClass receives the class object, so the first
		# parameter is conventionally named cls, not self
		cls.DEBUG = False
		cls.METRICS = False

		# shared helper instances used by all tests in this class
		cls.data_api_impl = DataApi('../../../data/')
		cls.cross_validator_impl = CrossValidator()
		cls.preprocessor_impl = Preprocessor()


	@classmethod
	def tearDownClass(cls):
		# idiom fix: cls, not self, for a classmethod; nothing to clean up —
		# the fixtures are plain in-memory objects
		pass

	# TESTS

	'''
	# test get indexes list for abalone data
	def test_get_indexes_list_abalone_data(self):
		abalone_data = self.data_api_impl.get_raw_data_frame('abalone')
		self.assertTrue(abalone_data is not None)
		abalone_indexes = self.cross_validator_impl.get_indexes_list(abalone_data)
		self.assertTrue(len(abalone_indexes) == 4177) # 4177 rows in abalone data frame
		for i in range(1, 10):
			self.assertTrue(abalone_indexes.count(i) == 417) # each subset has 417 rows
		self.assertTrue(abalone_indexes.count(10) == 424) # last subset has 417 + remaining...


	# test get indexes list for car data
	def test_get_indexes_list_car_data(self):
		car_data = self.data_api_impl.get_raw_data_frame('car')
		self.assertTrue(car_data is not None)
		car_indexes = self.cross_validator_impl.get_indexes_list(car_data)
		self.assertTrue(len(car_indexes) == 1728) # 1728 rows in car data frame
		for i in range(1, 10):
			self.assertTrue(car_indexes.count(i) == 172) # each subset has 172 rows
		self.assertTrue(car_indexes.count(10) == 180) # last subset has 172 + remaining...


	# test get indexes list for forest fires data
	def test_get_indexes_list_ff_data(self):
		ff_data = self.data_api_impl.get_raw_data_frame('forestfires')
		self.assertTrue(ff_data is not None)
		ff_indexes = self.cross_validator_impl.get_indexes_list(ff_data)
		self.assertTrue(len(ff_indexes) == 518) # 518 rows in forest fires data frame
		for i in range(1, 10):
			self.assertTrue(ff_indexes.count(i) == 51) # each subset has 51 rows
		self.assertTrue(ff_indexes.count(10) == 59) # last subset has 51 + remaining...


	# test get indexes list for machine data
	def test_get_indexes_list_machine_data(self):
		machine_data = self.data_api_impl.get_raw_data_frame('machine')
		self.assertTrue(machine_data is not None)
		machine_indexes = self.cross_validator_impl.get_indexes_list(machine_data)
		self.assertTrue(len(machine_indexes) == 209) # 209 rows in machine data frame
		for i in range(1, 10):
			self.assertTrue(machine_indexes.count(i) == 20) # each subset has 20 rows
		self.assertTrue(machine_indexes.count(10) == 29) # last subset has 20 + remaining...


	# test get indexes list for segmentation data
	def test_get_indexes_list_segmentation_data(self):
		segmentation_data = self.data_api_impl.get_raw_data_frame('segmentation')
		self.assertTrue(segmentation_data is not None)
		segmentation_indexes = self.cross_validator_impl.get_indexes_list(segmentation_data)
		self.assertTrue(len(segmentation_indexes) == 213) # 213 rows in segmentation data frame
		for i in range(1, 10):
			self.assertTrue(segmentation_indexes.count(i) == 21) # each subset has 21 rows
		self.assertTrue(segmentation_indexes.count(10) == 24) # last subset has 21 + remaining...


	# test get indexes list for wine data
	def test_get_indexes_list_wine_data(self):
		wine_data = self.data_api_impl.get_raw_data_frame('wine')
		self.assertTrue(wine_data is not None)
		wine_indexes = self.cross_validator_impl.get_indexes_list(wine_data)
		self.assertTrue(len(wine_indexes) == 6497) # 6497 rows in wine data frame
		for i in range(1, 10):
			self.assertTrue(wine_indexes.count(i) == 649) # each subset has 649 rows
		self.assertTrue(wine_indexes.count(10) == 656) # last subset has 649 + remaining...


	# TRAINING SET


	# test get training set 2 with wine data
	def test_get_training_set(self):
		wine_data = self.data_api_impl.get_raw_data_frame('wine')
		wine_data_training_set = self.cross_validator_impl.get_training_set(wine_data, 2)
		self.assertTrue(wine_data_training_set.shape[0] == 5848) # 6497 - 649 rows in test set 2 means 5484 rows in training set
		self.assertTrue(wine_data_training_set.shape[1] == 12) # number of columns does not change


	# TEST SET


	# test get test set (-2) with wine data
	def test_get_test_set(self):
		wine_data = self.data_api_impl.get_raw_data_frame('wine')
		wine_data_test_set = self.cross_validator_impl.get_test_set(wine_data, 2)
		self.assertTrue(wine_data_test_set.shape[0] == 649) # 649 rows in test set 2
		self.assertTrue(wine_data_test_set.shape[1] == 12) # number of columns does not change
	'''

	def test_cv_partitions(self):
		"""Every test-set row index must be absent from the same partition's
		training set (no train/test leakage)."""
		raw_frame = self.data_api_impl.get_raw_data_frame('abalone')
		prepped_frame = self.preprocessor_impl.preprocess_raw_data_frame(raw_frame, 'abalone')
		partitions = self.cross_validator_impl.get_cv_partitions(prepped_frame)

		self.assertTrue(partitions is not None)

		for key in partitions:
			train_frame, test_frame = partitions[key][0], partitions[key][1]
			train_indexes = list(train_frame.index.values)
			# each partition is (train, test); check disjointness of indexes
			for idx in list(test_frame.index.values):
				self.assertTrue(idx not in train_indexes)
class ExperimentRunner:
    '''
    CONSTRUCTOR
    '''
    def __init__(self):
        # logger instance - VERBOSE level is highest (most verbose) level for logging
        self.logger = Logger('DEMO')  # configure log level here

        # datalayer instance - read csv data files and convert into raw data frames
        self.datalayer = DataApi('../../data/')
        # preprocessor instance - everything for preprocessing data frames
        self.preprocessor = Preprocessor()
        # cross_validator instance - setup cross validation partitions
        self.cross_validator = CrossValidator()
        # utils instance - random things
        self.utils = Utils()

    # get average result given cross validation results dictionary
    def get_avg_result(self, cv_results):
        """Return the mean of the per-partition results in cv_results.

        cv_results maps a test-data key to that fold's numeric result; the
        number of entries always equals the cross validator's fold count.
        """
        fold_results = [cv_results[key] for key in cv_results]
        return sum(fold_results) / len(fold_results)

    '''
    get preprocessed data ready for consumption by experiment running logic

    INPUT:
        - data_set_name: name of data set to fetch data for

    OUTPUT:
        - preprocessed data frame - fully ready for experiment consumption
    '''

    def get_experiment_data(self, data_set_name):
        # fetch the raw frame, log it, preprocess it, log again, return it
        data = self.datalayer.get_raw_data_frame(data_set_name)
        self.logger.log('DEMO', 'data_set_name: \t%s\n' % str(data_set_name))
        self.logger.log(
            'DEMO',
            'raw data: \n\n%s, shape: %s\n' % (str(data), str(data.shape)))
        self.logger.log('DEMO', '----------------------------------------------------' \
                                    + '-----------------------------------------------\n')
        # same name rebound on purpose: 'data' now refers to the preprocessed frame
        data = self.preprocessor.preprocess_raw_data_frame(data, data_set_name)
        self.logger.log(
            'DEMO', 'preprocessed data: \n\n%s, shape: %s\n' %
            (str(data), str(data.shape)))
        self.logger.log('DEMO', '----------------------------------------------------' \
                                    + '-----------------------------------------------\n')
        return data

    '''
    run experiment

    INPUT:
        - data_set_name: name of data set to run experiment on
        - neural_network: instance of neural network to train/test with data
        - hyperparams: hyperparameters and corresponding values to use in experiment

    OUTPUT:
        - <void> - logs all the important stuff at DEMO level
    '''

    def run_experiment(self, data_set_name, neural_network, hyperparams):

        # LAYER ACTIVATION FUNCTION SPECIFICATION

        self.logger.log(
            'DEMO', 'layer_activation_funcs: %s\n' %
            str(hyperparams["layer_activation_funcs"]))

        # DATA RETRIEVAL AND PREPROCESSING

        data = self.get_experiment_data(data_set_name)

        self.logger.log('DEMO', 'data_set_name: %s\n' % str(data_set_name))

        # CROSS VALIDATION PARTITIONING

        # get cross validation partitions for data
        cv_partitions = self.cross_validator.get_cv_partitions(data)

        # dictionary for storing accuracy results
        cv_results = {}
        # list of sizes of test sets used for getting average test set size
        test_data_sizes = []

        # NEURAL NETWORK TRAINING AND TESTING

        # cv_partitions maps a partition id to a (train_data, test_data) pair
        for partition in cv_partitions:
            # initialize key and corresponding nested dictionary in results dictionary
            test_data_key = 'test_data_' + str(partition)
            cv_results[test_data_key] = {}
            # get training set and test set for given cross validation partition
            train_data, test_data = cv_partitions[partition]
            test_data_sizes.append(
                test_data.shape[0]
            )  # add number of rows in test set to test_set_sizes list

            # HANDLE RBF NETWORK P2 RESULTS

            if neural_network.network_name == 'RBF':
                # configure RBF network shape based on training data
                neural_network.configure_rbf_network(train_data, data,
                                                     data_set_name,
                                                     hyperparams["k"])

            # GRADIENT DESCENT

            # run gradient descent for given neural network instance
            test_result_vals = neural_network.train_gradient_descent(
                train_data, hyperparams, partition, test_data)

            # NOTE: str(partition+1) implies partition ids are 0-based ints
            self.logger.log('DEMO', ('accuracy_vals' if neural_network.CLASSIFICATION else 'error_vals') \
                + ' for partition %s: %s\n' % (str(partition+1), str(test_result_vals)), True)

            # append accuracy/error result of final gradient descent iteration to results dictionary
            cv_results[test_data_key] = test_result_vals[-1]

        # FINAL RESULTS (THE MODEL)

        self.logger.log('DEMO', '------------------------------------------------------------' \
                + ' TRAINING DONE ------------------------------------------------------------')

        self.logger.log('DEMO', 'trained network: weights --> \n\n%s, shapes: %s\n' \
            % (str(neural_network.weights), str(self.utils.get_shapes(neural_network.weights))), True)

        self.logger.log('DEMO', 'trained network: biases --> \n\n%s, shapes: %s\n' \
            % (str(neural_network.biases), str(self.utils.get_shapes(neural_network.biases))), True)

        self.logger.log('DEMO', 'data_set_name: %s\n' % str(data_set_name),
                        True)

        self.logger.log('DEMO', 'trained network: AVERAGE ' \
            + ('ACCURACY' if neural_network.CLASSIFICATION else 'ERROR') + ' --> %s\n' \
            % str(self.get_avg_result(cv_results)), True)
        # NOTE(review): the two lines below reference iteration_count and
        # centroids_data, neither of which is defined in this scope — this
        # looks like the tail of a k-means routine pasted here; confirm.
        print('K MEANS CLUSTERING CONVERGED. iterations: ' + str(iteration_count))
        return centroids_data



# EXECUTE SCRIPT


if __name__ == '__main__':

    print('k means clustering...')
    k_means_clustering_impl = KMeansClustering()

    data_api_impl = DataApi('../../data/')
    preprocessor_impl = Preprocessor()
    cross_validator_impl = CrossValidator()

    '''
    wine_data = data_api_impl.get_raw_data_frame('wine')
    prep_wine_data = preprocessor_impl.preprocess_raw_data_frame(wine_data, 'wine')
    '''

    abalone_data = data_api_impl.get_raw_data_frame('abalone')
    prep_abalone_data = preprocessor_impl.preprocess_raw_data_frame(abalone_data, 'abalone')

    print('\npossible classes: ' + str(list(set(abalone_data.loc[:, 'CLASS'].values))) + '\n')

    # first cross validation partition provides one (training, test) split
    training_set, test_set = cross_validator_impl.get_cv_partitions(prep_abalone_data)[0]

    # get training set (full data frame - rows in test_set_index bucket)
    #training_set = cross_validator_impl.get_training_set(prep_abalone_data, test_set_number=3)
    def get_data_set(self):
        """Return the data set (name) previously stored on this instance."""
        return self.data_set

    # set algorithm name for context
    def set_algorithm_name(self, algorithm_name):
        """Remember which algorithm this instance is being run as."""
        self.algorithm_name = algorithm_name


# EXECUTE SCRIPT

if __name__ == '__main__':

    print('\nk nearest neighbor...\n')

    data_api_impl = DataApi('../../data/')
    cross_validator_impl = CrossValidator()
    preprocessor_impl = Preprocessor()

    knn_impl = KNN()

    segmentation_data = data_api_impl.get_raw_data_frame('segmentation')
    segmentation_data_preproc = preprocessor_impl.preprocess_raw_data_frame(
        segmentation_data, "segmentation")

    # pairwise distance matrix reused by the knn call below
    distance_matrix = knn_impl.get_distance_matrix(segmentation_data_preproc)
    print("Segmentation Data Preprocessed: ")
    print(segmentation_data_preproc)
    print(
        "--------------------------------------------------------------------------------------"
    )
    # NOTE(review): the meaning of the positional arguments (10, ..., 5) is
    # not visible from here — verify against the KNN.knn signature
    knn = knn_impl.knn(10, segmentation_data_preproc, distance_matrix, 5)
Exemplo n.º 12
0
class ExperimentRunner():
    """Drives a full experiment: preprocessing, cross validation, algorithm
    dispatch and result aggregation."""

    def __init__(self):
        self.DEBUG = False

        # get instances of all the classes needed to run an experiment
        self.data_api_impl = DataApi('../../data/')
        self.preprocessor_impl = Preprocessor()
        self.cross_validator_impl = CrossValidator()
        self.parameter_tuner_impl = ParameterTuner()

        # algorithm implementations
        self.knn_impl = KNN()
        self.enn_impl = EditedKNN()
        self.cnn_impl = CondensedKNN()
        self.kmeans_knn_impl = KMeansClustering()
        self.k_medoids_clustering_impl = KMedoidsClustering()

        self.results_processor_impl = Results()

        # experiment-type flags, set by set_experiment_type() per data set
        self.CLASSIFICATION = False
        self.REGRESSION = False


    # run algorithm on data set with various parameters
    def run_experiment(self, data_frame_name, algorithm):
        """Cross validate `algorithm` on `data_frame_name` for every parameter
        value supplied by the ParameterTuner.

        Returns a dict mapping str(parameter) to a tuple of
        (avg_accuracy, avg_mean_squared_error) averaged over the folds.
        """

        self.set_experiment_type(data_frame_name)

        # get raw data frame to run experiment against
        raw_data_frame = self.data_api_impl.get_raw_data_frame(data_frame_name)
        print(raw_data_frame)

        # preprocess data
        preprocessed_data_frame = self.preprocessor_impl.preprocess_raw_data_frame(raw_data_frame, data_frame_name)
        print(preprocessed_data_frame)

        # get indexes list for data frame cross validation - a list of row numbers used to partition the data
        data_frame_indexes_list = self.cross_validator_impl.get_indexes_list(preprocessed_data_frame)

        if self.DEBUG:
            print('\ndata_frame_name --> ' + data_frame_name)
            print('\nraw_data_frame:\n')
            print(raw_data_frame)
            print('\npreprocessed_data_frame:\n')
            print(preprocessed_data_frame)
            print('\ndata_frame_indexes_list for cross validation:\n')
            print(data_frame_indexes_list)

        # nested dictionary to hold algorithm performance results for each combination of training/test sets
        # key pattern --> key = test_set_1 , where the number at the end of the key is the test set index
        # each value is another dictionary with keys = { 'zero_one_loss', 'mean_squared_error' }
        # the nested dictionary values are the corresponding loss function metrics for predictions using the test set
        cross_validation_results = {}

        # list of sizes of test sets used for getting average test set size
        # NOTE(review): this list keeps growing across parameter values, so the
        # average test set size printed below spans all parameters — confirm intent
        test_set_sizes = []

        algorithm_parameters = self.parameter_tuner_impl.get_params(data_frame_name, algorithm)
        # dictionary where key is parameter and value is tuple of average loss function results
        results_by_parameter = {}

        # get all cross validation partitions for given data frame
        cv_partitions = self.cross_validator_impl.get_cv_partitions(preprocessed_data_frame)

        # for each parameter value in the list of algorithm parameter values (see ParameterTuner)
        for parameter in algorithm_parameters:

            if self.DEBUG:
                print('\n' + str(self.parameter_tuner_impl.get_parameter_key(algorithm)) + ': ' + str(parameter) + '\n')

            # for each test set used in cross validation (number of folds)
            for partition in cv_partitions:

                # initialize key and corresponding nested dictionary in results dictionary
                test_set_key = 'test_set_' + str(partition)
                cross_validation_results[test_set_key] = {}

                # get training set and test set for given cross validation partition
                training_set, test_set = cv_partitions[partition]

                test_set_sizes.append(test_set.shape[0]) # add number of rows in test set to test_set_sizes list

                if self.DEBUG:
                    print('preprocessed dataframe before running algorithm:')
                    print(preprocessed_data_frame)

                # run algorithms on training set / test set combination
                # returns dictionary where key is the row index (as string) and value is the predicted class for that row
                prediction_results = self.run_algorithm(data_frame_name, algorithm, training_set, test_set, \
                                                            preprocessed_data_frame, parameter)

                # calculate loss function results given prediction results - measure prediction accuracy
                accuracy, mean_squared_error = self.results_processor_impl.loss_function_analysis(test_set, prediction_results)

                cross_validation_results[test_set_key]['accuracy'] = accuracy
                cross_validation_results[test_set_key]['mean_squared_error'] = mean_squared_error

            # calculate average loss function results over all cross validation folds
            avg_accuracy, avg_mean_squared_error = self.results_processor_impl.get_avg_loss_vals(cross_validation_results)
            avg_test_set_size = sum(test_set_sizes) / len(test_set_sizes) # get average test set size for reference

            results_by_parameter[str(parameter)] = (avg_accuracy, avg_mean_squared_error)

            print('\n\nRESULTS: average test set size: ' + str(avg_test_set_size) + \
                ((' --> accuracy: ' + str(avg_accuracy)) if self.CLASSIFICATION \
                else (' --> mean_squared_error: ' + str(avg_mean_squared_error))))

            print('\n---------------------------------------------------------------------------------------------------------------------')

        # return dictionary of results by parameter
        return results_by_parameter


    def set_experiment_type(self, data_frame_name):
        if data_frame_name in ['abalone', 'car', 'segmentation']:
            self.CLASSIFICATION = True
            self.REGRESSION = False
        elif data_frame_name in ['machine', 'forestfires', 'wine']:
            self.REGRESSION = True
            self.CLASSIFICATION = False
        else:
            raise Exception('ERROR: unknown data_set_name --> ' + str(data_frame_name))


    '''
    run algorithm execution handler given algorithm name

    INPUT:
        - algorithm_name: name of algorithm to run handler for

    OUTPUT:
        - prediction results dictionary, maps instance index to tuple: (prediction, actual)
    '''
    def run_algorithm(self, data_set_name, algorithm_name, training_set, \
                        test_set, preprocessed_data_frame, parameter):
        # fix: the dispatch below previously compared an undefined name
        # ('algorithm'), which raised NameError at runtime — it must use
        # the algorithm_name parameter
        if algorithm_name == 'knn':
            self.knn_impl.set_data_set(data_set_name)
            self.knn_impl.set_algorithm_name(algorithm_name)
            return self.knn_impl.do_knn(training_set, test_set, preprocessed_data_frame, parameter)
        elif algorithm_name == 'enn':
            self.enn_impl.set_data_set(data_set_name)
            self.enn_impl.set_algorithm_name(algorithm_name)
            return self.enn_impl.do_enn(training_set, test_set, preprocessed_data_frame, parameter)
        elif algorithm_name == 'cnn':
            self.cnn_impl.set_data_set(data_set_name)
            self.cnn_impl.set_algorithm_name(algorithm_name)
            return self.cnn_impl.do_cnn(training_set, test_set, preprocessed_data_frame, parameter)
        elif algorithm_name == 'kmeans_knn':
            self.kmeans_knn_impl.set_data_set(data_set_name)
            self.kmeans_knn_impl.set_algorithm_name(algorithm_name)
            return self.kmeans_knn_impl.cluster_do_knn(training_set, test_set, preprocessed_data_frame, data_set_name, parameter)
        elif algorithm_name == 'kmedoids_knn':
            self.k_medoids_clustering_impl.set_data_set(data_set_name)
            self.k_medoids_clustering_impl.set_algorithm_name(algorithm_name)
            return self.k_medoids_clustering_impl.cluster(training_set, test_set, preprocessed_data_frame, data_set_name, parameter)
Exemplo n.º 13
0
# create_plot(tree)

# Prune the training set.
pruned_tree = TreePruner(tree).prune()
create_plot(pruned_tree)
# NOTE(review): depth is reported for the UNpruned tree — if the pruned
# depth was intended, this should be get_tree_depth(pruned_tree); confirm.
print('Tree depth: ', get_tree_depth(tree))

# Classify other results
c = Classifier(pruned_tree, short_labels)

print('\nClassify the training set: ')
dsc = DataSetClassifier(c, enricher)
results = dsc.classify_data_set(original_data_set)

print('Invalid classified entries:', dsc.invalid_entries, '\nTotal entries:',
      len(results), '\nError:',
      str(round(dsc.error_rate, 2)) + '%')

print('\nClassify the test set: ')
testing_data_set = DataSetLoader('dataset/test.data').load()
results = dsc.classify_data_set(testing_data_set)
print('Invalid classified entries:', dsc.invalid_entries, '\nTotal entries:',
      len(results), '\nError:',
      str(round(dsc.error_rate, 2)) + '%\n')

print('Limiting depth:')
# run cross validation over the pre-split training files
CrossValidator([
    'dataset/cvs_splits/training00.data', 'dataset/cvs_splits/training01.data',
    'dataset/cvs_splits/training02.data', 'dataset/cvs_splits/training03.data'
]).run()
Exemplo n.º 14
0
    train_data_player = DataFramePlayer.load_csv(train_data_path)
    label_data_player = DataFramePlayer.load_csv(label_data_path)

    # Transformation via the player: load a cassette into the player and call
    # play() to run the transformation. The result is kept in the player's
    # internal data frame.
    label_data_player.add(CleanLabelCassette).play()

    # A cassette can also be used on its own.
    train_data_mean = MeanCassette.extract(train_data_player.df)

    # NOTE(review): 'spilt' looks like a typo for 'split', but it matches the
    # CrossValidator keyword argument below, so it is kept as-is.
    spilt = 5

    # cross validation setup
    validator = CrossValidator(objective=__objective,
                               spilt=spilt,
                               train_data=train_data_player.df,
                               label_data=label_data_player.df)

    feature_columns = train_data_player.df.columns

    sub_predicts = pd.DataFrame()
    # Iterating the cross validator yields, per fold, the result of the
    # objective computation for that iteration.
    for folds, clf in validator:
        predicts = clf.predict_proba(
            train_data_player.df, num_iteration=clf.best_iteration_)[:,
                                                                     1] / spilt
        fold_importance_df = lgbexe.analyze_lightgbm(clf, feature_columns)

    # The internal data frame can be saved in csv form through the player.
    # NOTE(review): sub_predicts is still empty here — 'predicts' and
    # 'fold_importance_df' are computed but never accumulated into it; confirm.
    DataFramePlayer(sub_predicts).save_csv('result', '.', is_attend_date=True)