@classmethod
def setUpClass(cls):
    cls.DEBUG = False
    cls.METRICS = False
    # shared instances used across the test cases
    cls.data_api_impl = DataApi('../../../data/')
    cls.cross_validator_impl = CrossValidator()
    cls.preprocessor_impl = Preprocessor()
def main():
    parser = argparse.ArgumentParser(description='Multilayer neural network parser')
    parser.add_argument('-d', '--dataset',
                        help='The name (without extension) of the dataset',
                        required=True)
    parser.add_argument('-n', '--network',
                        help='The filename of the network configuration',
                        required=False)
    args = parser.parse_args()

    # load the column types for the dataset, falling back to automatic type inference
    try:
        with open(DATA_PATH + args.dataset + '.json', 'r') as filetypes:
            types = json.load(filetypes)
    except (FileNotFoundError, json.JSONDecodeError):
        print('Dataset types not found, automatic types will be used.')
        types = {}

    df = preprocess(
        pd.read_csv(DATA_PATH + args.dataset + '.tsv', sep='\t', dtype=types),
        types)

    # build the network from its configuration file and run 10-fold cross validation
    with open(args.network, 'r') as network_file:
        cv = CrossValidator(
            NeuralNetwork(network_file=network_file,
                          target_attribute='target',
                          data_instance=df.iloc[0]))

    cv.cross_validate(df, 10, 1)
def __init__(self):
    # logger instance - VERBOSE is the highest (most verbose) logging level
    self.logger = Logger('DEMO')  # configure log level here
    # data layer instance - reads csv data files and converts them into raw data frames
    self.datalayer = DataApi('../../data/')
    # preprocessor instance - everything for preprocessing data frames
    self.preprocessor = Preprocessor()
    # cross_validator instance - sets up cross validation partitions
    self.cross_validator = CrossValidator()
    # utils instance - miscellaneous helpers
    self.utils = Utils()
def main():
    parser = argparse.ArgumentParser(description='Random Forest parser')
    parser.add_argument('--opt', help='test-benchmark or test-dataset.', required=True)
    parser.add_argument('--dataset', help='The dataset filename.',
                        default='', required=False)
    parser.add_argument('--target_attribute', help='Target attribute to be predicted.',
                        default='', required=False)
    parser.add_argument('--n_trees', help='The number of trees. The default is 5.',
                        default=5, type=int, required=False)
    parser.add_argument('--n_attributes',
                        help='The number of attributes. The default is the square root of the total number of attributes.',
                        default=-1, type=int, required=False)
    parser.add_argument('--k_folds',
                        help='The number of folds for cross validation. The default is 5.',
                        default=5, type=int, required=False)
    parser.add_argument('--r',
                        help='The number of repetitions for repeated cross validation. The default is 1.',
                        default=1, type=int, required=False)
    args = parser.parse_args()

    if args.opt == 'test-benchmark':
        test_benchmark_categorical()
        test_benchmark_numerical()

    if args.opt == 'test-dataset':
        if args.dataset == '' or not os.path.isfile(DATA_PATH + args.dataset):
            print('Dataset not found.')
            return

        # load the column types for the dataset, falling back to automatic type inference
        try:
            with open(DATA_PATH + args.dataset[:-3] + 'json', 'r') as filetypes:
                types = json.load(filetypes)
        except (FileNotFoundError, json.JSONDecodeError):
            print('Dataset types not found, automatic types will be used.')
            types = {}

        data = pd.read_csv(
            DATA_PATH + args.dataset,
            delimiter='\t' if args.dataset[-3:] == 'tsv' else ',',
            dtype=types
        )

        if args.target_attribute not in data.columns:
            print("Target attribute doesn't exist on dataset.")
            return

        n_trees = args.n_trees
        n_random_attributes = args.n_attributes
        if n_random_attributes == -1:
            # default: square root of the number of non-target attributes
            n_random_attributes = int((len(data.columns) - 1) ** 0.5)

        cv = CrossValidator(
            RandomForest(n_trees, args.target_attribute, n_random_attributes)
        )
        cv.cross_validate(data, args.k_folds, args.r)
        print('\nGlobal accuracy: %.3f (%.3f)' % (cv.accuracy, cv.accuracy_std))
def __init__(self):
    self.DEBUG = False
    # get instances of all the classes needed to run an experiment
    self.data_api_impl = DataApi('../../data/')
    self.preprocessor_impl = Preprocessor()
    self.cross_validator_impl = CrossValidator()
    self.parameter_tuner_impl = ParameterTuner()
    # algorithm implementations
    self.knn_impl = KNN()
    self.enn_impl = EditedKNN()
    self.cnn_impl = CondensedKNN()
    self.kmeans_knn_impl = KMeansClustering()
    self.k_medoids_clustering_impl = KMedoidsClustering()
    self.results_processor_impl = Results()
    self.CLASSIFICATION = False
    self.REGRESSION = False
def test():
    lvl = 1
    wavelet = 'db4'  # alternative: 'haar'
    ts_file_name = 'ford_ts.csv'
    last_days = 1200
    time_frame = 60
    time_bias = 1

    # load the raw time series and denoise it with a wavelet transform
    data_loader = DataLoader(ts_file_name, last_days, debug=True)
    raw_data = data_loader.as_matrix()
    ts_data = denoise(raw_data, lvl, wavelet)
    # plt.plot(raw_data[3])
    # plt.show()
    # plt.plot(ts_data[3])
    # plt.show()

    daily_features, _ = np.shape(ts_data)
    dataset = data_loader.prepare_dataset_sae(ts_data, time_frame, time_bias)
    runner = Runner(daily_features, lstm_layers=1, gamma=0.005, delay=4,
                    sae_lr=0.01, beta=0, hidden_nodes_activation_rate=0.9,
                    hidden_layers_sizes=[8], debug=True)

    # cross validate and report the total absolute prediction error in dollars
    cross_validator = CrossValidator()
    pred_target = cross_validator.run_validation(runner, dataset, sae_epoch=1, lstm_epoch=1)
    pred_target_dollars = [(data_loader.to_dolar(x), data_loader.to_dolar(y))
                           for x, y in pred_target]
    dollars_loss = sum(abs(x - y) for x, y in pred_target_dollars)
    print("[RUNNER] Dollars lost={}".format(dollars_loss))
    print(number_of_edits_previous)
    loopcounter += 1
    print("Number of While Loops: " + str(loopcounter))
    return edited_train_set.reset_index(drop=True)


# EXECUTE SCRIPT
if __name__ == '__main__':
    print('running edited knn...')

    edited_knn = EditedKNN()
    data_api_impl = DataApi('../../data/')
    cross_validator_impl = CrossValidator()
    preprocessor_impl = Preprocessor()

    wine_data = data_api_impl.get_raw_data_frame('segmentation')
    prep_wine_data = preprocessor_impl.preprocess_raw_data_frame(wine_data, 'segmentation')

    wine_data_train_set = cross_validator_impl.get_training_set(prep_wine_data, test_set_number=3)
    print('wine_data_train_set.shape: ' + str(wine_data_train_set.shape))

    # test_set_number, indexes_list, and k were undefined in the original script;
    # the values below are placeholders so the demo runs end to end
    test_set_number = 3
    indexes_list = []
    k = 5
    wine_data_test_set = cross_validator_impl.get_test_set(prep_wine_data, test_set_number, indexes_list)

    edited_knn.enn(wine_data_train_set, wine_data_test_set, prep_wine_data, k)
# create_plot(tree)

# Prune the decision tree.
pruned_tree = TreePruner(tree).prune()
create_plot(pruned_tree)
print('Tree depth: ', get_tree_depth(tree))

# Classify other results
c = Classifier(pruned_tree, short_labels)

print('\nClassify the training set: ')
dsc = DataSetClassifier(c, enricher)
results = dsc.classify_data_set(original_data_set)
print('Invalid classified entries:', dsc.invalid_entries,
      '\nTotal entries:', len(results),
      '\nError:', str(round(dsc.error_rate, 2)) + '%')

print('\nClassify the test set: ')
testing_data_set = DataSetLoader('dataset/test.data').load()
results = dsc.classify_data_set(testing_data_set)
print('Invalid classified entries:', dsc.invalid_entries,
      '\nTotal entries:', len(results),
      '\nError:', str(round(dsc.error_rate, 2)) + '%\n')

print('Limiting depth:')
CrossValidator([
    'dataset/cvs_splits/training00.data',
    'dataset/cvs_splits/training01.data',
    'dataset/cvs_splits/training02.data',
    'dataset/cvs_splits/training03.data'
]).run()
train_data_player = DataFramePlayer.load_csv(train_data_path)
label_data_player = DataFramePlayer.load_csv(label_data_path)

# Processing with a player:
# set a cassette on the player and call play() to apply the transformation.
# The result is kept in the player's internal data frame.
label_data_player.add(CleanLabelCassette).play()

# A cassette can also be used on its own.
train_data_mean = MeanCassette.extract(train_data_player.df)

spilt = 5  # number of cross validation folds

# Configure cross validation.
validator = CrossValidator(objective=__objective, spilt=spilt,
                           train_data=train_data_player.df,
                           label_data=label_data_player.df)

feature_columns = train_data_player.df.columns
sub_predicts = pd.DataFrame()

# Iterating over the cross validator yields the result of the objective for each fold.
for folds, clf in validator:
    predicts = clf.predict_proba(
        train_data_player.df, num_iteration=clf.best_iteration_)[:, 1] / spilt
    fold_importance_df = lgbexe.analyze_lightgbm(clf, feature_columns)

# The player can also save its internal data frame as a csv file.
DataFramePlayer(sub_predicts).save_csv('result', '.', is_attend_date=True)