Example #1
	@classmethod
	def setUpClass(cls):
		cls.DEBUG = False
		cls.METRICS = False

		cls.data_api_impl = DataApi('../../../data/')
		cls.cross_validator_impl = CrossValidator()
		cls.preprocessor_impl = Preprocessor()
Example #2
def main():
    parser = argparse.ArgumentParser(
        description='Multilayer neural network parser')
    parser.add_argument('-d',
                        '--dataset',
                        help='The name (without extension) of the dataset',
                        required=True)
    parser.add_argument('-n',
                        '--network',
                        help='The filename of the network configuration',
                        required=False)
    args = parser.parse_args()

    try:
        with open(DATA_PATH + args.dataset + '.json', 'r') as filetypes:
            types = json.load(filetypes)
    except (FileNotFoundError, json.JSONDecodeError):
        print('Dataset types not found, automatic types will be used.')
        types = {}

    df = preprocess(
        pd.read_csv(DATA_PATH + args.dataset + '.tsv', sep='\t', dtype=types),
        types)
    network_file = open(args.network, 'r')
    cv = CrossValidator(
        NeuralNetwork(network_file=network_file,
                      target_attribute='target',
                      data_instance=df.iloc[0]))
    cv.cross_validate(df, 10, 1)
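Both this example and Example #4 below wrap a model object in CrossValidator and then call cross_validate(data, k_folds, repetitions). A minimal sketch of what such a method might do is shown here; the fit/evaluate interface of the wrapped model and the accuracy bookkeeping are illustrative assumptions, not the actual API of either repository.

import numpy as np


class SimpleCrossValidator:
    """Illustrative repeated k-fold cross-validation over a pandas DataFrame."""

    def __init__(self, model):
        self.model = model  # wrapped learner (assumed fit/evaluate interface)
        self.accuracy = 0.0
        self.accuracy_std = 0.0

    def cross_validate(self, data, k_folds, repetitions):
        scores = []
        for _ in range(repetitions):
            # shuffle once per repetition, then slice the row indices into k folds
            shuffled = data.sample(frac=1).reset_index(drop=True)
            fold_indices = np.array_split(np.arange(len(shuffled)), k_folds)
            for i, test_idx in enumerate(fold_indices):
                train_idx = np.concatenate(fold_indices[:i] + fold_indices[i + 1:])
                self.model.fit(shuffled.iloc[train_idx])                      # hypothetical call
                scores.append(self.model.evaluate(shuffled.iloc[test_idx]))   # hypothetical call
        self.accuracy = float(np.mean(scores))
        self.accuracy_std = float(np.std(scores))
        return self.accuracy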
Example #3
    def __init__(self):
        # logger instance - VERBOSE level is highest (most verbose) level for logging
        self.logger = Logger('DEMO')  # configure log level here

        # datalayer instance - read csv data files and convert into raw data frames
        self.datalayer = DataApi('../../data/')
        # preprocessor instance - everything for preprocessing data frames
        self.preprocessor = Preprocessor()
        # cross_validator instance - setup cross validation partitions
        self.cross_validator = CrossValidator()
        # utils instance - random things
        self.utils = Utils()
Example #4
def main():
    parser = argparse.ArgumentParser(description='Random Forest parser')
    parser.add_argument('--opt', help='test-benchmark or test-dataset.', required=True)
    parser.add_argument('--dataset', help='The dataset filename.', default='', required=False)
    parser.add_argument('--target_attribute', help='Target attribute to be predicted.', default='', required=False)
    parser.add_argument('--n_trees', help='The number of trees. The default is 5.', default=5, type=int, required=False)
    parser.add_argument('--n_attributes', help='The number of attributes. The default is the square root of the total attributes.', default=-1, type=int, required=False)
    parser.add_argument('--k_folds', help='The number of folds for cross validation. The default is 5', default=5, type=int, required=False)
    parser.add_argument('--r', help='The number of repetitions for repeated cross validation. The default is 1', default=1, type=int, required=False)
    args = parser.parse_args()

    if args.opt == 'test-benchmark':
        test_benchmark_categorical()
        test_benchmark_numerical()

    if args.opt == 'test-dataset':
        if args.dataset == '' or not os.path.isfile(DATA_PATH + args.dataset):
            print('Dataset not found.')
            return

        try:
            with open(DATA_PATH + args.dataset[:-3] + 'json', 'r') as filetypes:
                types = json.load(filetypes)
        except (FileNotFoundError, json.JSONDecodeError):
            print('Dataset types not found, automatic types will be used.')
            types = {}

        data = pd.read_csv(
            DATA_PATH + args.dataset,
            delimiter='\t' if args.dataset[-3:] == 'tsv' else ',',
            dtype=types
        )

        if args.target_attribute not in data.columns:
            print("Target attribute doesn't exist on dataset.")
            return

        n_trees = args.n_trees
        n_random_attributes = args.n_attributes
        if n_random_attributes == -1:
            n_random_attributes = int((len(data.columns) - 1) ** 0.5)

        cv = CrossValidator(
            RandomForest(n_trees, args.target_attribute, n_random_attributes)
        )
        cv.cross_validate(data, args.k_folds, args.r)
        print('\nGlobal accuracy: %.3f (%.3f)' % (cv.accuracy, cv.accuracy_std))
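A hypothetical invocation of the script above (the entry-point filename and dataset name are placeholders, not taken from the repository):

    python main.py --opt test-dataset --dataset wine.tsv --target_attribute target --n_trees 10 --k_folds 5 --r 1

With --n_attributes left at its default, the forest samples int(sqrt(m)) of the m non-target attributes at each split, the usual random-forest heuristic (for example, 16 candidate attributes give 4 per split).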
Example #5
    def __init__(self):
        self.DEBUG = False

        # get instances of all the classes needed to run an experiment
        self.data_api_impl = DataApi('../../data/')
        self.preprocessor_impl = Preprocessor()
        self.cross_validator_impl = CrossValidator()
        self.parameter_tuner_impl = ParameterTuner()

        # algorithm implementations
        self.knn_impl = KNN()
        self.enn_impl = EditedKNN()
        self.cnn_impl = CondensedKNN()
        self.kmeans_knn_impl = KMeansClustering()
        self.k_medoids_clustering_impl = KMedoidsClustering()

        self.results_processor_impl = Results()

        self.CLASSIFICATION = False
        self.REGRESSION = False
Example #6
def test():
    lvl = 1
    wavelet = 'db4'  # or 'haar'
    ts_file_name = 'ford_ts.csv'
    last_days = 1200
    time_frame = 60
    time_bias = 1

    data_loader = DataLoader(ts_file_name, last_days, debug=True)

    raw_data = data_loader.as_matrix()
    ts_data = denoise(raw_data, lvl, wavelet)

    # plt.plot(raw_data[3])
    # plt.show()
    # plt.plot(ts_data[3])
    # plt.show()

    daily_features, _ = np.shape(ts_data)
    dataset = data_loader.prepare_dataset_sae(ts_data, time_frame, time_bias)

    runner = Runner(daily_features,
                    lstm_layers=1,
                    gamma=0.005,
                    delay=4,
                    sae_lr=0.01,
                    beta=0,
                    hidden_nodes_activation_rate=0.9,
                    hidden_layers_sizes=[8],
                    debug=True)

    cross_validator = CrossValidator()
    pred_target = cross_validator.run_validation(runner,
                                                 dataset,
                                                 sae_epoch=1,
                                                 lstm_epoch=1)
    pred_target_dollars = [(data_loader.to_dolar(x), data_loader.to_dolar(y))
                           for x, y in pred_target]
    dollars_loss = sum([abs(x - y) for x, y in pred_target_dollars])
    print("[RUNNER] Dollars lost={}".format(dollars_loss))
Example #7
                print(number_of_edits_previous)
            loopcounter += 1
            print("Number of While Loops: ")

        return edited_train_set.reset_index(drop=True)


# EXECUTE SCRIPT

if __name__ == '__main__':

    print('running edited knn...')
    edited_knn = EditedKNN()

    data_api_impl = DataApi('../../data/')
    cross_validator_impl = CrossValidator()
    preprocessor_impl = Preprocessor()

    wine_data = data_api_impl.get_raw_data_frame('segmentation')
    prep_wine_data = preprocessor_impl.preprocess_raw_data_frame(
        wine_data, 'segmentation')

    wine_data_train_set = cross_validator_impl.get_training_set(
        prep_wine_data, test_set_number=3)
    print('wine_data_train_set.shape: ' + str(wine_data_train_set.shape))

    wine_data_test_set = cross_validator_impl.get_test_set(
        prep_wine_data, test_set_number, indexes_list)

    edited_knn.enn(wine_data_train_set, wine_data_test_set, prep_wine_data, k)
Example #8
# create_plot(tree)

# Prune the tree.
pruned_tree = TreePruner(tree).prune()
create_plot(pruned_tree)
print('Tree depth: ', get_tree_depth(tree))

# Classify other results
c = Classifier(pruned_tree, short_labels)

print('\nClassify the training set: ')
dsc = DataSetClassifier(c, enricher)
results = dsc.classify_data_set(original_data_set)

print('Invalid classified entries:', dsc.invalid_entries, '\nTotal entries:',
      len(results), '\nError:',
      str(round(dsc.error_rate, 2)) + '%')

print('\nClassify the test set: ')
testing_data_set = DataSetLoader('dataset/test.data').load()
results = dsc.classify_data_set(testing_data_set)
print('Invalid classified entries:', dsc.invalid_entries, '\nTotal entries:',
      len(results), '\nError:',
      str(round(dsc.error_rate, 2)) + '%\n')

print('Limiting depth:')
CrossValidator([
    'dataset/cvs_splits/training00.data', 'dataset/cvs_splits/training01.data',
    'dataset/cvs_splits/training02.data', 'dataset/cvs_splits/training03.data'
]).run()
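CrossValidator in this example only receives the four pre-made split files and does everything else internally. A rough sketch of what its run() loop could look like, holding out one split per round; build_tree() is a hypothetical stand-in for however this repository grows the decision tree, and the list-like return value of DataSetLoader.load() is also an assumption.

class FileSplitCrossValidator:
    def __init__(self, split_paths):
        self.split_paths = split_paths

    def run(self):
        error_rates = []
        for i, held_out in enumerate(self.split_paths):
            # train on every split except the held-out one
            training_set = []
            for path in self.split_paths[:i] + self.split_paths[i + 1:]:
                training_set.extend(DataSetLoader(path).load())
            tree = build_tree(training_set)  # hypothetical tree constructor
            classifier = Classifier(TreePruner(tree).prune(), short_labels)
            dsc = DataSetClassifier(classifier, enricher)
            dsc.classify_data_set(DataSetLoader(held_out).load())
            error_rates.append(dsc.error_rate)
        print('Mean CV error: %.2f%%' % (sum(error_rates) / len(error_rates)))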
Example #9
    train_data_player = DataFramePlayer.load_csv(train_data_path)
    label_data_player = DataFramePlayer.load_csv(label_data_path)

    # Transformations applied through the player
    # Set a cassette on the player and call play() to run the transformation.
    # The transformed result is kept in the player's internal data frame.
    label_data_player.add(CleanLabelCassette).play()

    # A cassette can also be used on its own
    train_data_mean = MeanCassette.extract(train_data_player.df)

    spilt = 5

    # Cross-validation setup
    validator = CrossValidator(objective=__objective,
                               spilt=spilt,
                               train_data=train_data_player.df,
                               label_data=label_data_player.df)

    feature_columns = train_data_player.df.columns

    sub_predicts = pd.DataFrame()
    # Iterating over the cross-validator with a for loop yields the result of the objective computation at each iteration.
    for folds, clf in validator:
        predicts = clf.predict_proba(
            train_data_player.df, num_iteration=clf.best_iteration_)[:,
                                                                     1] / spilt
        fold_importance_df = lgbexe.analyze_lightgbm(clf, feature_columns)

    # The internal data frame can be saved as a CSV file through the player
    DataFramePlayer(sub_predicts).save_csv('result', '.', is_attend_date=True)
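The for loop above expects CrossValidator to be iterable and to yield a fitted classifier per fold. A minimal sketch of that protocol, assuming scikit-learn's KFold and LightGBM's LGBMClassifier; the constructor arguments mirror the call above (including the 'spilt' spelling), but the internals are illustrative only.

import lightgbm as lgb
from sklearn.model_selection import KFold


class IterableCrossValidator:
    def __init__(self, objective, spilt, train_data, label_data):
        self.objective = objective  # user-supplied objective callback (unused in this sketch)
        self.n_splits = spilt
        self.train_data = train_data
        self.label_data = label_data

    def __iter__(self):
        kf = KFold(n_splits=self.n_splits, shuffle=True, random_state=0)
        for fold, (train_idx, valid_idx) in enumerate(kf.split(self.train_data)):
            clf = lgb.LGBMClassifier(objective='binary')
            clf.fit(self.train_data.iloc[train_idx],
                    self.label_data.iloc[train_idx].values.ravel(),
                    eval_set=[(self.train_data.iloc[valid_idx],
                               self.label_data.iloc[valid_idx].values.ravel())])
            yield fold, clf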