Пример #1
0
    criterion = args.criterion
    max_depth = args.max_depth
    min_samples_split = args.min_samples_split
    result_path = args.result_path
    feature_filter = args.feature_filter

    data = pd.read_csv(training_set_path, index_col=0)

    paths = [
        test_path + catalog + '_sampled_' + str(i) + '.csv'
        for i in xrange(100)
    ]

    # Necesito asegurarme de que las curvas sean las mismas en train y test
    test_data = pd.read_csv(paths[0], index_col=0)
    data, test_data = utils.equalize_indexes(data, test_data)

    data, y = utils.filter_data(data, feature_filter=feature_filter)

    skf = cross_validation.StratifiedKFold(y, n_folds=folds)
    results = []
    ids = []

    for train_index, test_index in skf:

        train_X, train_y = data.iloc[train_index], y.iloc[train_index]

        clf = None
        clf = RandomForestClassifier(n_estimators=n_estimators,
                                     criterion=criterion,
                                     max_depth=max_depth,
Пример #2
0
# coding=utf-8

# Script kept only as a record of how the testing sets for EROS and MACHO
# were split. It is not meant to be run routinely.

# -----------------------------------------------------------------------------

from sklearn import cross_validation
import pandas as pd

import utils

# Catalog whose sets are split; test_size is handed to the splitter below.
catalog = 'EROS'
test_size = 5000

# Input files: the regular feature set and the first sampled set stored
# under the catalog's "_Sampled/uniform/5%/" directory.
features_root = '/n/home09/ncastro/workspace/Features/sets/'
sampled_root = '/n/seasfs03/IACS/TSC/ncastro/sets/'
regular_set_path = features_root + catalog + '/' + catalog + '_regular_set_5.csv'
sampled_set_path = (sampled_root + catalog + '_Sampled/uniform/5%/'
                    + catalog + '_sampled_0.csv')

normal_df = pd.read_csv(regular_set_path, index_col=0)
sample_df = pd.read_csv(sampled_set_path, index_col=0)

# utils.equalize_indexes is defined outside this file; presumably it keeps
# only the rows whose index appears in both frames -- TODO confirm.
a, b = utils.equalize_indexes(normal_df, sample_df)

# Stratify on the 'class' column with a fixed seed so the split is repeatable.
sss = cross_validation.StratifiedShuffleSplit(
    a['class'], n_iter=1, test_size=test_size, random_state=1)

# Only one split is requested (n_iter=1), so take the first pair the
# splitter yields.
train_index, test_index = next(iter(sss))
train_df = a.iloc[train_index]
test_df = a.iloc[test_index]

# NOTE(review): the whole equalized frame `a` is written here, not train_df
# (which is never used) -- confirm this is intended.
common_dir = features_root + 'Common/'
a.to_csv(common_dir + catalog + '.csv')
test_df.to_csv(common_dir + catalog + '_test.csv')
Пример #3
0
    n_estimators = args.n_estimators
    criterion = args.criterion
    max_depth = args.max_depth
    min_samples_split = args.min_samples_split

    feature_filter = args.feature_filter
    index_filter = args.index_filter

    if index_filter is not None:
        index_filter = pd.read_csv(index_filter, index_col=0).index

    train_data = pd.read_csv(train_path, index_col=0)
    test_data = pd.read_csv(test_path, index_col=0)

    train_data, test_data = utils.equalize_indexes(train_data, test_data)

    train_X, train_y = utils.filter_data(train_data, index_filter=index_filter, feature_filter=feature_filter)
    test_X, test_y = utils.filter_data(test_data, index_filter=index_filter, feature_filter=feature_filter)

    # Ocupo solo los datos de test para hacer el k-fold, por que estos no estan repetidos
    # Y es valido ocuparlos solo por posicion
    skf = cross_validation.StratifiedKFold(test_y, n_folds=folds)
    results = []
    ids = []

    for train_index, test_index in skf:
        if inverse:
            aux = train_index
            train_index = test_index
            test_index = aux
Пример #4
0
    n_estimators = args.n_estimators
    criterion = args.criterion
    max_depth = args.max_depth
    min_samples_split = args.min_samples_split

    feature_filter = args.feature_filter
    index_filter = args.index_filter

    if index_filter is not None:
        index_filter = pd.read_csv(index_filter, index_col=0).index

    train_data = pd.read_csv(train_path, index_col=0)
    test_data = pd.read_csv(test_path, index_col=0)

    train_data, test_data = utils.equalize_indexes(train_data, test_data)

    train_X, train_y = utils.filter_data(train_data,
                                         index_filter=index_filter,
                                         feature_filter=feature_filter)
    test_X, test_y = utils.filter_data(test_data,
                                       index_filter=index_filter,
                                       feature_filter=feature_filter)

    # Ocupo solo los datos de test para hacer el k-fold, por que estos no estan repetidos
    # Y es valido ocuparlos solo por posicion
    skf = cross_validation.StratifiedKFold(test_y, n_folds=folds)
    results = []
    ids = []

    for train_index, test_index in skf:
Пример #5
0
    n_estimators = args.n_estimators
    criterion = args.criterion
    max_depth = args.max_depth
    min_samples_split = args.min_samples_split
    result_path = args.result_path
    feature_filter = args.feature_filter



    data = pd.read_csv(training_set_path, index_col=0)
    
    paths = [test_path + catalog + '_sampled_' + str(i) + '.csv' for i in xrange(100)]

    # Necesito asegurarme de que las curvas sean las mismas en train y test
    test_data = pd.read_csv(paths[0], index_col=0)
    data, test_data = utils.equalize_indexes(data, test_data)

    data, y = utils.filter_data(data, feature_filter=feature_filter)

    skf = cross_validation.StratifiedKFold(y, n_folds=folds)
    results = []
    ids = []

    for train_index, test_index in skf:

        train_X, train_y  = data.iloc[train_index], y.iloc[train_index]

        clf = None
        clf = RandomForestClassifier(n_estimators=n_estimators, criterion=criterion,
                                     max_depth=max_depth, min_samples_split=min_samples_split,
                                     n_jobs=n_processes)