def recursive_tree_exploration(feature_set, past_accuracy, past_model):
    """Depth-first search for a more accurate extension of ``feature_set``.

    Every feature index greater than the last element of ``feature_set`` and
    below the module-level ``feature_amount`` is appended in turn, and the
    resulting candidate set is scored with ``manual_cross_validation``.
    Whenever a candidate strictly beats the best accuracy seen so far, the
    search recurses from that candidate to try to extend it further.

    Args:
        feature_set: Non-empty list of feature indices selected so far. Only
            indices larger than its last element are tried. The list is not
            modified.
        past_accuracy: Score obtained with ``feature_set``; a candidate must
            strictly exceed it to be kept.
        past_model: Value to return as the best model when no candidate
            improves on ``past_accuracy``.

    Returns:
        A tuple ``(max_accuracy, best_set, best_model)``. ``best_set`` is
        always a list that is independent of ``feature_set`` and of any list
        used internally, so callers may keep or mutate it freely.

    Raises:
        IndexError: If ``feature_set`` is empty.
    """
    last_added = feature_set[-1]

    max_accuracy = past_accuracy
    # Copy so the result never aliases the caller's list, which the caller
    # may go on mutating after this call returns.
    best_set = list(feature_set)
    best_model = past_model

    for feature in xrange(last_added + 1, feature_amount):
        # Build a fresh list for every candidate. Reusing a single list and
        # swapping its last element would silently rewrite any previously
        # recorded ``best_set`` that points at it.
        candidate = list(feature_set)
        candidate.append(feature)

        new_x_norm, y, _, _ = getDataSubSet(candidate)
        # Only the second and third returned values are used here; they are
        # treated as the best model and its accuracy for this candidate.
        _, calc_best_model, calc_accuracy = manual_cross_validation(
            new_x_norm, y, models, names, True)

        if calc_accuracy > max_accuracy:
            max_accuracy = calc_accuracy
            best_set = candidate
            best_model = calc_best_model

            # The candidate improved the score, so try to grow it further.
            deeper_accuracy, deeper_set, deeper_model = recursive_tree_exploration(
                candidate, max_accuracy, calc_best_model)
            if deeper_accuracy > max_accuracy:
                max_accuracy = deeper_accuracy
                best_set = deeper_set
                best_model = deeper_model

    # Parenthesised single-argument print produces the same output under the
    # Python 2 print statement and the Python 3 print function.
    print(best_set)
    print("BEST SET ACCURACY: " + str(max_accuracy))
    print("BEST SET MODEL: " + str(best_model))
    return max_accuracy, best_set, best_model
def tree_selection_heuristic():
    """Run the recursive feature-set search from every single-feature seed.

    For each index ``i`` in ``1 .. feature_amount - 1`` the one-element set
    ``[i]`` is scored with ``manual_cross_validation`` and then handed to
    ``recursive_tree_exploration``. The best accuracy, feature set and model
    over all seeds are tracked, the winning set and accuracy are printed, and
    the winning model is returned.

    Returns:
        The model associated with the highest accuracy found, or ``None`` if
        no seed produced an accuracy above ``0.0`` (including the case where
        ``feature_amount`` is 1 or less and the loop never runs).
    """
    max_accuracy = 0.0
    best_set = []
    best_model = None

    # NOTE(review): seeds start at 1, so feature index 0 is never evaluated
    # here -- confirm that skipping it is intentional.
    for seed in xrange(1, feature_amount):
        # A fresh list per seed: reusing one list across iterations would
        # let a later seed overwrite a result that still references it.
        feature_set = [seed]
        new_x_norm, y, _, _ = getDataSubSet(feature_set)

        # Score the bare seed to obtain the baseline for the recursive search.
        _, seed_model, seed_accuracy = manual_cross_validation(
            new_x_norm, y, models, names, True)

        calc_accuracy, calc_best_set, calc_best_model = recursive_tree_exploration(
            feature_set, seed_accuracy, seed_model)

        if calc_accuracy > max_accuracy:
            max_accuracy = calc_accuracy
            # Keep an independent copy of the winning set.
            best_set = list(calc_best_set)
            best_model = calc_best_model

    # Parenthesised single-argument print produces the same output under the
    # Python 2 print statement and the Python 3 print function.
    print("-----------------------------------")
    print("---------BEST FEATURE SET----------")
    print(best_set)

    print("-----------------------------------")
    print("-------------ACCURACY--------------")
    print(max_accuracy)

    return best_model
    size = pow(2,n)
    # Vars to select the best suited model
    bestModel = None
    bestName = "none"
    bestMean = 0.0
    bestSet = []
    for ii in xrange(size):
        subset = []
        for jj in xrange(n):
            if (ii & (1 << jj)) > 0:
                subset.append(set[jj])
        print subset
        if (len(subset)>0):
            try:

                X,y,X_test,Y_test= getDataSubSet(subset)
                scaler = preprocessing.MinMaxScaler()
                scaler.fit(X)
                x_norm=scaler.transform(X)

                #initialize models
                forest= RandomForestClassifier(n_estimators=100, max_depth=20, random_state=111)
                gdBoost = GradientBoostingClassifier(random_state=111)
                mlp = MLPClassifier(solver='lbfgs', alpha=1e-5,hidden_layer_sizes=(10, 2), random_state=111)
                models=[forest,gdBoost,mlp]
                names= ["Random Forest", "Gradient Boosting", "MuliLayer Perceptrons"]




                ##Using all the features
示例#4
0
import json

from sklearn import metrics as m
from sklearn import preprocessing
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.externals import joblib
from sklearn.feature_selection import RFECV
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import VarianceThreshold
from sklearn.feature_selection import chi2

from GetDataSet import getDataSubSet
from Validation import manual_cross_validation

X, y, X_test, Y_test = getDataSubSet(
    [3, 5, 34, 35, 38, 39, 40, 42, 51, 52, 56, 60, 62, 99])

# Tally the "Easy" labels and the total number of instances in the test
# split, to keep an eye on the label distribution used by the experiment.
totalCount = len(Y_test)
easyCount = sum(1 for idx in xrange(totalCount) if Y_test[idx] == "Easy")
print("Ratio of Easy over all on testing set: %0.2f" %
      (float(easyCount) / len(Y_test)))

# Restart the tally, this time over the training labels.
easyCount = sum(1 for idx in xrange(len(y)) if y[idx] == "Easy")
示例#5
0
from imblearn.over_sampling import SMOTE

from GetDataSet import getDataSubSet

# Feature indices used for this run. Earlier selections are kept below,
# commented out, for reference.
#features = [1,2,3,4,5,6,7,44]+[x for x in xrange(8,15)]+[31,32,33,34,45,50]+[c for c in xrange(35,44)]+[56,57,58,60,65,67,68,84]
#features =[3, 5, 34, 35, 38, 39, 40, 42, 51, 52, 56, 60, 62, 99]
features = list(range(100, 300))
print(features)

# Trackers for the best-performing model found so far.
bestModel = None
bestName = "none"
bestMean = 0.0
bestSet = []

X, y, X_test, Y_test = getDataSubSet(features)

# Fit a min-max scaler on the training features and scale them in one step.
scaler = preprocessing.MinMaxScaler()
x_norm = scaler.fit_transform(X)

# Initialize models. Only gradient boosting is active; the random forest,
# the MLP and the cross-validation call are currently commented out.
#forest= RandomForestClassifier(n_estimators=9100, max_depth=300, random_state=111)
gdBoost = GradientBoostingClassifier(random_state=111)
# mlp = MLPClassifier(solver='lbfgs', alpha=1e-5,hidden_layer_sizes=(10, 2), random_state=111)
# models=[forest,gdBoost,mlp]
# names= ["Random Forest", "Gradient Boosting", "MuliLayer Perceptrons"]
#
#
#
# ##Using all the features
# model,name,mean=manual_cross_validation(x_norm, y, models, names, True)