예제 #1
0
	def run(self, _training, _model, _batchSize, _resultFile):
		"""Evaluate a model on growing prefixes of the randomized training data.

		The training CSV is loaded, passed through ``randomize(1000)`` and
		``removeIndices()``, and then cut into prefixes made of 1, 2, ...
		complete batches of ``_batchSize`` rows. Every prefix is written to
		``<self.resultFolder>subset_<i>.csv`` and handed to
		``Experiment.regression`` (called with the model and the value 10);
		the header/data pair it returns is added to a ResultMatrix, which is
		finally saved to ``_resultFile``.

		Rows beyond the last complete batch are never part of any prefix.

		:param _training: path of the training CSV file
		:param _model: model passed on to ``Experiment.regression``
		:param _batchSize: number of rows by which each prefix grows
		:param _resultFile: target passed to ``ResultMatrix.save``
		"""
		source = CSV(_training)
		source.randomize(1000)
		source.removeIndices()

		results = ResultMatrix()

		# Only complete batches are counted; a trailing partial batch is dropped.
		batchCount = int(len(source.data) / _batchSize)
		for step in range(batchCount):
			rowLimit = (step + 1) * _batchSize

			# Prefix covering the first (step + 1) batches, same header as the source.
			part = CSV()
			part.header = source.header
			part.data = source.data[0:rowLimit]

			target = self.resultFolder + "subset_" + str(step) + ".csv"
			part.save(target)

			experiment = Experiment(target)
			columns, values = experiment.regression([_model], 10)
			results.add(columns, values)

		results.save(_resultFile)
예제 #2
0
    def exportWeights(self, _features, _file):
        """Write the weights stored in ``self.weights`` to a CSV file.

        One row is produced per entry of ``self.weights``. A row lists the
        weight stored for every name in ``_features`` (in that order); a
        feature that is missing from the entry is written as 0. When
        ``self.classes`` is non-empty, the header gains the two leading
        columns ``class0`` and ``class1`` and every row is prefixed with the
        ``self.classes`` element at the same index.

        :param _features: list of feature names defining the column order
        :param _file: target passed to ``CSV.save``
        """
        table = CSV()
        hasClasses = len(self.classes) > 0

        # Two extra leading columns hold the class labels, if there are any.
        table.header = (['class0', 'class1'] + _features) if hasClasses else _features

        for idx in range(len(self.weights)):
            entry = self.weights[idx]

            # Weights in header order; features without a stored weight become 0.
            cells = [str(entry[name] if name in entry else 0) for name in _features]
            if hasClasses:
                # NOTE(review): relies on self.classes[idx] being a list of
                # strings (list concatenation + str.join) - confirm.
                cells = self.classes[idx] + cells

            table.data.append(','.join(cells))

        table.save(_file)
예제 #3
0
from models.randomforest.RandomForest import RandomForest
from experiment.Experiment import Experiment
from code.CodeGenerator import CodeGenerator
from data.CSV import CSV
from data.ResultMatrix import ResultMatrix
import numpy as np
import matplotlib.pyplot as plt
from plot.PlotTool import PlotTool
from plot.ResultVisualizer import ResultVisualizer

# Training data set and the random forest model evaluated in this example
# (configured with trees = 10 and depth = 5).
training = "../examples/mnoA.csv"
model = RandomForest()
model.config.trees = 10
model.config.depth = 5

# Run a 10-fold cross validation of the model under the id "example_rf_mdi".
rfExperiment = Experiment(training, "example_rf_mdi")
rfExperiment.regression([model], 10)

# Load "features_0.csv" from the experiment as a matrix, normalize its rows,
# sort it by mean and store the result for plotting.
featureFile = rfExperiment.path("features_0.csv")
importance = CSV(featureFile).toMatrix()
importance.normalizeRows()
importance.sortByMean()
importance.save(rfExperiment.path("rf_features.csv"))

# Draw the stored feature importances as a bar chart and save it as a PNG
# named after the experiment id.
plotter = ResultVisualizer()
plotter.barchart(
    rfExperiment.path("rf_features.csv"),
    xlabel="Feature",
    ylabel="Relative Feature Importance",
    savePNG=rfExperiment.path(rfExperiment.id + ".png"),
)
# Baseline: cross-validate the model on the full training data and keep the
# result file as "subset_0.csv" (no feature removed yet, hence the tick "None").
e = Experiment(training, "example_feature_reduction")
e.regression([model], 10)
CSV(e.path("cv_0.csv")).save(e.path("subset_0.csv"))
xTicks = ["None"]

# obtain a feature ranking
M = CSV(e.path("features_0.csv")).toMatrix()
M.normalizeRows()
M.sortByMean()

# Working copy of the training data. It is loaded once, before the loop, so
# that the column removals below accumulate from one iteration to the next.
# `csv` has to be bound here: nothing else in this script defines it, and
# using it unbound inside the loop raises a NameError.
csv = CSV(training)

# sequentially remove the least important feature from the training data and retrain the model
subset = e.path("subset.csv")
for i in range(len(M.header) - 1):
    # NOTE(review): takes the last header entry as the least important feature;
    # presumably sortByMean() orders the columns accordingly - confirm.
    key = M.header[-1]
    M.header = M.header[0:-1]
    csv.removeColumnWithKey(key)
    csv.save(subset)

    # Retrain on the reduced data and keep the result under the next index.
    e = Experiment(subset, "example_feature_reduction")
    e.regression([model], 10)
    CSV(e.path("cv_0.csv")).save(e.path("subset_" + str(i + 1) + ".csv"))
    xTicks.append(key)

# Compare the R2 results of all runs, one box per removed feature.
files = [e.path("subset_" + str(i) + ".csv") for i in range(len(xTicks))]
ResultVisualizer().boxplots(files,
                            "r2",
                            xTicks,
                            xlabel='Sequentially Removed Features',
                            ylabel='R2',
                            savePNG=e.path("example_feature_reduction.png"))