# -*- coding: utf-8 -*-
import dataMngr as dm
import compInfo


def preprocess(fileName, train=True):
    """Turn a raw data file into input/output arrays and save them.

    The file is read with ``dm.file2Dataframe`` and converted with
    ``dm.processDataframeForNP``; the destination names come from
    ``dm.makeFileName``.

    Parameters:
        fileName: path of the raw data file to process.
        train: forwarded to ``dm.processDataframeForNP``; the output array
            is only saved when this is truthy.

    Returns:
        The ``(inputArr, outputArr)`` pair produced by
        ``dm.processDataframeForNP``.
    """
    frame = dm.file2Dataframe(fileName)
    inputArr, outputArr = dm.processDataframeForNP(frame, train)
    inputPath, outputPath = dm.makeFileName(fileName)

    # Only training runs save the output array; the input array is always
    # saved.
    if train:
        dm.save(outputPath, outputArr)
    dm.save(inputPath, inputArr)

    return inputArr, outputArr


# Driver: preprocess the 500-point sample file and report the array shapes.
print(compInfo.dataPoints500File)
inA, outA = preprocess(compInfo.dataPoints500File)
print(inA.shape)
print(outA.shape)

# Reload the sample arrays from the output directory and print their shapes.
# NOTE(review): presumably these are the files preprocess() just wrote --
# confirm the names against dm.makeFileName.
loadedInA = dm.load(
    compInfo.outputDataDirectory + 'Innocentive_500_Sample_input.np')
loadedOutA = dm.load(
    compInfo.outputDataDirectory + 'Innocentive_500_Sample_output.np')
print(loadedInA.shape)
print(loadedOutA.shape)
def makeTestOutput(modelfp, inTestFileName, outTestFileName):
    """Load a saved model and saved test inputs, then generate test output.

    Parameters:
        modelfp: path handed to ``dm.load`` to obtain the model.
        inTestFileName: path handed to ``dm.load`` to obtain the test inputs.
        outTestFileName: passed through unchanged to
            ``ML.generateTestOutput``.

    Nothing is returned; the result of ``ML.generateTestOutput`` is discarded.
    """
    loadedModel = dm.load(modelfp)
    testInputs = dm.load(inTestFileName)
    ML.generateTestOutput(loadedModel, testInputs, outTestFileName)
from sklearn.ensemble import RandomForestClassifier
try:
    # scikit-learn >= 0.18 exposes cross-validation helpers here.
    from sklearn.model_selection import cross_val_score
except ImportError:
    # Older releases only ship the legacy module (removed in 0.20).
    from sklearn.cross_validation import cross_val_score
import dataMngr as dm


def generateModel(model, modelArrParams, modelDictParams):
    """Instantiate ``model`` without fitting it.

    Parameters:
        model: callable (e.g. an estimator class) to instantiate.
        modelArrParams: positional arguments for ``model``; ``None`` means
            no positional arguments.
        modelDictParams: keyword arguments for ``model``; ``None`` means
            no keyword arguments.

    Returns:
        The object returned by ``model(*modelArrParams, **modelDictParams)``.
    """
    if modelArrParams is None:
        modelArrParams = []
    if modelDictParams is None:
        modelDictParams = {}
    return model(*modelArrParams, **modelDictParams)


def generateModelScores(stdModel, inData, outData, modelFileName,
                        statisticsFileName):
    """Cross-validate ``stdModel``, save it with its mean score, return the mean.

    Parameters:
        stdModel: estimator passed to ``cross_val_score``.
        inData: input samples.
        outData: targets; flattened with ``.ravel()`` before scoring.
        modelFileName: destination passed to ``dm.save`` for the model.
        statisticsFileName: destination passed to ``dm.save`` for the mean
            score.

    Returns:
        ``scores.mean()`` of the cross-validation scores.
    """
    outData = outData.ravel()
    scores = cross_val_score(stdModel, inData, outData)
    # NOTE(review): cross_val_score appears to fit clones of the estimator,
    # so the stdModel saved below is presumably still unfitted -- confirm
    # whether a fitted model is expected by whoever loads it.
    saveModelAndScores(modelFileName, stdModel, statisticsFileName, scores)
    return scores.mean()


def saveModelAndScores(modelFileName, stdModel, statisticsFileName, scores):
    """Save the model and the mean of ``scores`` through ``dm.save``."""
    dm.save(modelFileName, stdModel)
    dm.save(statisticsFileName, scores.mean())


def generateModelAndScores(model, modelArrParams, modelDictParams, inData,
                           outData, numRuns):
    """Build and cross-validate ``numRuns`` fresh models, printing each score.

    Run ``x`` saves its model to ``"RF<x>.np"`` and its mean score to
    ``"RF_Stats<x>.np"``.

    Parameters:
        model, modelArrParams, modelDictParams: forwarded to
            ``generateModel`` for every run.
        inData, outData: forwarded to ``generateModelScores``.
        numRuns: number of models to generate and score.

    Returns:
        List with the mean cross-validation score of each run, in run order.
        (Previously nothing was returned, so printing the call's result
        always showed ``None``.)
    """
    meanScores = []
    # range() rather than xrange() so the loop also runs on Python 3.
    for x in range(numRuns):
        stdModel = generateModel(model, modelArrParams, modelDictParams)
        meanScore = generateModelScores(
            stdModel, inData, outData,
            "RF" + str(x) + ".np", "RF_Stats" + str(x) + ".np")
        print(meanScore)
        meanScores.append(meanScore)
    return meanScores


# Driver: five random-forest runs on the saved 500-point sample arrays.
print(generateModelAndScores(
    RandomForestClassifier, None, None,
    dm.load("Innocentive_500_Sample_input.np"),
    dm.load("Innocentive_500_Sample_output.np"),
    5))
def generateTestOutput(modelfp, testOutfp, inData):
    """Load the model stored at ``modelfp`` and generate test output.

    Parameters:
        modelfp: path handed to ``dm.load`` to obtain the model.
        testOutfp: passed through unchanged as the last argument of
            ``ML.generateTestOutput``.
        inData: already-loaded input data, passed through unchanged.

    Nothing is returned; the result of ``ML.generateTestOutput`` is discarded.
    """
    loadedModel = dm.load(modelfp)
    # ML.generateTestOutput takes (model, inputs, output path), so the last
    # two parameters of this wrapper are swapped on the way through.
    ML.generateTestOutput(loadedModel, inData, testOutfp)
def checkCrossValidation(inDatafp, outDatafp, modelInfo, numFolds):
    """Cross-validate an unfitted model built from ``modelInfo``.

    Parameters:
        inDatafp: path handed to ``dm.load`` for the input samples.
        outDatafp: path handed to ``dm.load`` for the targets; the loaded
            object is flattened with ``.ravel()``.
        modelInfo: mapping with the keys ``"model"``,
            ``"modelArrParameters"`` and ``"modelDictParameters"``, which
            are forwarded to ``generateUnfittedModel``.
        numFolds: passed as the ``cv`` argument of ``cv.cross_val_score``.

    Returns:
        Whatever ``cv.cross_val_score`` returns for the model and data.
    """
    features = dm.load(inDatafp)
    targets = dm.load(outDatafp).ravel()
    estimator = generateUnfittedModel(
        modelInfo["model"],
        modelInfo["modelArrParameters"],
        modelInfo["modelDictParameters"])
    return cv.cross_val_score(estimator, features, targets, cv=numFolds)