def getTrainData(config):
    """Load the saved training features and targets.

    The file names come from ``getTrainTestNames(config)``; only the
    ``X_train`` and ``y_train`` entries are used here.

    Returns:
        ``(X_train, y_train)`` when both files exist. Otherwise ``error`` is
        called and ``None`` is returned (assuming ``error`` itself returns).
    """
    names = getTrainTestNames(config)
    X_trainName, X_testName, X_validName, y_trainName, y_testName, y_validName = names

    # Guard clause: both training artifacts must already be on disk.
    if not (isFile(X_trainName) and isFile(y_trainName)):
        error("Train data is not ready")
        return None

    loaded = []
    for path in (X_trainName, y_trainName):
        info("Loading {0}".format(path), ind=4)
        payload = getJoblib(path)
        info("Found data that is {0}".format(getDim(payload)), ind=4)
        loaded.append(payload)

    X_train, y_train = loaded
    return X_train, y_train
def readUptake(config):
    """Return the combined uptake train/validation data frame.

    File locations are resolved against ``getDataDir(config)`` using
    ``config['output']['name']``, ``config['input']['train']``,
    ``config['input']['valid']`` and ``config['feature']['dropList']``.

    If both the output file and the drop-list file already exist, the output
    file is loaded and returned. Otherwise the two CSV inputs are read, the
    'market' column is dropped from the validation frame, the frames are
    joined with ``joinTrainValid``, and the result plus a drop list are
    written out before returning.
    """
    info("Getting uptake data.")

    baseDir = getDataDir(config)

    outCfg = config['output']
    compress = outCfg['compress']
    cachePath = setFile(baseDir, outCfg['name'])

    inCfg = config['input']
    trainPath = setFile(baseDir, inCfg['train'])
    validPath = setFile(baseDir, inCfg['valid'])

    featCfg = config['feature']
    dropListPath = setFile(baseDir, featCfg['dropList'])

    # Cache hit: nothing to rebuild, hand back the stored frame.
    if isFile(cachePath) and isFile(dropListPath):
        info("Loading previously create data frames")
        return getJoblib(cachePath)

    trainFrame = readCSV(trainPath)
    validFrame = readCSV(validPath)

    # Remove 'market' from validation data before joining.
    validFrame.drop(labels='market', axis=1, inplace=True)

    combined = joinTrainValid(trainFrame, validFrame)

    info("Saving training and validation data")
    saveJoblib(cachePath, combined, compress)
    info("Wrote training and validation data to " + cachePath)

    info("Saving feature data")
    writeDropList(dropListPath, combined, dlData=None)

    return combined
def readKDD99(config):
    """Return the KDD Cup '99 data as a data frame, building the cache if needed.

    File locations are resolved against ``getDataDir(config)`` using
    ``config['output']['name']`` and ``config['feature']['dropList']``;
    ``config['output']['compress']`` is forwarded to ``saveJoblib``.

    If both the output file and the drop-list file already exist, the output
    file is loaded and returned. Otherwise the data is fetched with
    ``datasets.fetch_kddcup99()``, the target is appended as the last column,
    columns that look numeric are converted with ``to_numeric``, column names
    are read from ``names.dat`` in the data directory, and the frame plus a
    drop list are written out.

    Returns:
        The data frame, with the final column named ``"TARGET"`` when it is
        freshly built.
    """
    info("Getting KDD '99 data.")

    datadir = getDataDir(config)
    outputConfig = config['output']
    compress = outputConfig['compress']
    dataName = setFile(datadir, outputConfig['name'])

    featureConfig = config['feature']
    dlFile = setFile(datadir, featureConfig['dropList'])

    if isFile(dataName) and isFile(dlFile):
        info("Loading previously create data frames")
        return getJoblib(dataName)

    info("Downloading KDD '99 data", ind=2)
    raw = datasets.fetch_kddcup99()
    X = raw['data']
    y = raw['target']
    # Make the target a single column so it can be appended next to X.
    y = y.reshape((y.shape[0], 1))
    pddf = DataFrame(append(arr=X, values=y, axis=1))

    # Probe a small sample: if mean() works the column is treated as numeric
    # and the full column is converted. Columns that fail are left as they are.
    sample = pddf.head(n=1000)
    for column in sample.columns:
        try:
            sample[column].mean()
            pddf[column] = to_numeric(pddf[column], errors="coerce")
        except Exception:
            # Best-effort conversion; unlike a bare ``except`` this still lets
            # KeyboardInterrupt / SystemExit propagate.
            continue

    colFile = setFile(datadir, "names.dat")
    # Context manager so the handle is closed even if reading fails.
    with open(colFile) as handle:
        colnames = handle.readlines()
    # The first line of names.dat is skipped; for every following line the
    # text before the first ':' is the column name.
    columns = [line.split(":")[0] for line in colnames[1:]]
    columns.append("TARGET")
    pddf.columns = columns

    info("Saving data to {0}".format(dataName))
    saveJoblib(jlfile=dataName, jldata=pddf, compress=compress)

    info("Saving feature data to {0}".format(dlFile))
    writeDropList(dlFile, pddf, dlData=None)

    return pddf
def loadTrainTestData(config):
    """Load the six saved train/test/validation datasets.

    File names come from ``getTrainTestNames(config)``.

    Returns:
        ``(X_train, X_test, X_valid, y_train, y_test, y_valid)`` when every
        file exists. Otherwise ``error`` is called and ``None`` is returned
        (assuming ``error`` itself returns).
    """
    X_trainName, X_testName, X_validName, y_trainName, y_testName, y_validName = getTrainTestNames(
        config)

    # Kept in return order: features first, then targets.
    paths = (X_trainName, X_testName, X_validName,
             y_trainName, y_testName, y_validName)

    # Every path is checked (list, not generator) before deciding.
    if not all([isFile(path) for path in paths]):
        error("Train/test datasets are not ready!")
        return None

    info("Loading saved final train/test datasets.", ind=2)

    datasets_loaded = []
    for path in paths:
        info("Loading {0}".format(path), ind=4)
        payload = getJoblib(path)
        info("Found data that is {0}".format(getDim(payload)), ind=4)
        datasets_loaded.append(payload)

    X_train, X_test, X_valid, y_train, y_test, y_valid = datasets_loaded
    return X_train, X_test, X_valid, y_train, y_test, y_valid
def getTrips(basedir="/Users/tgadfort/Documents/pymva/axa", filename="driverData.p"):
    """Load the saved driver trip data via ``getJoblib``.

    Args:
        basedir: Directory holding the saved file. Defaults to the location
            that was previously hard-coded, so ``getTrips()`` behaves as
            before.
        filename: Name of the saved file inside ``basedir``.

    Returns:
        Whatever ``getJoblib`` returns for the resolved file.
    """
    savefile = setFile(basedir, filename)
    return getJoblib(savefile)
def getTrainedModel(config, modelname):
    """Return the stored results for ``modelname``.

    The file is located with ``getModelFileName(config, modelname)`` and read
    with ``getJoblib``.
    """
    return getJoblib(jlfile=getModelFileName(config, modelname))