Exemplo n.º 1
0
def getTrainData(config):
    """Load the saved training inputs and training targets.

    The file names come from ``getTrainTestNames(config)``; only the
    ``X_train`` and ``y_train`` entries are used here.

    Returns:
        ``(X_train, y_train)`` as loaded by ``getJoblib`` when both files
        exist; otherwise ``error(...)`` is called and ``None`` is returned.
    """
    # Six names are always unpacked; the test/valid ones are not needed here.
    X_trainName, _, _, y_trainName, _, _ = getTrainTestNames(config)

    if not (isFile(X_trainName) and isFile(y_trainName)):
        error("Train data is not ready")
        return None

    loaded = []
    for fileName in (X_trainName, y_trainName):
        info("Loading {0}".format(fileName), ind=4)
        contents = getJoblib(fileName)
        info("Found data that is {0}".format(getDim(contents)), ind=4)
        loaded.append(contents)

    X_train, y_train = loaded
    return X_train, y_train
Exemplo n.º 2
0
def readUptake(config):
    """Return the combined uptake train/validation data frame.

    Config keys read: ``config['output']['compress']``,
    ``config['output']['name']``, ``config['input']['train']``,
    ``config['input']['valid']`` and ``config['feature']['dropList']``.
    All file names are resolved against ``getDataDir(config)`` via
    ``setFile``.

    If both the output file and the drop-list file already exist, the
    output file is loaded with ``getJoblib`` and returned. Otherwise the
    train and validation CSVs are read, the ``'market'`` column is dropped
    from the validation frame, the two are combined with
    ``joinTrainValid``, the result is saved with ``saveJoblib`` and the
    drop list is written with ``writeDropList``.
    """
    info("Getting uptake data.")

    baseDir = getDataDir(config)

    outCfg = config['output']
    compress = outCfg['compress']
    dataName = setFile(baseDir, outCfg['name'])

    inCfg = config['input']
    trainName = setFile(baseDir, inCfg['train'])
    validName = setFile(baseDir, inCfg['valid'])

    dlFile = setFile(baseDir, config['feature']['dropList'])

    # Cached path: both artifacts are already on disk.
    if isFile(dataName) and isFile(dlFile):
        info("Loading previously create data frames")
        return getJoblib(dataName)

    trainFrame = readCSV(trainName)
    validFrame = readCSV(validName)

    # The validation frame carries a 'market' column that is removed
    # (in place) before joining.
    validFrame.drop(labels='market', axis=1, inplace=True)

    combined = joinTrainValid(trainFrame, validFrame)

    info("Saving training and validation data")
    saveJoblib(dataName, combined, compress)
    info("Wrote training and validation data to " + dataName)

    info("Saving feature data")
    writeDropList(dlFile, combined, dlData=None)

    return combined
Exemplo n.º 3
0
def readKDD99(config):
    """Return the KDD '99 data as a DataFrame, building and caching it if needed.

    Config keys read: ``config['output']['compress']``,
    ``config['output']['name']`` and ``config['feature']['dropList']``.
    File names are resolved against ``getDataDir(config)`` via ``setFile``.

    If both the output file and the drop-list file already exist, the
    output file is loaded with ``getJoblib`` and returned. Otherwise the
    data is fetched with ``datasets.fetch_kddcup99()``, the target is
    appended as the last column, numeric-looking columns are converted with
    ``to_numeric``, column names are read from ``names.dat`` in the data
    directory, and the frame plus its drop list are written to disk.

    Raises:
        Whatever ``open`` raises if ``names.dat`` is missing, and whatever
        pandas raises if the number of names does not match the number of
        columns.
    """
    info("Getting KDD '99 data.")

    datadir = getDataDir(config)
    outputConfig = config['output']
    compress = outputConfig['compress']
    dataName = setFile(datadir, outputConfig['name'])

    featureConfig = config['feature']
    dlFile = setFile(datadir, featureConfig['dropList'])

    if isFile(dataName) and isFile(dlFile):
        info("Loading previously create data frames")
        return getJoblib(dataName)

    info("Downloading KDD '99 data", ind=2)
    kdd = datasets.fetch_kddcup99()
    X = kdd['data']
    y = kdd['target']
    # Make the target a single column so it can be appended to X.
    y = y.reshape((y.shape[0], 1))
    pddf = DataFrame(append(arr=X, values=y, axis=1))

    # Probe each column on a 1000-row sample: if a mean can be computed the
    # column is treated as numeric and converted in the full frame.
    sample = pddf.head(n=1000)
    for column in sample.columns:
        try:
            sample[column].mean()
            pddf[column] = to_numeric(pddf[column], errors="coerce")
        except Exception:
            # Best-effort: a column that cannot be averaged/converted is
            # left as-is. `Exception` (not a bare except) so that
            # KeyboardInterrupt/SystemExit still propagate.
            continue

    colFile = setFile(datadir, "names.dat")
    # Context manager so the handle is closed even if parsing fails.
    with open(colFile) as colHandle:
        colnames = colHandle.readlines()

    # The first line is skipped (the original parsed it as a
    # comma-separated list but never used it). Each remaining non-blank
    # line contributes the text before its first ':' as a column name.
    columns = [
        line.split(":")[0].strip() for line in colnames[1:] if line.strip()
    ]
    columns.append("TARGET")
    pddf.columns = columns

    info("Saving data to {0}".format(dataName))
    saveJoblib(jlfile=dataName, jldata=pddf, compress=compress)

    info("Saving feature data to {0}".format(dlFile))
    writeDropList(dlFile, pddf, dlData=None)

    return pddf
Exemplo n.º 4
0
def loadTrainTestData(config):
    """Load the six saved train/test/validation datasets.

    The file names come from ``getTrainTestNames(config)``.

    Returns:
        ``(X_train, X_test, X_valid, y_train, y_test, y_valid)`` as loaded
        by ``getJoblib`` when all six files exist; otherwise ``error(...)``
        is called and ``None`` is returned.
    """
    (X_trainName, X_testName, X_validName,
     y_trainName, y_testName, y_validName) = getTrainTestNames(config)

    # Load order: all inputs first, then all targets.
    fileNames = (X_trainName, X_testName, X_validName,
                 y_trainName, y_testName, y_validName)

    # Every file is checked (no short-circuit) before deciding.
    present = [isFile(fileName) for fileName in fileNames]
    if not all(present):
        error("Train/test datasets are not ready!")
        return None

    info("Loading saved final train/test datasets.", ind=2)

    loaded = []
    for fileName in fileNames:
        info("Loading {0}".format(fileName), ind=4)
        contents = getJoblib(fileName)
        info("Found data that is {0}".format(getDim(contents)), ind=4)
        loaded.append(contents)

    return tuple(loaded)
Exemplo n.º 5
0
def getTrips(datadir="/Users/tgadfort/Documents/pymva/axa",
             filename="driverData.p"):
    """Load the saved trips data with ``getJoblib``.

    Args:
        datadir: Directory holding the saved file. Defaults to the
            location that was previously hard-coded, so existing
            zero-argument calls behave as before.
        filename: Name of the saved file inside ``datadir``.

    Returns:
        Whatever ``getJoblib`` returns for the resolved file
        (presumably per-driver trip data, judging by the file name;
        confirm against the code that writes it).
    """
    savefile = setFile(datadir, filename)
    return getJoblib(savefile)
Exemplo n.º 6
0
def getTrainedModel(config, modelname):
    """Return the saved results for the model called ``modelname``.

    The file name is resolved with ``getModelFileName(config, modelname)``
    and its contents are loaded with ``getJoblib``.
    """
    return getJoblib(jlfile=getModelFileName(config, modelname))