Exemplo n.º 1
0
def readUptake(config):
    """Load the combined uptake train+valid DataFrame, building and
    caching it (plus the feature drop-list) on first use.

    :param config: dict with 'output', 'input', and 'feature' sections.
    :return: pandas DataFrame of joined train and validation data.
    """
    info("Getting uptake data.")

    datadir = getDataDir(config)
    outputConfig = config['output']
    dataName = setFile(datadir, outputConfig['name'])
    dlFile = setFile(datadir, config['feature']['dropList'])

    # Fast path: both the cached frame and the drop-list already exist.
    if isFile(dataName) and isFile(dlFile):
        info("Loading previously create data frames")
        return getJoblib(dataName)

    inputConfig = config['input']
    trainData = readCSV(setFile(datadir, inputConfig['train']))
    validData = readCSV(setFile(datadir, inputConfig['valid']))

    ## Remove 'market' from validation data
    validData.drop(labels='market', axis=1, inplace=True)

    pddf = joinTrainValid(trainData, validData)
    info("Saving training and validation data")
    saveJoblib(dataName, pddf, outputConfig['compress'])
    info("Wrote training and validation data to " + dataName)

    info("Saving feature data")
    writeDropList(dlFile, pddf, dlData=None)

    return pddf
Exemplo n.º 2
0
def getTrainData(config):
    """Load the cached training features and labels.

    :param config: project configuration dict.
    :return: (X_train, y_train) tuple, or None when either file is missing.
    """
    names = getTrainTestNames(config)
    X_trainName, y_trainName = names[0], names[3]

    # Guard clause: bail out early if either artifact is absent.
    if not (isFile(X_trainName) and isFile(y_trainName)):
        error("Train data is not ready")
        return None

    loaded = []
    for fname in (X_trainName, y_trainName):
        info("Loading {0}".format(fname), ind=4)
        frame = getJoblib(fname)
        info("Found data that is {0}".format(getDim(frame)), ind=4)
        loaded.append(frame)

    return loaded[0], loaded[1]
Exemplo n.º 3
0
def runModels(config, models=None, force=False):
    """Train, evaluate, and persist each requested estimator.

    :param config: project configuration dict.
    :param models: model name, list of model names, or None for no-op.
    :param force: when True, retrain models that already have a saved file.
    :return: dict mapping model name to its performance metrics.
    """
    X_train, X_test, X_valid, y_train, y_test, y_valid = createData(config)

    # BUG FIX: the original turned models=None into [None], which would
    # crash inside getModelFileName; treat None as "nothing to run".
    if models is None:
        models = []
    elif not isinstance(models, list):
        models = [models]

    perfs = {}
    for modelname in models:
        modelFileName = getModelFileName(config, modelname)
        if isFile(modelFileName):
            info("Already have {0} estimator.".format(modelname))
            if force is False:
                continue
            else:
                info("Will rerun {0} estimator.".format(modelname))

        clf = trainModel(modelname, X_train, y_train, config)
        tval = testModel(modelname, clf, X_test, config)
        perf = getModelPerformance(y_test, tval, config)

        modelResults = {
            "name": modelname,
            "estimator": clf,
            "test": tval,
            "perf": perf
        }
        info("Saving {0} estimator".format(modelname), ind=2)
        saveTrainedModel(config, modelResults)
        perfs[modelname] = perf

    # BUG FIX: perfs was accumulated but never returned in the original.
    return perfs
Exemplo n.º 4
0
def dropData(pddf, config):
    """Drop (in place) the columns flagged with Drop?==1 in the drop-list file.

    :param pddf: pandas DataFrame to prune; modified in place.
    :param config: dict with 'basepath', 'name', and 'feature' sections.
    :return: None. No-op when the drop-list file does not exist.
    """
    info("Dropping columns", ind=4)
    basepath = config['basepath']
    name = config['name']
    dropListFile = config['feature']['dropList']

    dname = setSubDir(basepath, ['data', name])
    dlFile = setFile(dname, dropListFile)
    if not isFile(dlFile):
        info("There is no drop file. Not doing anything.", ind=4)
        return

    # BUG FIX: the original left the file handle open; use a context manager.
    # First line of the drop-list holds the fixed-width column widths.
    with open(dlFile) as fh:
        widths = [int(x) for x in fh.readline().replace("\n", "").split(',')]
    dlData = read_fwf(dlFile, widths=widths, skiprows=1)

    drops = dlData['Feature'][dlData['Drop?'] == 1]
    info("Dropping " + getNrows(drops, asStr=True) + " columns", ind=6)
    info("Data has " + getNrows(pddf, asStr=True) + " rows and " +
         getNcols(pddf, asStr=True) + " cols",
         ind=6)
    pddf.drop(labels=drops.values, axis=1, inplace=True)
    info("Data now has " + getNrows(pddf, asStr=True) + " rows and " +
         getNcols(pddf, asStr=True) + " cols",
         ind=6)
Exemplo n.º 5
0
def analyzeColumns(pddf, config):
    """Analyze column statistics in the drop-list file and flag columns
    to drop (high cardinality, too many NAs), then rewrite the drop list.

    :param pddf: pandas DataFrame being analyzed (used for row count).
    :param config: dict with 'target', 'basepath', 'name', 'feature' sections.
    :return: None. No-op when the drop-list file does not exist.
    """
    info("Analyzing " + getNcols(pddf, asStr=True) +
         " columns to possible drops.",
         ind=2)

    targetConfig = config['target']
    targetcol = targetConfig['colname']

    basepath = config['basepath']
    name = config['name']
    dropListFile = config['feature']['dropList']

    dname = setSubDir(basepath, ['data', name])
    dlFile = setFile(dname, dropListFile)
    if not isFile(dlFile):
        info("There is no drop file. Not doing anything.", ind=4)
        return

    # BUG FIX: the original left the file handle open; use a context manager.
    # First line of the drop-list holds the fixed-width column widths.
    with open(dlFile) as fh:
        widths = [int(x) for x in fh.readline().replace("\n", "").split(',')]
    dlData = read_fwf(dlFile, widths=widths, skiprows=1)

    ## Keep record of manual overrides (rows where Drop? was pre-filled).
    ## NOTE(review): currently unused — the override re-application below
    ## is still commented out; kept so it can be re-enabled.
    overrides = dlData['Feature'][dlData['Drop?'].notnull()]

    ## Set drop to 0 initially
    dlData['Drop?'].fillna(0, inplace=True)

    ## Drop anything with high cardinality (>= 200)
    dlData['Card'] = dlData['Card'].apply(to_numeric, errors='coerce')
    dlData['Card'].fillna(0, inplace=True)
    dlData.loc[dlData['Card'] >= 200, 'Drop?'] = 1

    ## Drop columns with more than 25% missing data
    maxNA = getNrows(pddf) * 0.25
    dlData.loc[dlData['nNA'] >= maxNA, 'Drop?'] = 1

    ## Fill overrides
    #if getNrows(overrides) > 0:
    #    dlData.loc[dlData['Feature'].isin(overrides['Feature']), 'Drop?'] = overrides['Drop?']

    ## Lastly, make sure we don't drop the target
    dlData.loc[dlData['Feature'] == targetcol, "Drop?"] = 0

    ## Show features to drop (print() works on both Python 2 and 3)
    drops = dlData['Feature'][dlData['Drop?'] == 1]
    print(drops)

    ## Rewrite drop list
    writeDropList(dlFile, pddf=None, dlData=dlData)
Exemplo n.º 6
0
def readKDD99(config):
    """Load the KDD Cup '99 dataset, downloading via scikit-learn and
    caching the resulting DataFrame (plus the drop-list) on first use.

    :param config: dict with 'output' and 'feature' sections.
    :return: pandas DataFrame with named columns and a 'TARGET' column.
    """
    info("Getting KDD '99 data.")

    datadir = getDataDir(config)
    outputConfig = config['output']
    compress = outputConfig['compress']
    dataName = setFile(datadir, outputConfig['name'])

    featureConfig = config['feature']
    dlFile = setFile(datadir, featureConfig['dropList'])

    if isFile(dataName) and isFile(dlFile):
        info("Loading previously create data frames")
        pddf = getJoblib(dataName)
    else:
        info("Downloading KDD '99 data", ind=2)
        tmp = datasets.fetch_kddcup99()
        X = tmp['data']
        y = tmp['target']
        # Make y a column vector so it can be appended as the last column.
        y = y.reshape((y.shape[0], 1))
        pddf = DataFrame(append(arr=X, values=y, axis=1))

        # Probe a small sample: columns whose values support .mean() are
        # treated as numeric and converted on the full frame.
        tmp = pddf.head(n=1000)
        for column in tmp.columns:
            try:
                tmp[column].mean()
            # BUG FIX: the bare `except:` also swallowed KeyboardInterrupt
            # and SystemExit; catch only the errors a non-numeric column raises.
            except (TypeError, ValueError):
                continue
            pddf[column] = to_numeric(pddf[column], errors="coerce")

        # names.dat: first line is the target list (unused), remaining
        # lines are "colname:type" feature descriptors.
        colFile = setFile(datadir, "names.dat")
        # BUG FIX: the original left the file handle open.
        with open(colFile) as fh:
            colnames = fh.readlines()
        columns = [x.split(":")[0] for x in colnames[1:]]
        columns.append("TARGET")
        pddf.columns = columns

        info("Saving data to {0}".format(dataName))
        saveJoblib(jlfile=dataName, jldata=pddf, compress=compress)

        info("Saving feature data to {0}".format(dlFile))
        writeDropList(dlFile, pddf, dlData=None)

    return pddf
Exemplo n.º 7
0
Arquivo: mnist.py Projeto: tgadf/pymva
def getMNIST(datadir="/Users/tgadfort/Documents/pymva/data"):
    """Download the four MNIST archive files and cache extracted images.

    :param datadir: base data directory; defaults to the original
        hard-coded path for backward compatibility.
    :return: None. Files are written under <datadir>/mnist.
    """
    outdir = setDir(datadir, "mnist")

    names = [
        "train-images-idx3-ubyte.gz", "train-labels-idx1-ubyte.gz",
        "t10k-images-idx3-ubyte.gz", "t10k-labels-idx1-ubyte.gz"
    ]
    for name in names:
        url = "http://yann.lecun.com/exdb/mnist/" + name
        savename = setFile(outdir, name)

        # Only download when the archive is not already on disk.
        if not isFile(savename):
            urlretrieve(url, savename)
            statinfo = stat(savename)
            print('Succesfully downloaded', savename, statinfo.st_size,
                  'bytes.')

        # Cache the extracted image array alongside the archive.
        pname = name.replace(".gz", ".p")
        npfile = setFile(outdir, pname)
        if not isFile(npfile):
            data = extract_images(savename)
            saveJoblib(npfile, data)
Exemplo n.º 8
0
def isSplitDataReady(config):
    """Return True when all six train/test/valid split files exist on disk.

    :param config: project configuration dict.
    :return: bool.
    """
    # getTrainTestNames yields the six X/y train/test/valid file names.
    return all(isFile(fname) for fname in getTrainTestNames(config))
Exemplo n.º 9
0
def loadTrainTestData(config):
    """Load all six cached train/test/valid datasets.

    :param config: project configuration dict.
    :return: (X_train, X_test, X_valid, y_train, y_test, y_valid) tuple,
        or None (implicitly) when any file is missing.
    """
    names = getTrainTestNames(config)

    # Guard clause: every artifact must already exist on disk.
    if not all(isFile(fname) for fname in names):
        error("Train/test datasets are not ready!")
        return

    info("Loading saved final train/test datasets.", ind=2)

    loaded = []
    for fname in names:
        info("Loading {0}".format(fname), ind=4)
        frame = getJoblib(fname)
        info("Found data that is {0}".format(getDim(frame)), ind=4)
        loaded.append(frame)

    # Order matches getTrainTestNames: X_train, X_test, X_valid,
    # y_train, y_test, y_valid.
    return tuple(loaded)