Exemplo n.º 1
0
def readUptake(config):
    """Load (or build and cache) the combined uptake train/valid DataFrame.

    If both the cached joblib data file and the feature drop-list file
    already exist, the cached frame is loaded.  Otherwise the raw train
    and validation CSVs are read, joined, and both cache artifacts are
    written.

    Parameters
    ----------
    config : dict
        Must contain 'output' (name, compress), 'input' (train, valid)
        and 'feature' (dropList) sections.

    Returns
    -------
    pandas.DataFrame
        The joined train/validation data.
    """
    info("Getting uptake data.")

    datadir = getDataDir(config)
    outputConfig = config['output']
    compress = outputConfig['compress']
    dataName = setFile(datadir, outputConfig['name'])

    inputConfig = config['input']
    trainName = setFile(datadir, inputConfig['train'])
    validName = setFile(datadir, inputConfig['valid'])

    featureConfig = config['feature']
    dlFile = setFile(datadir, featureConfig['dropList'])

    if isFile(dataName) and isFile(dlFile):
        # Both cache artifacts exist; skip the expensive CSV read/join.
        info("Loading previously created data frames")
        pddf = getJoblib(dataName)
    else:
        trainData = readCSV(trainName)
        validData = readCSV(validName)

        ## Remove 'market' from validation data
        validData.drop(labels='market', axis=1, inplace=True)

        pddf = joinTrainValid(trainData, validData)
        info("Saving training and validation data")
        saveJoblib(dataName, pddf, compress)
        info("Wrote training and validation data to " + dataName)

        info("Saving feature data")
        writeDropList(dlFile, pddf, dlData=None)

    return pddf
Exemplo n.º 2
0
def generateRouteFeatures(data):
    t0 = start()
    features = {}
    for i, driverID in enumerate(data.keys()):
        driverData = data[driverID]
        print "Process driver {0}".format(driverID)
        features[driverID] = createRoutes(driverData, driverID)
        if i % 5 == 0: inter(t0, i, len(data))
    end(t0)

    savefile = setFile("/Users/tgadfort/Documents/pymva/axa", "driverPaths.p")
    saveJoblib(savefile, features, compress=True)
Exemplo n.º 3
0
def readTrips():
    drivers = findDirs(
        "/Users/tgadfort/Documents/pymva/axa/Axa-Insurance-Telematics-Kaggle")
    drivers = [getBasename(x) for x in drivers]

    data = {}
    for driverID in drivers:
        print "Reading trips from driver {0}".format(driverID)
        data[driverID] = readDriverTrips(driverID)

    savefile = setFile("/Users/tgadfort/Documents/pymva/axa", "driverData.p")
    saveJoblib(savefile, data, compress=True)
Exemplo n.º 4
0
def saveTrainedModel(config, modelResults):
    """Persist a trained model's results dict to its model file.

    For tpot models the fitted pipeline is first exported as a Python
    script (the estimator object itself is not joblib-picklable) and the
    estimator is removed from the results before pickling.

    Parameters
    ----------
    config : dict
        Configuration used to resolve the model file name.
    modelResults : dict
        Must contain 'name'; for tpot, also 'estimator' (removed here).
    """
    modelname = modelResults['name']
    modelFileName = getModelFileName(config, modelname)

    if modelname == "tpot":
        # Export the generated pipeline code and drop the unpicklable
        # estimator from the payload before saving.
        tpotObj = modelResults['estimator']
        tpotFileName = modelFileName.replace(".p", ".py")
        tpotObj.export(tpotFileName)
        del modelResults['estimator']

    # Both branches previously made this identical call; do it once.
    saveJoblib(jlfile=modelFileName, jldata=modelResults, compress=True)
Exemplo n.º 5
0
def generateDriverModels(data):
    t0 = start()
    features = {}
    for i, driverID in enumerate(data.keys()):
        driverData = data[driverID]
        print "Process driver {0}".format(driverID)
        dm = DriverModel(driverID, driverData)
        results, headers = dm.agg_mat, dm.agg_headers
        results = nan_to_num(results)
        features[driverID] = results
        if i % 5 == 0: inter(t0, i, len(data))
    end(t0)

    savefile = setFile("/Users/tgadfort/Documents/pymva/axa", "driverModels.p")
    saveJoblib(savefile, features, compress=True)
Exemplo n.º 6
0
def readKDD99(config):
    """Load (or download and cache) the KDD Cup '99 dataset as a DataFrame.

    If both the cached joblib data file and the feature drop-list file
    exist, the cached frame is loaded.  Otherwise the data is fetched
    via scikit-learn, numeric columns are coerced, column names are
    read from "names.dat", and both cache artifacts are written.

    Parameters
    ----------
    config : dict
        Must contain 'output' (name, compress) and 'feature' (dropList)
        sections.

    Returns
    -------
    pandas.DataFrame
        KDD '99 features with a trailing "TARGET" column.
    """
    info("Getting KDD '99 data.")

    datadir = getDataDir(config)
    outputConfig = config['output']
    compress = outputConfig['compress']
    dataName = setFile(datadir, outputConfig['name'])

    featureConfig = config['feature']
    dlFile = setFile(datadir, featureConfig['dropList'])

    if isFile(dataName) and isFile(dlFile):
        info("Loading previously created data frames")
        pddf = getJoblib(dataName)
    else:
        info("Downloading KDD '99 data", ind=2)
        tmp = datasets.fetch_kddcup99()
        X = tmp['data']
        y = tmp['target']
        y = y.reshape((y.shape[0], 1))
        pddf = DataFrame(append(arr=X, values=y, axis=1))

        # Probe a small sample: columns whose mean() succeeds are
        # treated as numeric and coerced over the full frame.
        tmp = pddf.head(n=1000)
        for column in tmp.columns:
            try:
                tmp[column].mean()
                pddf[column] = to_numeric(pddf[column], errors="coerce")
            except Exception:
                # Non-numeric column; leave it as-is.
                continue

        # First line of names.dat lists the target labels (unused);
        # remaining lines are "<name>:<type>" feature descriptions.
        colFile = setFile(datadir, "names.dat")
        with open(colFile) as f:
            colnames = f.readlines()
        columns = [x.split(":")[0] for x in colnames[1:]]
        columns.append("TARGET")
        pddf.columns = columns

        info("Saving data to {0}".format(dataName))
        saveJoblib(jlfile=dataName, jldata=pddf, compress=compress)

        info("Saving feature data to {0}".format(dlFile))
        writeDropList(dlFile, pddf, dlData=None)

    return pddf
Exemplo n.º 7
0
def generateTripFeatures(data):
    t0 = start()
    features = {}
    for i, driverID in enumerate(data.keys()):
        driverData = data[driverID]
        print "Process driver {0}".format(driverID)
        results = None
        for j, trip in enumerate(driverData):
            tripResults = tripFeatures(trip.values)
            if results is None:
                results = tripResults
            else:
                results = vstack((results, tripResults))

        results = nan_to_num(results)
        features[driverID] = results
        if i % 5 == 0: inter(t0, i, len(data))
    end(t0)

    savefile = setFile("/Users/tgadfort/Documents/pymva/axa",
                       "driverTripFeatures.p")
    saveJoblib(savefile, features, compress=True)
Exemplo n.º 8
0
Arquivo: mnist.py Projeto: tgadf/pymva
def getMNIST():
    """Download the raw MNIST archives and cache each one as a pickle.

    Each .gz archive is fetched only if missing, then extracted and
    saved as a .p joblib file (also only if missing).
    """
    datadir = "/Users/tgadfort/Documents/pymva/data"
    outdir = setDir(datadir, "mnist")

    archives = [
        "train-images-idx3-ubyte.gz", "train-labels-idx1-ubyte.gz",
        "t10k-images-idx3-ubyte.gz", "t10k-labels-idx1-ubyte.gz"
    ]
    for archive in archives:
        url = "http://yann.lecun.com/exdb/mnist/" + archive
        savename = setFile(outdir, archive)

        if not isFile(savename):
            urlretrieve(url, savename)
            statinfo = stat(savename)
            print('Succesfully downloaded', savename, statinfo.st_size,
                  'bytes.')

        npfile = setFile(outdir, archive.replace(".gz", ".p"))
        if not isFile(npfile):
            saveJoblib(npfile, extract_images(savename))
Exemplo n.º 9
0
def getTrainTestData(pddf, config):
    """Create and persist the final train/test(/valid) datasets.

    Handles three layouts of the incoming frame:
      * an "isTrain" column  -> data arrives pre-split into train/test;
      * an "isValid" column  -> validation rows are held out, the rest is
        split 80/20 into train/test;
      * neither              -> a plain 80/20 train/test split.
    In every case the target column and the split-indicator column are
    removed from the feature matrices before saving.

    Parameters
    ----------
    pddf : pandas.DataFrame
        Full dataset; mutated in place (indicator/target columns dropped).
    config : dict
        Must contain 'target' (colname) and 'output' (compress) sections.

    Returns
    -------
    tuple
        (X_train, X_test, X_valid, y_train, y_test, y_valid); the valid
        pair is None when no "isValid" column is present.

    Raises
    ------
    ValueError
        If the configured target column is missing from the data.
    """
    info("Creating final train/test datasets.", ind=0)

    ## Config info
    targetConfig = config['target']
    targetcol = targetConfig['colname']
    outputConfig = config['output']
    compress = outputConfig['compress']

    if not isColumn(pddf, targetcol):
        raise ValueError("Target column", targetcol,
                         "is not included in data!")

    ## Determine if the data showed up split (seperate train/test files)
    isSplit = False
    isValid = False
    if isColumn(pddf, "isTrain"):
        info("Data is already split", ind=2)
        isSplit = True
    elif isColumn(pddf, "isValid"):
        info("Validation data is ready, but train/test data must be created",
             ind=2)
        isValid = True
    else:
        info("Train/test data must be created", ind=2)

    ## Create data if it's split
    if isSplit:
        info("Splitting train data", ind=2)
        # .copy() so the in-place drops below don't act on a slice view
        # of pddf (SettingWithCopy hazard).
        X_train = pddf[pddf['isTrain'] == 1].copy()
        y_train = X_train[targetcol]
        X_train.drop(labels=[targetcol, 'isTrain'], axis=1, inplace=True)

        info("Splitting test data", ind=2)
        X_test = pddf[pddf['isTrain'] == 0].copy()
        y_test = X_test[targetcol]
        X_test.drop(labels=[targetcol, 'isTrain'], axis=1, inplace=True)

        X_valid = None
        y_valid = None
    elif isValid:
        info("Splitting validation data", ind=2)
        X_valid = pddf[pddf['isValid'] == 1].copy()
        y_valid = X_valid[targetcol]
        # Drop the target and indicator from the features; previously
        # the target column leaked into X_valid/X_train/X_test here.
        X_valid.drop(labels=[targetcol, 'isValid'], axis=1, inplace=True)

        info("Creating train/test data that contains validated data", ind=2)
        X_data = pddf[pddf['isValid'] == 0].copy()
        y = X_data[targetcol]
        X_data.drop(labels=[targetcol, 'isValid'], axis=1, inplace=True)
        X_train, X_test, y_train, y_test = train_test_split(X_data,
                                                            y,
                                                            test_size=0.2)
    else:
        info("Creating train/test data that is not already split or validated",
             ind=2)
        y = pddf[targetcol]
        pddf.drop(labels=[targetcol], axis=1, inplace=True)
        X_train, X_test, y_train, y_test = train_test_split(pddf,
                                                            y,
                                                            test_size=0.2)
        X_valid = None
        y_valid = None

    # Keep the caller's frame consistent with the saved feature sets.
    if isSplit:
        info("Dropping {0} from DataFrame".format(", ".join(
            [targetcol, 'isTrain'])))
        pddf.drop(labels=[targetcol, 'isTrain'], axis=1, inplace=True)
    elif isValid:
        info("Dropping {0} from DataFrame".format(", ".join(
            [targetcol, 'isValid'])))
        pddf.drop(labels=[targetcol, 'isValid'], axis=1, inplace=True)

    X_trainName, X_testName, X_validName, y_trainName, y_testName, y_validName = getTrainTestNames(
        config)

    info("Saving {0} data to {1}".format(getDim(X_train), X_trainName), ind=4)
    saveJoblib(X_trainName, X_train, compress)
    info("Saving {0} data to {1}".format(getDim(X_test), X_testName), ind=4)
    saveJoblib(X_testName, X_test, compress)
    info("Saving {0} data to {1}".format(getDim(X_valid), X_validName), ind=4)
    saveJoblib(X_validName, X_valid, compress)

    info("Saving {0} data to {1}".format(getDim(y_train), y_trainName), ind=4)
    saveJoblib(y_trainName, y_train, compress)
    info("Saving {0} data to {1}".format(getDim(y_test), y_testName), ind=4)
    saveJoblib(y_testName, y_test, compress)
    info("Saving {0} data to {1}".format(getDim(y_valid), y_validName), ind=4)
    saveJoblib(y_validName, y_valid, compress)

    return X_train, X_test, X_valid, y_train, y_test, y_valid