Exemplo n.º 1
0
def readUptake(config):
    """
    Return the combined uptake training/validation data.

    All file names are resolved inside the directory returned by
    getDataDir(config). When both the cached data file
    (config['output']['name']) and the drop list file
    (config['feature']['dropList']) already exist, the cached object is loaded
    with getJoblib and returned. Otherwise the train and valid CSV files named
    in config['input'] are read, the 'market' column is removed from the
    validation data, the two are combined with joinTrainValid, and both the
    combined data and a new drop list are written before returning.
    """
    info("Getting uptake data.")

    dataDir = getDataDir(config)

    outCfg = config['output']
    compress = outCfg['compress']
    cacheFile = setFile(dataDir, outCfg['name'])

    inCfg = config['input']
    trainFile = setFile(dataDir, inCfg['train'])
    validFile = setFile(dataDir, inCfg['valid'])

    dropListFile = setFile(dataDir, config['feature']['dropList'])

    # Fast path: both artifacts are on disk, so reuse the cached data.
    if isFile(cacheFile) and isFile(dropListFile):
        info("Loading previously create data frames")
        return getJoblib(cacheFile)

    trainFrame = readCSV(trainFile)
    validFrame = readCSV(validFile)

    # The 'market' column is removed from the validation data before joining.
    validFrame.drop(labels='market', axis=1, inplace=True)

    combined = joinTrainValid(trainFrame, validFrame)
    info("Saving training and validation data")
    saveJoblib(cacheFile, combined, compress)
    info("Wrote training and validation data to " + cacheFile)

    info("Saving feature data")
    writeDropList(dropListFile, combined, dlData=None)

    return combined
Exemplo n.º 2
0
def dropData(pddf, config):
    """
    Drop, in place, every column of pddf that the drop list flags.

    The drop list lives at <config['basepath']>/data/<config['name']>/
    <config['feature']['dropList']>. Its first line holds the comma-separated
    column widths of the fixed-width table that follows. Every 'Feature'
    whose 'Drop?' value equals 1 is removed from pddf.

    Returns None. When the drop list file does not exist, pddf is left
    untouched.
    """
    info("Dropping columns", ind=4)
    basepath = config['basepath']
    name = config['name']
    dropListFile = config['feature']['dropList']

    dname = setSubDir(basepath, ['data', name])
    dlFile = setFile(dname, dropListFile)
    if not isFile(dlFile):
        info("There is no drop file. Not doing anything.", ind=4)
        return

    # Read only the width header, and close the handle right away instead of
    # leaving it to the garbage collector.
    with open(dlFile) as dlHandle:
        header = dlHandle.readline()
    widths = [int(x) for x in header.replace("\n", "").split(',')]

    # skiprows=1 skips the width header that was just parsed.
    dlData = read_fwf(dlFile, widths=widths, skiprows=1)

    drops = dlData['Feature'][dlData['Drop?'] == 1]
    info("Dropping " + getNrows(drops, asStr=True) + " columns", ind=6)
    info("Data has " + getNrows(pddf, asStr=True) + " rows and " +
         getNcols(pddf, asStr=True) + " cols",
         ind=6)
    pddf.drop(labels=drops.values, axis=1, inplace=True)
    info("Data now has " + getNrows(pddf, asStr=True) + " rows and " +
         getNcols(pddf, asStr=True) + " cols",
         ind=6)
Exemplo n.º 3
0
def analyzeColumns(pddf, config):
    """
    Flag columns that should be dropped and rewrite the drop list file.

    The drop list lives at <config['basepath']>/data/<config['name']>/
    <config['feature']['dropList']>. Its first line holds the comma-separated
    column widths of the fixed-width table that follows.

    A feature is flagged ('Drop?' = 1) when its 'Card' value is at least 200
    or its 'nNA' value is at least 25% of the number of rows in pddf. Blank
    'Drop?' cells are treated as 0 and cells already set to 1 stay set. The
    target column (config['target']['colname']) is never flagged.

    The flagged features are printed and the updated table is written back
    with writeDropList. Returns None. When the drop list file does not exist
    nothing is done.
    """
    info("Analyzing " + getNcols(pddf, asStr=True) +
         " columns to possible drops.",
         ind=2)

    targetcol = config['target']['colname']

    basepath = config['basepath']
    name = config['name']
    dropListFile = config['feature']['dropList']

    dname = setSubDir(basepath, ['data', name])
    dlFile = setFile(dname, dropListFile)
    if not isFile(dlFile):
        info("There is no drop file. Not doing anything.", ind=4)
        return

    # Read only the width header, and close the handle right away instead of
    # leaving it to the garbage collector.
    with open(dlFile) as dlHandle:
        header = dlHandle.readline()
    widths = [int(x) for x in header.replace("\n", "").split(',')]
    dlData = read_fwf(dlFile, widths=widths, skiprows=1)

    # Blank 'Drop?' cells default to 0. Assign the result back rather than
    # calling fillna(inplace=True) on the column selection, which is not
    # guaranteed to update dlData itself.
    # NOTE: a pre-filled 'Drop?' of 0 is not protected from the rules below.
    dlData['Drop?'] = dlData['Drop?'].fillna(0)

    # Drop anything with high cardinality (>= 200). Non-numeric 'Card'
    # entries are coerced to NaN and then counted as 0.
    dlData['Card'] = to_numeric(dlData['Card'], errors='coerce').fillna(0)
    dlData.loc[dlData['Card'] >= 200, 'Drop?'] = 1

    # Drop anything with at least 25% missing data
    maxNA = getNrows(pddf) * 0.25
    dlData.loc[dlData['nNA'] >= maxNA, 'Drop?'] = 1

    # Lastly, make sure we don't drop the target
    dlData.loc[dlData['Feature'] == targetcol, "Drop?"] = 0

    # Show features to drop (single-argument print works on Python 2 and 3)
    drops = dlData['Feature'][dlData['Drop?'] == 1]
    print(drops)

    # Rewrite drop list
    writeDropList(dlFile, pddf=None, dlData=dlData)
Exemplo n.º 4
0
def getModelFileName(config, modelname):
    """
    Build the file name under getModelsDir(config) for the named model.

    The name has the form "<prefix>-<modelname>.p". The prefix is "regressor"
    for regression problems and "classifier" for classification problems, as
    decided by config['problem']. For any other problem type the prefix is
    None, so the name starts with the text "None".
    """
    problem = config['problem']
    if isRegression(problem):
        kind = "regressor"
    elif isClassification(problem):
        kind = "classifier"
    else:
        kind = None

    return setFile(getModelsDir(config), "{0}-{1}.p".format(kind, modelname))
Exemplo n.º 5
0
def readKDD99(config):
    """
    Return the KDD '99 data as a data frame, building it on first use.

    All file names are resolved inside the directory returned by
    getDataDir(config). When both the cached data file
    (config['output']['name']) and the drop list file
    (config['feature']['dropList']) exist, the cached object is loaded with
    getJoblib and returned.

    Otherwise the data set is fetched with datasets.fetch_kddcup99() and the
    target is appended as the last column. Columns whose first 1000 rows can
    be averaged are converted to numeric, with unparsable values coerced to
    NaN. Column names are read from "names.dat" in the data directory: line 1
    is skipped, and each later line gives one name (the text before ':').
    The last column is named "TARGET". The frame and a new drop list are
    then written to disk.
    """
    info("Getting KDD '99 data.")

    datadir = getDataDir(config)
    outputConfig = config['output']
    compress = outputConfig['compress']
    dataName = setFile(datadir, outputConfig['name'])

    featureConfig = config['feature']
    dlFile = setFile(datadir, featureConfig['dropList'])

    if isFile(dataName) and isFile(dlFile):
        info("Loading previously create data frames")
        return getJoblib(dataName)

    info("Downloading KDD '99 data", ind=2)
    bunch = datasets.fetch_kddcup99()
    X = bunch['data']
    y = bunch['target']
    # Make the target a single column so it can be appended to the features.
    y = y.reshape((y.shape[0], 1))
    pddf = DataFrame(append(arr=X, values=y, axis=1))

    # Probe a small sample: a column whose mean can be computed is treated
    # as numeric and converted over the full frame.
    sample = pddf.head(n=1000)
    for column in sample.columns:
        try:
            sample[column].mean()
            pddf[column] = to_numeric(pddf[column], errors="coerce")
        except Exception:
            # Best effort: leave columns that cannot be averaged or converted
            # as they are. Unlike a bare except, this does not swallow
            # KeyboardInterrupt or SystemExit.
            continue

    colFile = setFile(datadir, "names.dat")
    with open(colFile) as colHandle:
        colnames = colHandle.readlines()
    # The first line of names.dat is not a column name and is skipped.
    columns = [x.split(":")[0] for x in colnames[1:]]
    columns.append("TARGET")
    pddf.columns = columns

    info("Saving data to {0}".format(dataName))
    saveJoblib(jlfile=dataName, jldata=pddf, compress=compress)

    info("Saving feature data to {0}".format(dlFile))
    writeDropList(dlFile, pddf, dlData=None)

    return pddf
Exemplo n.º 6
0
def generateRouteFeatures(data, savedir="/Users/tgadfort/Documents/pymva/axa"):
    """
    Compute route features for every driver and save them with joblib.

    data    -- mapping of driver ID to that driver's data; each value is
               passed to createRoutes together with its driver ID.
    savedir -- directory that receives "driverPaths.p". The default is the
               previously hard-coded location.

    Returns None. The result written to disk is a dict keyed by driver ID.
    """
    t0 = start()
    features = {}
    for i, (driverID, driverData) in enumerate(data.items()):
        # Single-argument print call: valid on both Python 2 and Python 3.
        print("Process driver {0}".format(driverID))
        features[driverID] = createRoutes(driverData, driverID)
        # Report progress on every fifth driver.
        if i % 5 == 0:
            inter(t0, i, len(data))
    end(t0)

    savefile = setFile(savedir, "driverPaths.p")
    saveJoblib(savefile, features, compress=True)
Exemplo n.º 7
0
def readTrips(savedir="/Users/tgadfort/Documents/pymva/axa"):
    """
    Read the trips of every driver directory and save them with joblib.

    Driver IDs are the base names of the directories that findDirs returns
    for the Axa telematics folder. Each driver's trips are loaded with
    readDriverTrips.

    savedir -- directory that receives "driverData.p". The default is the
               previously hard-coded location.

    Returns None. The result written to disk is a dict keyed by driver ID.
    """
    # This directory stays fixed here: readDriverTrips only receives the
    # driver ID, so it locates the trip files on its own.
    driverDirs = findDirs(
        "/Users/tgadfort/Documents/pymva/axa/Axa-Insurance-Telematics-Kaggle")
    drivers = [getBasename(x) for x in driverDirs]

    data = {}
    for driverID in drivers:
        # Single-argument print call: valid on both Python 2 and Python 3.
        print("Reading trips from driver {0}".format(driverID))
        data[driverID] = readDriverTrips(driverID)

    savefile = setFile(savedir, "driverData.p")
    saveJoblib(savefile, data, compress=True)
Exemplo n.º 8
0
Arquivo: mnist.py Projeto: tgadf/pymva
def getMNIST():
    """
    Download the four MNIST archives and store an extracted copy of each.

    Files go into the "mnist" directory under the project data directory.
    An archive is downloaded only when it is not already on disk. The
    extracted copy (same name with ".gz" replaced by ".p") is written with
    saveJoblib only when it does not exist yet.
    """
    outdir = setDir("/Users/tgadfort/Documents/pymva/data", "mnist")

    archives = (
        "train-images-idx3-ubyte.gz",
        "train-labels-idx1-ubyte.gz",
        "t10k-images-idx3-ubyte.gz",
        "t10k-labels-idx1-ubyte.gz",
    )
    for archive in archives:
        localArchive = setFile(outdir, archive)

        # Fetch the archive only when there is no local copy.
        if not isFile(localArchive):
            urlretrieve("http://yann.lecun.com/exdb/mnist/" + archive,
                        localArchive)
            nbytes = stat(localArchive).st_size
            print('Succesfully downloaded', localArchive, nbytes, 'bytes.')

        extracted = setFile(outdir, archive.replace(".gz", ".p"))
        if isFile(extracted):
            continue

        # NOTE(review): extract_images is also applied to the two label
        # archives -- confirm it accepts them.
        saveJoblib(extracted, extract_images(localArchive))
Exemplo n.º 9
0
def getTrainTestNames(config):
    """
    Return the six file names used for the train/test/valid splits.

    Each name is built with setFile inside getDataDir(config). The result is
    a tuple ordered as:
    (X_train, X_test, X_valid, y_train, y_test, y_valid).
    """
    dataDir = getDataDir(config)
    splitFiles = (
        "X_train.p",
        "X_test.p",
        "X_valid.p",
        "y_train.p",
        "y_test.p",
        "y_valid.p",
    )
    return tuple(setFile(dataDir, fname) for fname in splitFiles)
Exemplo n.º 10
0
def generateDriverModels(data, savedir="/Users/tgadfort/Documents/pymva/axa"):
    """
    Build a DriverModel per driver and save the aggregate matrices.

    data    -- mapping of driver ID to that driver's data; each value is
               passed to DriverModel together with its driver ID.
    savedir -- directory that receives "driverModels.p". The default is the
               previously hard-coded location.

    For each driver the model's agg_mat is stored after nan_to_num has
    replaced NaN/inf entries with finite numbers.

    Returns None. The result written to disk is a dict keyed by driver ID.
    """
    t0 = start()
    features = {}
    for i, (driverID, driverData) in enumerate(data.items()):
        # Single-argument print call: valid on both Python 2 and Python 3.
        print("Process driver {0}".format(driverID))
        dm = DriverModel(driverID, driverData)
        features[driverID] = nan_to_num(dm.agg_mat)
        # Report progress on every fifth driver.
        if i % 5 == 0:
            inter(t0, i, len(data))
    end(t0)

    savefile = setFile(savedir, "driverModels.p")
    saveJoblib(savefile, features, compress=True)
Exemplo n.º 11
0
def plotConfusionMatrix(perfs, config, outdir, ext, pp=None):
    """
    Plot the row-normalized confusion matrix of the 'xgboost' model.

    perfs  -- performance results; the matrix is read from
              perfs['xgboost']['Confusion']['matrix'].
    config -- configuration passed to getTargetNames for the class labels.
    outdir -- directory for the plot when it is saved as its own file.
    ext    -- file extension of the individually saved plot.
    pp     -- optional multipage PDF object. When given, the figure is added
              to it instead of being saved as a separate file.

    Each row of the matrix is divided by its row sum before plotting.
    Returns None. Nothing is plotted when the matrix is missing.
    """

    title = 'Confusion Matrix'
    normalize = True
    cmap = plt.cm.Blues

    try:
        cm = perfs['xgboost']['Confusion']['matrix']
    except (KeyError, TypeError):
        # No xgboost results, or no confusion matrix stored for them.
        info("No xgboost confusion matrix found. Not plotting {0}".format(
            title), ind=4)
        return
    classes = getTargetNames(config)

    if normalize:
        # Divide every row by its total so the cells are per-class fractions.
        cm = cm.astype('float') / cm.sum(axis=1)[:, newaxis]

    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=45)
    plt.yticks(tick_marks, classes)

    fmt = '.2f' if normalize else 'd'
    # Cells darker than half the maximum get white text for contrast.
    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j,
                 i,
                 format(cm[i, j], fmt),
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")

    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    # Lay out after the axis labels exist so they are taken into account.
    plt.tight_layout()

    if pp is not None:
        info("Saving {0} plot to multipage pdf".format(title), ind=4)
        pp.savefig()
    else:
        plotname = setFile(outdir, ".".join([title, ext]))
        info("Saving {0} plot to {1}".format(title, plotname), ind=4)
        plt.savefig(plotname)

    plt.close()
Exemplo n.º 12
0
def plotResults(perfs, y_truth, config):
    """
    Produce all performance plots for the configured problem type.

    perfs   -- dict of model name to performance results. Models with empty
               results are removed from this dict (in place) and not plotted.
    y_truth -- accepted but not used by this function.
    config  -- configuration. config['performance'] supplies 'ext' and
               'multipage', and config['problem'] selects the plot set.

    When 'multipage' is set and the extension is 'pdf', one PdfPages object
    ("results.pdf" in the plots directory) is passed to every plot function.
    It is closed even if a plot function raises. Otherwise the plot functions
    receive pp=None. Returns None.
    """
    info("Making Performance Plots", ind=0)

    outdir = getPlotsDir(config)
    performanceConfig = config['performance']
    ext = performanceConfig['ext']
    isPdf = ext == 'pdf'
    isMultipage = performanceConfig['multipage']
    if isMultipage and isPdf:
        pdfname = setFile(outdir, 'results.pdf')
        info("Saving all performance plots to {0}".format(pdfname), ind=2)
        pp = PdfPages(pdfname)
    else:
        info("Saving all performance plots individually as {0}".format(ext),
             ind=2)
        pp = None

    try:
        # Collect the names first so the dict is not changed while iterating.
        badModels = [x for x in perfs.keys() if len(perfs[x]) == 0]
        for modelname in badModels:
            info("Not plotting {0}".format(modelname))
            del perfs[modelname]

        if isClassification(config['problem']):
            plotKappa(perfs, outdir, ext, pp)
            plotPrecision(perfs, outdir, ext, pp)
            plotRecall(perfs, outdir, ext, pp)
            plotLogLoss(perfs, outdir, ext, pp)
            plotAccuracy(perfs, outdir, ext, pp)
            plotPrecisionRecall(perfs, outdir, ext, pp)
            plotROC(perfs, outdir, ext, pp)
            plotConfusionMatrix(perfs, config, outdir, ext, pp)

        if isRegression(config['problem']):
            plotMAE(perfs, outdir, ext, pp)
            plotMSE(perfs, outdir, ext, pp)
            plotExplainedVariance(perfs, outdir, ext, pp)
            plotR2(perfs, outdir, ext, pp)
            plotResiduals(perfs, outdir, ext, pp)

        if pp is not None:
            info("Closing multipage pdf", ind=2)
            pp.savefig()
    finally:
        # Always release the PDF file, even when a plot function failed.
        if pp is not None:
            pp.close()
Exemplo n.º 13
0
def plotResidualsAndPrediction(perfs, y_test, outdir, ext, pp=None):
    """
    Save one residual-vs-truth plot per model.

    perfs  -- dict of model name to performance results; the residuals are
              read from perfs[modelname]['Residuals'].
    y_test -- true values used as the x axis. Its name is set to "Truth".
    outdir -- directory for plots saved as individual files.
    ext    -- file extension of individually saved plots.
    pp     -- optional multipage PDF object. When given, each figure is
              added to it instead of being saved as a separate file.

    Each model gets its own figure, which is closed after saving so one
    model's points never appear in the next model's plot. Returns None.

    Note: the name attributes of y_test and of each residual object are set
    in place.
    """
    sns.set(style="whitegrid")

    x = y_test
    x.name = "Truth"

    # Snapshot the names so the loop does not depend on a live dict view.
    for modelname in list(perfs.keys()):
        y = perfs[modelname]['Residuals']
        title = "{0} Residuals And Prediction".format(modelname)
        y.name = "Residuals"

        # Plot the residuals after fitting a linear model. Keyword arguments
        # are used because newer seaborn releases reject positional x and y.
        ax = sns.residplot(x=x, y=y, lowess=True, color="b")
        ax.set_title(title)

        if pp is not None:
            info("Saving {0} plot to multipage pdf".format(title), ind=4)
            pp.savefig()
        else:
            plotname = setFile(outdir, ".".join([title, ext]))
            info("Saving {0} plot to {1}".format(title, plotname), ind=4)
            plt.savefig(plotname)

        # Start the next model on a clean figure.
        plt.close()
Exemplo n.º 14
0
def generateTripFeatures(data, savedir="/Users/tgadfort/Documents/pymva/axa"):
    """
    Compute per-trip features for every driver and save them with joblib.

    data    -- mapping of driver ID to an iterable of trips. tripFeatures is
               called on each trip's .values (presumably the trips are
               pandas objects -- confirm against the caller).
    savedir -- directory that receives "driverTripFeatures.p". The default
               is the previously hard-coded location.

    For each driver the per-trip results are stacked row-wise, and
    nan_to_num then replaces NaN/inf entries with finite numbers.

    Returns None. The result written to disk is a dict keyed by driver ID.
    """
    t0 = start()
    features = {}
    for i, (driverID, driverData) in enumerate(data.items()):
        # Single-argument print call: valid on both Python 2 and Python 3.
        print("Process driver {0}".format(driverID))

        # Collect first and stack once; stacking inside the loop would
        # re-copy the growing array for every trip.
        tripRows = [tripFeatures(trip.values) for trip in driverData]
        if len(tripRows) > 1:
            results = vstack(tripRows)
        else:
            # A single trip is kept as returned; no trips leaves None.
            results = tripRows[0] if tripRows else None

        features[driverID] = nan_to_num(results)
        # Report progress on every fifth driver.
        if i % 5 == 0:
            inter(t0, i, len(data))
    end(t0)

    savefile = setFile(savedir, "driverTripFeatures.p")
    saveJoblib(savefile, features, compress=True)
Exemplo n.º 15
0
Arquivo: bar.py Projeto: tgadf/pymva
def plotBar(perfs, value, title, outdir, ext, pp=None):
    """
    Draw a bar chart of one performance value across all models.

    perfs  -- dict of model name to performance results.
    value  -- key read from each model's results; also the base name of the
              individually saved plot file.
    title  -- plot title.
    outdir -- directory for the plot when it is saved as its own file.
    ext    -- file extension of the individually saved plot.
    pp     -- optional multipage PDF object. When given, the figure is added
              to it instead of being saved as a separate file.

    Returns None. The figure is closed after saving.
    """
    sns.set_style("whitegrid")

    # Materialize the names: on Python 3 dict.keys() is a view, not a
    # sequence, and the plotting call needs list-like input.
    modelnames = list(perfs.keys())

    values = [perfs[x][value] for x in modelnames]
    ax = sns.barplot(x=modelnames, y=values)
    ax.set_title(title)

    # Tilt the model names so long labels do not overlap.
    for item in ax.get_xticklabels():
        item.set_rotation(45)

    if pp is not None:
        info("Saving {0} plot to multipage pdf".format(title), ind=4)
        pp.savefig()
    else:
        plotname = setFile(outdir, ".".join([value, ext]))
        info("Saving {0} plot to {1}".format(title, plotname), ind=4)
        plt.savefig(plotname)

    plt.close()
Exemplo n.º 16
0
def plotROC(perfs, outdir, ext, pp=None):
    """
    Plot the ROC curve of every model on one figure.

    perfs  -- dict of model name to performance results; each entry supplies
              'AUC' and 'ROC' (with 'tpr' and 'fpr').
    outdir -- directory for the plot when it is saved as its own file.
    ext    -- file extension of the individually saved plot ("ROC.<ext>").
    pp     -- optional multipage PDF object. When given, the figure is added
              to it instead of being saved as a separate file.

    The legend shows each model's AUC to two decimals. Colours come from the
    current seaborn palette and repeat when there are more models than
    palette entries. Returns None. The figure is closed after saving.
    """
    info("Plotting ROC Curves for {0} Classifiers".format(len(perfs)))

    plt.figure()
    current_palette = sns.color_palette()
    ncolors = len(current_palette)
    for i, modelname in enumerate(perfs.keys()):
        perfdata = perfs[modelname]
        auc = perfdata['AUC']
        tpr = perfdata['ROC']['tpr']
        fpr = perfdata['ROC']['fpr']
        plt.plot(fpr,
                 tpr,
                 label='{0} ({1:0.2f})'.format(modelname, auc),
                 # Wrap around so extra models do not raise an IndexError.
                 color=current_palette[i % ncolors],
                 linestyle='-',
                 linewidth=3)

    title = "Receiver Operating Characteristic"
    value = "ROC"
    # Dashed diagonal marks the no-skill reference line.
    plt.plot([0, 1], [0, 1], 'k--', lw=2)
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    # The figure itself is left without a title; `title` is used for logging.
    plt.title('')
    plt.legend(loc="lower right")

    if pp is not None:
        info("Saving {0} plot to multipage pdf".format(title), ind=4)
        pp.savefig()
    else:
        plotname = setFile(outdir, ".".join([value, ext]))
        info("Saving {0} plot to {1}".format(title, plotname), ind=4)
        plt.savefig(plotname)

    plt.close()
Exemplo n.º 17
0
def plotResiduals(perfs, outdir, ext, pp=None):
    """
    Plot the residual distribution of every model on one figure.

    perfs  -- dict of model name to performance results; the residuals are
              read from perfs[modelname]['Residuals'].
    outdir -- directory for the plot when it is saved as its own file.
    ext    -- file extension of the individually saved plot
              ("Residuals.<ext>").
    pp     -- optional multipage PDF object. When given, the figure is added
              to it instead of being saved as a separate file.

    Residuals are capped at their 1st and 99th percentiles before plotting
    so extreme values do not stretch the axis. The capping is done on a copy.
    Returns None. Nothing is plotted when perfs is empty.
    """
    sns.set(style="whitegrid")

    title = "Residuals"
    value = "Residuals"

    if len(perfs) == 0:
        # Without at least one model there is no axis to title or save.
        info("No models to plot. Not making {0} plot".format(title), ind=4)
        return

    ax = None
    for modelname in perfs.keys():
        y = perfs[modelname]['Residuals']
        miny = np.percentile(y, 1)
        maxy = np.percentile(y, 99)
        # Work on a copy so the stored residuals stay unchanged.
        capy = np.copy(y)
        capy[capy < miny] = miny
        capy[capy > maxy] = maxy
        ax = sns.distplot(capy, rug=False, label=modelname)

    ax.set_title(title)
    ax.legend()

    if pp is not None:
        info("Saving {0} plot to multipage pdf".format(title), ind=4)
        pp.savefig()
    else:
        plotname = setFile(outdir, ".".join([value, ext]))
        info("Saving {0} plot to {1}".format(title, plotname), ind=4)
        plt.savefig(plotname)

    plt.close()
Exemplo n.º 18
0
def loadConfig(basepath="/Users/tgadfort/Documents/pymva",
               filename="config.yaml"):
    """
    Load and return the project configuration.

    basepath -- directory that holds the configuration file.
    filename -- name of the configuration file inside basepath.

    Both defaults are the previously hard-coded location, so calling
    loadConfig() with no arguments behaves as before. The file is read with
    the get helper and its result is returned unchanged.
    """
    configname = setFile(basepath, filename)
    info("Importing [{0}]".format(configname), ind=0)
    return get(configname)
Exemplo n.º 19
0
def getTrips(savedir="/Users/tgadfort/Documents/pymva/axa"):
    """
    Load and return the saved driver trip data.

    savedir -- directory that holds "driverData.p". The default is the
               previously hard-coded location.

    The file is read with getJoblib and its result is returned unchanged.
    """
    savefile = setFile(savedir, "driverData.p")
    return getJoblib(savefile)