Example #1
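# Assumed imports for this snippet: `time` is stdlib; aggregatedComparison and
# plotting are GEQE project modules expected on the PYTHONPATH, and
# locationTest(...) is assumed to be defined elsewhere in the same module.
import time
import aggregatedComparison
import plotting
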
def run(jobNm, sc, sqlContext, inputFile, lPolygon, dictFile,
        inputPartitions=-1,
        writeFileOutput=True,
        bByDate=False,
        strStop='',
        modelName='random forest',
        num_features=-1):

    stopSet = set(strStop.split(',')) if strStop != '' else set()
    t0 = time.time()

    #Read in data
    t1 = time.time()
    records = aggregatedComparison.loadPoint(sc, sqlContext, inputFile, inputPartitions).cache()
    nGoodTweets = records.count()
    diff = time.time() - t1
    print "GEQE: Time to read in data:", diff

    #Read in dictionary
    t1 = time.time()
    revLookup = []
    lStop = []
    fDict = None
    if dictFile[:3] == 's3:' or dictFile[:5] == 'hdfs:':
        # read dict file from hdfs
        fDict = sc.textFile(dictFile).collect()
    else:
        # read from local file
        fDict = open(dictFile,"r")
    for line in fDict:
        terms = line.split("\t")
        revLookup.append(terms[0])
        if terms[0] in stopSet:
            lStop.append(terms[1])
    diff = time.time() - t1
    print "GEQE: Time to read in dict:", diff

    tAndP, nInApply, nOutApply = None, 0, 0
    if bByDate:
        print "GEQE: Generating event model"

    else:
        print "GEQE: Generating location model"
        (tAndP, nInApply, nOutApply) = locationTest(sc, sqlContext, lPolygon, lStop, modelName=modelName, num_features=num_features)

    t1 = time.time()
    print "GEQE: Generating ROC from Truth and predictions"
    plotting.generateROCCurve(tAndP,nInApply,nOutApply,jobNm)

    diff = time.time() - t1
    print "GEQE: Time to make ROC:", diff
Example #2
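# Assumed imports for this snippet: pyspark 1.x MLlib APIs plus GEQE project
# modules (aggregatedComparison, clustering) expected on the PYTHONPATH;
# plotting is imported conditionally inside run() below.
import time
from pyspark.mllib.regression import LabeledPoint
from pyspark.mllib.tree import RandomForestModel
import aggregatedComparison
import clustering
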
def run(jobNm,
        sc,
        sqlContext,
        inputFile,
        dictFile,
        bByDate=False,
        inputPartitions=-1,
        sNum=30,
        modelPath=None,
        bWriteMonitor=False,
        writeFileOutput=False):

    # import monitoring if needed
    if bWriteMonitor:
        import plotting

    #Create monitoring plot and associated vectors (one slot per monitored stage)
    mPX = range(6)
    mPY = [0.] * 6
    mSL = [
        "Create Feature Map", "Read in Data", "Aggregate for M.L.",
        "Read in Model", "Apply Model", "Output Results"
    ]
    mInd = 0

    t0 = time.time()
    #Find the word document frequency for the corpus
    #this is used for an idf score used in feature vector formation
    t1 = time.time()
    revLookup = []
    fDict = None
    if dictFile[:3] == 's3:' or dictFile[:5] == 'hdfs:':
        # read dict file from hdfs
        fDict = sc.textFile(dictFile).collect()
    else:
        # read from local file
        fDict = open(dictFile, "r")
    for line in fDict:
        terms = line.split("\t")
        revLookup.append(terms[0])

    nVecLen = len(revLookup)
    t2 = time.time()
    diff = t2 - t1
    print "Time to read dict", diff

    if bWriteMonitor:
        mPY[mInd] = diff
        mInd = mInd + 1
        plotting.updateMonitorPlot(mPX, mPY, mSL, jobNm)

    #Read in data and filter out entries with no valid words
    t1 = time.time()
    print 'inputFile ', inputFile
    print 'inputPartitions ', inputPartitions
    records = aggregatedComparison.loadPoint(sc, sqlContext, inputFile,
                                             inputPartitions)
    nGoodTweets = records.count()
    t2 = time.time()
    print "Number of good tweets:", nGoodTweets
    diff = t2 - t1
    print "Time to read in data", diff
    if bWriteMonitor:
        mPY[mInd] = diff
        mInd = mInd + 1
        plotting.updateMonitorPlot(mPX, mPY, mSL, jobNm)

    #Format data for ML input
    t1 = time.time()
    mlApply = None
    if bByDate:
        mlApply = records.map(lambda x: (x.key, [
            LabeledPoint(1.0, x.vector), x.lat, x.lon, x.size, x.binSize, x.dt
        ])).cache()
    else:
        mlApply = records.map(lambda x: (x.key, [
            LabeledPoint(1.0, x.vector), x.lat, x.lon, x.size, x.binSize
        ])).cache()
    nApp = mlApply.count()
    t2 = time.time()
    print "Number of collapsed points:", nApp
    diff = t2 - t1
    print "Time to map points", diff
    if bWriteMonitor:
        mPY[mInd] = diff
        mInd = mInd + 1
        plotting.updateMonitorPlot(mPX, mPY, mSL, jobNm)

    # Read in Model
    t1 = time.time()
    model_Tree = RandomForestModel.load(sc, modelPath)
    t2 = time.time()
    diff = t2 - t1
    print "Time to read in model", diff
    if bWriteMonitor:
        mPY[mInd] = diff
        mInd = mInd + 1
        plotting.updateMonitorPlot(mPX, mPY, mSL, jobNm)

    # apply model
    t1 = time.time()
    predictions_Tree = model_Tree.predict(
        mlApply.map(lambda x: x[1][0].features))
    vecAndPredictions = mlApply.zip(predictions_Tree)
    vecAndPredictions.cache()
    vecAndPredictions.count()
    t2 = time.time()
    diff = t2 - t1
    print "Time to apply model: ", diff
    if bWriteMonitor:
        mPY[mInd] = diff
        mInd = mInd + 1
        plotting.updateMonitorPlot(mPX, mPY, mSL, jobNm)

    #Get the results
    t1 = time.time()
    resultSet = clustering.locationBasedOutputV2(bByDate, jobNm,
                                                 vecAndPredictions, sNum,
                                                 revLookup, writeFileOutput,
                                                 [])
    t2 = time.time()
    diff = t2 - t1
    print "Time to create json objects for output: ", diff
    if bWriteMonitor:
        mPY[mInd] = diff
        mInd = mInd + 1
        plotting.updateMonitorPlot(mPX, mPY, mSL, jobNm)

    diff = time.time() - t0
    print "<----------BOOM GOES THE DYNOMITE!---------->"
    print "< total number of tweets:,", nGoodTweets
    print "< total process Time:", diff
    print "< total idf vector length:", nVecLen
    print "<------------------------------------------->"
    return resultSet
Example #3
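# Assumed imports for this snippet: pyspark 1.x MLlib/SQL APIs plus GEQE
# project modules (aggregatedComparison, clustering, fspLib) expected on the
# PYTHONPATH; plotting is imported conditionally inside run() below.
import time
from pyspark.mllib.regression import LabeledPoint
from pyspark.mllib.tree import RandomForest
from pyspark.sql.types import BooleanType
import aggregatedComparison
import clustering
import fspLib
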
def run(jobNm, sc, sqlContext, inputFile, lPolygon, dictFile,
        nDataType=0,
        inputPartitions=-1,
        sNum=30,
        modelSavePath=None,
        bWriteMonitor=False,
        writeFileOutput=True,
        strStop=''):

    if bWriteMonitor:
        import plotting

    bc_lTargetPolygons = sc.broadcast(lPolygon)
    stopSet = set(strStop.split(',')) if strStop != '' else set()

    #Create monitoring plot and associated vectors
    mPX = range(7)
    mPY = [0.]*7
    mSL = ["Initial Read", "Calculate IDF", "Partition for M.L.", "Create Training Vector", "Train Model", "Apply Model", "Prepare Output Data"]
    mInd = 0

    t0 = time.time()
    #Read in data and filter out entries with no valid words
    t1 = time.time()
    # loadPoint is assumed to register the "records" temp table queried below.
    records = aggregatedComparison.loadPoint(sc, sqlContext, inputFile, inputPartitions)
    nGoodTweets = records.count()
    t2 = time.time()
    print "Number of good points:", nGoodTweets
    diff = t2-t1
    print "Time to read in and filter nonscorable words", diff
    if bWriteMonitor:
        mPY[mInd] = diff
        mInd = mInd+1
        plotting.updateMonitorPlot(mPX, mPY, mSL, jobNm)


    #Find the word document frequency for the corpus
    #this is used for an idf score used in feature vector formation
    t1 = time.time()
    revLookup = []
    lStop = []
    if dictFile[:3] == 's3:' or dictFile[:5] == 'hdfs:':
        # read dict file from hdfs
        fDict = sc.textFile(dictFile).collect()
    else:
        # read from local file
        fDict = open(dictFile,"r")
    for line in fDict:
        terms = line.split("\t")
        revLookup.append(terms[0])
        if terms[0] in stopSet:
            lStop.append(terms[1])

    nVecLen = len(revLookup)
    t2 = time.time()
    diff = t2-t1
    print "Time to read dict: ", diff
    if bWriteMonitor:
        mPY[mInd] = diff
        mInd = mInd+1
        plotting.updateMonitorPlot(mPX, mPY, mSL, jobNm)

    # Split data into training and apply samples
    # training data is 2 parts, as well as prepare application data
    # i.)  In both the region, and in the time window
    # ii.) In the region, but outside the time window
    # iii.) Out of region, data to apply model to
    t1 = time.time()
    sqlContext.registerFunction("inRegionOfInterest", lambda lat,lon: fspLib.inROI(lat,lon,bc_lTargetPolygons),returnType=BooleanType())
    sqlContext.registerFunction("inEventOfInterest", lambda lat,lon,date: fspLib.inEOI(lat,lon,date,bc_lTargetPolygons),returnType=BooleanType())
    sqlContext.registerFunction("outOfEventOfInterest", lambda lat,lon,dt: fspLib.outEOI(lat,lon,dt,bc_lTargetPolygons),returnType=BooleanType())
    df1 = sqlContext.sql("SELECT * from records WHERE inRegionOfInterest(records.lat,records.lon)").cache()
    df1.registerTempTable("df1")
    df1_inTime = sqlContext.sql("SELECT * from df1 WHERE inEventOfInterest(df1.lat,df1.lon,df1.dt)").cache()
    #df1_outTime = sqlContext.sql("SELECT * from df1 WHERE outOfEventOfInterest(df1.lat,df1.lon,df1.dt)").cache()
    dfn1 = sqlContext.sql("SELECT * from records WHERE NOT inRegionOfInterest(records.lat,records.lon)")
    df1_inTime.registerTempTable("df1_inTime")
    #df1_outTime.registerTempTable("df1_outTime")
    #nL1T1 = df1_inTime.count()
    #nL1T0 = df1_outTime.count()
    exempDict = aggregatedComparison.exemplarDict(df1_inTime, revLookup)
    t2 = time.time()
    #print nL1T1, "events in region in time,", nL1T0, "events in region out of time"
    diff = t2-t1
    print "Time to partition by time", diff
    if bWriteMonitor:
        mPY[mInd] = diff
        mInd = mInd+1
        plotting.updateMonitorPlot(mPX, mPY, mSL, jobNm)

    # Create training vectors from in region data
    t1 = time.time()
    groupedIn = df1_inTime.map(lambda x: (x.key, [LabeledPoint(1.0, x.vector), x.lat, x.lon, x.size, x.binSize])).cache()
    #groupedOut = df1_outTime.map(lambda x: (x.key, [LabeledPoint(-1.0, x.vector), x.lat, x.lon, x.size, x.binSize])).cache()
    groupedOut = dfn1.map(lambda x: (x.key, [LabeledPoint(-1.0, x.vector), x.lat, x.lon, x.size, x.binSize, x.dt])).cache()
    nSignal = float(groupedIn.count())
    nBack = float(groupedOut.count())
    scaleFactor = 10.*nSignal/nBack
    (mlApply, groupedUse) = groupedOut.randomSplit([1-scaleFactor,scaleFactor])
    mlApply.cache()
    mlTrain = groupedIn.union(groupedUse).cache()
    if len(lStop) != 0:
        mlTrain = mlTrain.map(lambda x: aggregatedComparison.removeStopWords(x, lStop))
    mlTrain.cache()
    nTotTrain = mlTrain.count()
    t2 = time.time()
    print nTotTrain, "entries for training"
    diff = t2-t1
    print "Time to get data ready for model by time", diff
    if bWriteMonitor:
        mPY[mInd] = diff
        mInd = mInd+1
        plotting.updateMonitorPlot(mPX, mPY, mSL, jobNm)

    # train model
    t1 = time.time()
    model_Tree = RandomForest.trainRegressor(mlTrain.map(lambda x: x[1][0]),
                                             categoricalFeaturesInfo={},
                                             numTrees=2000,
                                             featureSubsetStrategy="auto",
                                             impurity="variance",
                                             maxDepth=4,
                                             maxBins=32)
    if modelSavePath is not None:
        if modelSavePath[-1] != "/": modelSavePath = modelSavePath+"/"
        model_Tree.save(sc, modelSavePath + jobNm)
    t2 = time.time()
    diff = t2-t1
    print "Time to train model", diff
    if bWriteMonitor:
        mPY[mInd] = diff
        mInd = mInd+1
        plotting.updateMonitorPlot(mPX, mPY, mSL, jobNm)

    # Apply Model to out of region data
    t1 = time.time()
    predictions_Tree = model_Tree.predict(mlApply.map(lambda x: x[1][0].features))
    vecAndPredictions = mlApply.zip(predictions_Tree)
    vecAndPredictions.cache()
    vecAndPredictions.count()
    t2 = time.time()
    #print "Number of points to score:", nApply
    diff = t2-t1
    print "Time aggregate and label points: ", diff
    if bWriteMonitor:
        mPY[mInd] = diff
        mInd = mInd+1
        plotting.updateMonitorPlot(mPX, mPY, mSL, jobNm)

    #Get the results
    t1 = time.time()
    resultSet = clustering.locationBasedOutputV2(True, jobNm, vecAndPredictions, sNum, revLookup, writeFileOutput, exempDict)
    t2 = time.time()
    diff = t2-t1
    print "Time to create json objects for output: ", diff
    if bWriteMonitor:
        mPY[mInd] = diff
        plotting.updateMonitorPlot(mPX, mPY, mSL, jobNm)

    diff = time.time() - t0
    print "<----------BOOM GOES THE DYNOMITE!---------->"
    print "< total number of tweets:,", nGoodTweets
    print "< total process Time:", diff
    print "< total idf vector length:", nVecLen
    print "<------------------------------------------->"
    return resultSet
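
The background down-sampling above deserves a closer look: scaleFactor = 10.*nSignal/nBack keeps roughly ten background points per signal point for training, and randomSplit([1-scaleFactor, scaleFactor]) routes the rest to the apply set. A standalone numeric sketch with made-up counts:

nSignal, nBack = 1000., 200000.           # in-region vs. out-of-region counts
scaleFactor = 10. * nSignal / nBack       # 0.05: train on ~5% of background
weights = [1 - scaleFactor, scaleFactor]  # randomSplit fractions: apply, train
assert abs(sum(weights) - 1.0) < 1e-9
# Expected split: ~190000 background points to score and ~10000 for training,
# i.e. a ~1:10 signal-to-background ratio in the training set. Note that if
# nBack < 10*nSignal the first weight goes negative and the split is no longer
# meaningful, so the background is presumably assumed to dominate the input.
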
Example #4
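# Assumed imports for this snippet: pyspark 1.x MLlib/SQL APIs plus GEQE
# project modules (aggregatedComparison, clustering, fspLib) expected on the
# PYTHONPATH; plotting is imported conditionally inside run() below.
import time
from pyspark.mllib.regression import LabeledPoint
from pyspark.mllib.tree import RandomForest
from pyspark.sql.types import BooleanType
import aggregatedComparison
import clustering
import fspLib
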
def run(jobNm,
        sc,
        sqlContext,
        inputFile,
        lPolygon,
        dictFile,
        nDataType=0,
        inputPartitions=-1,
        sNum=30,
        modelSavePath=None,
        bWriteMonitor=False,
        writeFileOutput=True,
        strStop=''):

    if bWriteMonitor:
        import plotting

    bc_lTargetPolygons = sc.broadcast(lPolygon)
    stopSet = set(strStop.split(',')) if strStop != '' else set()

    #Create monitoring plot and associated vectors
    mPX = range(7)
    mPY = [0.] * 7
    mSL = [
        "Initial Read", "Calculate IDF", "Partition for M.L.",
        "Create Training Vector", "Train Model", "Apply Model",
        "Prepare Output Data"
    ]
    mInd = 0

    t0 = time.time()
    #Read in data and filter out entries with no valid words
    t1 = time.time()
    print 'inputFile ', inputFile
    print 'inputPartitions ', inputPartitions
    # loadPoint is assumed to register the "records" temp table queried below.
    records = aggregatedComparison.loadPoint(sc, sqlContext, inputFile,
                                             inputPartitions)
    nGoodTweets = records.count()
    t2 = time.time()
    print "Number of good tweets:", nGoodTweets
    diff = t2 - t1
    print "Time to read in and filter nonscorable words", diff
    if bWriteMonitor:
        mPY[mInd] = diff
        mInd = mInd + 1
        plotting.updateMonitorPlot(mPX, mPY, mSL, jobNm)

    #Find the word document frequency for the corpus
    #this is used for an idf score used in feature vector formation
    t1 = time.time()
    revLookup = []
    lStop = []
    fDict = None
    if dictFile[:3] == 's3:' or dictFile[:5] == 'hdfs:':
        # read dict file from hdfs
        fDict = sc.textFile(dictFile).collect()
    else:
        # read from local file
        fDict = open(dictFile, "r")
    for line in fDict:
        terms = line.split("\t")
        revLookup.append(terms[0])
        if terms[0] in stopSet:
            lStop.append(terms[1])

    nVecLen = len(revLookup)
    t2 = time.time()
    diff = t2 - t1
    print "Time to read dict:", diff
    if bWriteMonitor:
        mPY[mInd] = diff
        mInd = mInd + 1
        plotting.updateMonitorPlot(mPX, mPY, mSL, jobNm)

    # Split data into training and apply samples
    # training data is 2 parts, inside r.o.i., and a sample of the areas outside the r.o.i.
    t1 = time.time()
    sqlContext.registerFunction(
        "inRegionOfInterest",
        lambda lat, lon: fspLib.inROI(lat, lon, bc_lTargetPolygons),
        returnType=BooleanType())
    df1 = sqlContext.sql(
        "SELECT * from records WHERE inRegionOfInterest(records.lat,records.lon)"
    ).cache()
    df1.registerTempTable("df1")
    nIn = df1.count()
    dfn1 = sqlContext.sql(
        "SELECT * from records WHERE NOT inRegionOfInterest(records.lat,records.lon)"
    ).cache()
    dfn1.registerTempTable("dfn1")
    nOut = dfn1.count()
    modelDict = aggregatedComparison.exemplarDict(df1, revLookup)
    t2 = time.time()
    diff = t2 - t1
    print "Time to find in and out of ROI", diff
    print "N in:", nIn, ", N out:", nOut
    if bWriteMonitor:
        mPY[mInd] = diff
        mInd = mInd + 1
        plotting.updateMonitorPlot(mPX, mPY, mSL, jobNm)

    # Create training vectors from in region data, and sample of out region data
    t1 = time.time()
    #grouped = aggregatedComparison.createAggregatedLabledPoint(df1, False, fBinSize, bc_dIDF, True, bc_lStopWords, nGoodTweets, 1.0)
    #grouped2 = aggregatedComparison.createAggregatedLabledPoint(dfn1, False, fBinSize, bc_dIDF, True, bc_lStopWords, nGoodTweets, -1.0)
    #nSignal = float(grouped.count())
    #nBack = float(grouped2.count())
    groupedIn = df1.map(lambda x: (x.key, [
        LabeledPoint(1.0, x.vector), x.lat, x.lon, x.size, x.binSize
    ])).cache()
    groupedOut = dfn1.map(lambda x: (x.key, [
        LabeledPoint(-1.0, x.vector), x.lat, x.lon, x.size, x.binSize
    ])).cache()
    scaleFactor = (10. * nIn) / float(nOut)
    (mlApply,
     groupedUse) = groupedOut.randomSplit([1 - scaleFactor, scaleFactor])
    mlTrain = groupedIn.union(groupedUse).cache()
    if len(lStop) != 0:
        mlTrain = mlTrain.map(
            lambda x: aggregatedComparison.removeStopWords(x, lStop))
    nTotTrain = mlTrain.count()
    mlApply.cache()
    nApply = mlApply.count()
    t2 = time.time()
    print nTotTrain, "entries for training"
    diff = t2 - t1
    print "Time to get data ready for model by time", diff
    if bWriteMonitor:
        mPY[mInd] = diff
        mInd = mInd + 1
        plotting.updateMonitorPlot(mPX, mPY, mSL, jobNm)

    # train model
    t1 = time.time()
    model_Tree = RandomForest.trainRegressor(mlTrain.map(lambda x: x[1][0]),
                                             categoricalFeaturesInfo={},
                                             numTrees=100,
                                             featureSubsetStrategy="auto",
                                             impurity="variance",
                                             maxDepth=4,
                                             maxBins=32)
    if modelSavePath is not None:
        if modelSavePath[-1] != "/": modelSavePath = modelSavePath + "/"
        model_Tree.save(sc, modelSavePath + jobNm)
    t2 = time.time()
    diff = t2 - t1
    print "Time to train model", diff
    if bWriteMonitor:
        mPY[mInd] = diff
        mInd = mInd + 1
        plotting.updateMonitorPlot(mPX, mPY, mSL, jobNm)

    # apply model
    t1 = time.time()
    predictions_Tree = model_Tree.predict(
        mlApply.map(lambda x: x[1][0].features))
    vecAndPredictions = mlApply.zip(predictions_Tree)
    vecAndPredictions.cache()
    vecAndPredictions.count()
    t2 = time.time()
    diff = t2 - t1
    print "Time to apply model: ", diff
    if bWriteMonitor:
        mPY[mInd] = diff
        mInd = mInd + 1
        plotting.updateMonitorPlot(mPX, mPY, mSL, jobNm)

    #Get the results
    t1 = time.time()
    resultSet = clustering.locationBasedOutputV2(False, jobNm,
                                                 vecAndPredictions, sNum,
                                                 revLookup, writeFileOutput,
                                                 modelDict)
    t2 = time.time()
    diff = t2 - t1
    print "Time to create json objects for output: ", diff
    if bWriteMonitor:
        mPY[mInd] = diff
        plotting.updateMonitorPlot(mPX, mPY, mSL, jobNm)

    diff = time.time() - t0
    print "<----------BOOM GOES THE DYNOMITE!---------->"
    print "< total number of tweets:,", nGoodTweets
    print "< total process Time:", diff
    print "< total idf vector length:", nVecLen
    print "<------------------------------------------->"
    return resultSet
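
Examples #2 and #4 form a save/apply pair: when modelSavePath is given, this run() persists the forest with model_Tree.save(sc, modelSavePath + jobNm), and the apply-only run() in Example #2 reloads it with RandomForestModel.load. A minimal round-trip sketch reusing names from the function above, with a hypothetical path:

from pyspark.mllib.tree import RandomForestModel

modelPath = "hdfs:///models/demoJob"   # hypothetical save location
model_Tree.save(sc, modelPath)         # persist the trained forest
reloaded = RandomForestModel.load(sc, modelPath)
predictions = reloaded.predict(mlApply.map(lambda x: x[1][0].features))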