# Imports assumed at module level by the run() functions below. aggregatedComparison,
# clustering, fspLib, and plotting are project-local modules; locationTest is a
# project helper assumed to be defined elsewhere.
import time

import aggregatedComparison
import clustering
import fspLib
import plotting

from pyspark.mllib.regression import LabeledPoint
from pyspark.mllib.tree import RandomForest, RandomForestModel
from pyspark.sql.types import BooleanType


def run(jobNm, sc, sqlContext, inputFile, lPolygon, dictFile, inputPartitions=-1, writeFileOutput=True, bByDate=False, strStop='', modelName='random forest', num_features=-1):
    stopSet = set(strStop.split(',')) if strStop != '' else set()

    t0 = time.time()

    #Read in data
    t1 = time.time()
    records = aggregatedComparison.loadPoint(sc, sqlContext, inputFile, inputPartitions).cache()
    nGoodTweets = records.count()
    diff = time.time() - t1
    print "GEQE: Time to read in data:", diff

    #Read in dictionary
    t1 = time.time()
    revLookup = []
    lStop = []
    fDict = None
    if dictFile[:3] == 's3:' or dictFile[:5] == 'hdfs:':
        # read dict file from hdfs
        fDict = sc.textFile(dictFile).collect()
    else:
        # read from local file
        fDict = open(dictFile, "r")
    for line in fDict:
        terms = line.split("\t")
        revLookup.append(terms[0])
        if terms[0] in stopSet:
            lStop.append(terms[1])
    diff = time.time() - t1
    print "GEQE: Time to read in dict:", diff

    tAndP, nInApply, nOutApply = None, 0, 0
    if bByDate == True:
        print "GEQE: Generating event model"
    else:
        print "GEQE: Generating location model"
    (tAndP, nInApply, nOutApply) = locationTest(sc, sqlContext, lPolygon, lStop, modelName=modelName, num_features=num_features)

    t1 = time.time()
    print "GEQE: Generating ROC from Truth and predictions"
    plotting.generateROCCurve(tAndP, nInApply, nOutApply, jobNm)
    diff = time.time() - t1
    print "GEQE: Time to make ROC:", diff
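
# --- Usage sketch (not part of the original module) --------------------------
# A minimal driver for the validation run() above. The module name
# "validatePolygon", the file paths, and the lPolygon placeholder are all
# hypothetical; lPolygon must use whatever structure fspLib and locationTest
# actually expect.
def _exampleValidationDriver():
    from pyspark import SparkContext
    from pyspark.sql import SQLContext
    import validatePolygon  # hypothetical module name for the run() above

    sc = SparkContext(appName="geqeValidate")
    sqlContext = SQLContext(sc)
    lPolygon = []  # placeholder: polygon(s) of interest in the project's expected format
    validatePolygon.run("demoValidation", sc, sqlContext,
                        "hdfs:///data/tweets.parquet",   # hypothetical input path
                        lPolygon,
                        "hdfs:///data/dictionary.tsv",   # hypothetical dictionary path
                        strStop="rt,amp",
                        modelName="random forest")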
############# ############# ############# ############# #############
def run(jobNm, sc, sqlContext, inputFile, dictFile, bByDate=False, inputPartitions=-1, sNum=30, modelPath=None, bWriteMonitor=False, writeFileOutput=False):
    # import monitoring if needed
    if bWriteMonitor == True:
        import plotting

    #Create monitoring plot and associated vectors
    mPX = range(8)
    mPY = [0.] * 8
    mSL = ["Create Feature Map", "Read in Data", "Aggregate for M.L.", "Read in Model", "Apply Model", "Output Results"]
    mInd = 0

    t0 = time.time()

    #Find the word document frequency for the corpus
    #this is used for an idf score used in feature vector formation
    t1 = time.time()
    revLookup = []
    fDict = None
    if dictFile[:3] == 's3:' or dictFile[:5] == 'hdfs:':
        # read dict file from hdfs
        fDict = sc.textFile(dictFile).collect()
    else:
        # read from local file
        fDict = open(dictFile, "r")
    for line in fDict:
        terms = line.split("\t")
        revLookup.append(terms[0])
    nVecLen = len(revLookup)
    t2 = time.time()
    diff = t2 - t1
    print "Time to read dict", diff
    if bWriteMonitor:
        mPY[mInd] = diff
        mInd = mInd + 1
        plotting.updateMonitorPlot(mPX, mPY, mSL, jobNm)

    #Read in data and filter out entries with no valid words
    t1 = time.time()
    print 'inputFile ', inputFile
    print 'inputPartitions ', inputPartitions
    records = aggregatedComparison.loadPoint(sc, sqlContext, inputFile, inputPartitions)
    nGoodTweets = records.count()
    t2 = time.time()
    print "Number of good tweets:", nGoodTweets
    diff = t2 - t1
    print "Time to read in data", diff
    if bWriteMonitor:
        mPY[mInd] = diff
        mInd = mInd + 1
        plotting.updateMonitorPlot(mPX, mPY, mSL, jobNm)

    #Format data for ML input
    t1 = time.time()
    mlApply = None
    if bByDate:
        mlApply = records.map(lambda x: (x.key, [LabeledPoint(1.0, x.vector), x.lat, x.lon, x.size, x.binSize, x.dt])).cache()
    else:
        mlApply = records.map(lambda x: (x.key, [LabeledPoint(1.0, x.vector), x.lat, x.lon, x.size, x.binSize])).cache()
    nApp = mlApply.count()
    t2 = time.time()
    print "Number of collapsed points:", nApp
    diff = t2 - t1
    print "Time to map points", diff
    if bWriteMonitor:
        mPY[mInd] = diff
        mInd = mInd + 1
        plotting.updateMonitorPlot(mPX, mPY, mSL, jobNm)

    # Read in Model
    t1 = time.time()
    model_Tree = RandomForestModel.load(sc, modelPath)
    t2 = time.time()
    diff = t2 - t1
    print "Time to read in model", diff
    if bWriteMonitor:
        mPY[mInd] = diff
        mInd = mInd + 1
        plotting.updateMonitorPlot(mPX, mPY, mSL, jobNm)

    # apply model
    t1 = time.time()
    predictions_Tree = model_Tree.predict(mlApply.map(lambda x: x[1][0].features))
    vecAndPredictions = mlApply.zip(predictions_Tree)
    vecAndPredictions.cache()
    vecAndPredictions.count()
    t2 = time.time()
    diff = t2 - t1
    print "Time to apply model: ", diff
    if bWriteMonitor:
        mPY[mInd] = diff
        mInd = mInd + 1
        plotting.updateMonitorPlot(mPX, mPY, mSL, jobNm)

    #Get the results
    t1 = time.time()
    resultSet = clustering.locationBasedOutputV2(bByDate, jobNm, vecAndPredictions, sNum, revLookup, writeFileOutput, [])
    t2 = time.time()
    diff = t2 - t1
    print "Time to create json objects for output: ", diff
    if bWriteMonitor:
        mPY[mInd] = diff
        mInd = mInd + 1
        plotting.updateMonitorPlot(mPX, mPY, mSL, jobNm)

    diff = time.time() - t0
    print "<----------BOOM GOES THE DYNOMITE!---------->"
    print "< total number of tweets:", nGoodTweets
    print "< total process Time:", diff
    print "< total idf vector length:", nVecLen
    print "<------------------------------------------->"
    return resultSet
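
# --- Usage sketch (not part of the original module) --------------------------
# Shows how the apply-side run() above pairs with the training run() functions
# below: training saves its forest to modelSavePath + jobNm, and that same path
# is what this function expects as modelPath. Module names and paths here are
# hypothetical placeholders.
def _exampleApplySavedModel():
    from pyspark import SparkContext
    from pyspark.sql import SQLContext
    import applyModel  # hypothetical module name for the run() above

    sc = SparkContext(appName="geqeApply")
    sqlContext = SQLContext(sc)
    # If a training job was run with modelSavePath="hdfs:///models/" and
    # jobNm="cityModel", the saved forest lives at "hdfs:///models/cityModel".
    return applyModel.run("demoApply", sc, sqlContext,
                          "hdfs:///data/new_tweets.parquet",  # hypothetical input path
                          "hdfs:///data/dictionary.tsv",      # hypothetical dictionary path
                          bByDate=False,
                          modelPath="hdfs:///models/cityModel")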
def run(jobNm, sc, sqlContext, inputFile, lPolygon, dictFile, nDataType=0, inputPartitions=-1, sNum=30, modelSavePath=None, bWriteMonitor=False, writeFileOutput=True, strStop=''):
    if bWriteMonitor:
        import plotting

    bc_lTargetPolygons = sc.broadcast(lPolygon)
    stopSet = set(strStop.split(',')) if strStop != '' else set()

    #Create monitoring plot and associated vectors
    mPX = range(7)
    mPY = [0.] * 7
    mSL = ["Initial Read", "Calculate IDF", "Partition for M.L.", "Create Training Vector", "Train Model", "Apply Model", "Prepare Output Data"]
    mInd = 0

    t0 = time.time()

    #Read in data and filter out entries with no valid words
    t1 = time.time()
    records = aggregatedComparison.loadPoint(sc, sqlContext, inputFile, inputPartitions)
    nGoodTweets = records.count()
    t2 = time.time()
    print "Number of good points:", nGoodTweets
    diff = t2 - t1
    print "Time to read in and filter nonscorable words", diff
    if bWriteMonitor:
        mPY[mInd] = diff
        mInd = mInd + 1
        plotting.updateMonitorPlot(mPX, mPY, mSL, jobNm)

    #Find the word document frequency for the corpus
    #this is used for an idf score used in feature vector formation
    t1 = time.time()
    revLookup = []
    lStop = []
    if dictFile[:3] == 's3:' or dictFile[:5] == 'hdfs:':
        # read dict file from hdfs
        fDict = sc.textFile(dictFile).collect()
    else:
        # read from local file
        fDict = open(dictFile, "r")
    for line in fDict:
        terms = line.split("\t")
        revLookup.append(terms[0])
        if terms[0] in stopSet:
            lStop.append(terms[1])
    nVecLen = len(revLookup)
    t2 = time.time()
    diff = t2 - t1
    print "Time to read dict: ", diff
    if bWriteMonitor:
        mPY[mInd] = diff
        mInd = mInd + 1
        plotting.updateMonitorPlot(mPX, mPY, mSL, jobNm)

    # Split the data into training and apply samples.
    # Training data has two parts, and the remainder is the application data:
    #   i.)   in the region and in the time window
    #   ii.)  in the region but outside the time window
    #   iii.) out of region -- data to apply the model to
    t1 = time.time()
    sqlContext.registerFunction("inRegionOfInterest", lambda lat, lon: fspLib.inROI(lat, lon, bc_lTargetPolygons), returnType=BooleanType())
    sqlContext.registerFunction("inEventOfInterest", lambda lat, lon, date: fspLib.inEOI(lat, lon, date, bc_lTargetPolygons), returnType=BooleanType())
    sqlContext.registerFunction("outOfEventOfInterest", lambda lat, lon, dt: fspLib.outEOI(lat, lon, dt, bc_lTargetPolygons), returnType=BooleanType())
    df1 = sqlContext.sql("SELECT * from records WHERE inRegionOfInterest(records.lat,records.lon)").cache()
    df1.registerTempTable("df1")
    df1_inTime = sqlContext.sql("SELECT * from df1 WHERE inEventOfInterest(df1.lat,df1.lon,df1.dt)").cache()
    #df1_outTime = sqlContext.sql("SELECT * from df1 WHERE outOfEventOfInterest(df1.lat,df1.lon,df1.dt)").cache()
    dfn1 = sqlContext.sql("SELECT * from records WHERE NOT inRegionOfInterest(records.lat,records.lon)")
    df1_inTime.registerTempTable("df1_inTime")
    #df1_outTime.registerTempTable("df1_outTime")
    #nL1T1 = df1_inTime.count()
    #nL1T0 = df1_outTime.count()
    exempDict = aggregatedComparison.exemplarDict(df1_inTime, revLookup)
    t2 = time.time()
    #print nL1T1, "events in region in time,", nL1T0, "events in region out of time"
    diff = t2 - t1
    print "Time to partition by time", diff
    if bWriteMonitor:
        mPY[mInd] = diff
        mInd = mInd + 1
        plotting.updateMonitorPlot(mPX, mPY, mSL, jobNm)

    # Create training vectors from in region data
    t1 = time.time()
    groupedIn = df1_inTime.map(lambda x: (x.key, [LabeledPoint(1.0, x.vector), x.lat, x.lon, x.size, x.binSize])).cache()
    #groupedOut = df1_outTime.map(lambda x: (x.key, [LabeledPoint(-1.0, x.vector), x.lat, x.lon, x.size, x.binSize])).cache()
    groupedOut = dfn1.map(lambda x: (x.key, [LabeledPoint(-1.0, x.vector), x.lat, x.lon, x.size, x.binSize, x.dt])).cache()
    nSignal = float(groupedIn.count())
    nBack = float(groupedOut.count())
    scaleFactor = 10. * nSignal / nBack
    (mlApply, groupedUse) = groupedOut.randomSplit([1 - scaleFactor, scaleFactor])
    mlApply.cache()
    mlTrain = groupedIn.union(groupedUse).cache()
    if len(lStop) != 0:
        mlTrain = mlTrain.map(lambda x: aggregatedComparison.removeStopWords(x, lStop))
    mlTrain.cache()
    nTotTrain = mlTrain.count()
    t2 = time.time()
    print nTotTrain, "entries for training"
    diff = t2 - t1
    print "Time to get data ready for model by time", diff
    if bWriteMonitor:
        mPY[mInd] = diff
        mInd = mInd + 1
        plotting.updateMonitorPlot(mPX, mPY, mSL, jobNm)

    # train model
    t1 = time.time()
    model_Tree = RandomForest.trainRegressor(mlTrain.map(lambda x: x[1][0]),
                                             categoricalFeaturesInfo={},
                                             numTrees=2000,
                                             featureSubsetStrategy="auto",
                                             impurity="variance",
                                             maxDepth=4,
                                             maxBins=32)
    if modelSavePath is not None:
        if modelSavePath[-1] != "/":
            modelSavePath = modelSavePath + "/"
        model_Tree.save(sc, modelSavePath + jobNm)
    t2 = time.time()
    diff = t2 - t1
    print "Time to train model", diff
    if bWriteMonitor:
        mPY[mInd] = diff
        mInd = mInd + 1
        plotting.updateMonitorPlot(mPX, mPY, mSL, jobNm)

    # Apply Model to out of region data
    t1 = time.time()
    predictions_Tree = model_Tree.predict(mlApply.map(lambda x: x[1][0].features))
    vecAndPredictions = mlApply.zip(predictions_Tree)
    vecAndPredictions.cache()
    vecAndPredictions.count()
    t2 = time.time()
    #print "Number of points to score:", nApply
    diff = t2 - t1
    print "Time to aggregate and label points: ", diff
    if bWriteMonitor:
        mPY[mInd] = diff
        mInd = mInd + 1
        plotting.updateMonitorPlot(mPX, mPY, mSL, jobNm)

    #Get the results
    t1 = time.time()
    resultSet = clustering.locationBasedOutputV2(True, jobNm, vecAndPredictions, sNum, revLookup, writeFileOutput, exempDict)
    t2 = time.time()
    diff = t2 - t1
    print "Time to create json objects for output: ", diff
    if bWriteMonitor:
        mPY[mInd] = diff
        plotting.updateMonitorPlot(mPX, mPY, mSL, jobNm)

    diff = time.time() - t0
    print "<----------BOOM GOES THE DYNOMITE!---------->"
    print "< total number of tweets:", nGoodTweets
    print "< total process Time:", diff
    print "< total idf vector length:", nVecLen
    print "<------------------------------------------->"
    return resultSet
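
# --- Illustration (not part of the original module) --------------------------
# The training run() above keeps roughly ten out-of-region (background) points
# per in-region (signal) point for training, and routes the remaining
# background to the apply sample via randomSplit. A small sketch of that weight
# computation; the clamp is an added safeguard for the case nBack < 10*nSignal,
# which the original code assumes never happens.
def backgroundSplitWeights(nSignal, nBack, ratio=10.0):
    scaleFactor = ratio * float(nSignal) / float(nBack)
    scaleFactor = min(scaleFactor, 1.0)  # keep randomSplit weights non-negative
    return [1.0 - scaleFactor, scaleFactor]

# e.g. 500 in-region points vs. 1,000,000 background points:
#   backgroundSplitWeights(500, 1000000) -> [0.995, 0.005]
# groupedOut.randomSplit(...) would then send ~0.5% of the background to
# training (groupedUse) and ~99.5% to the apply sample (mlApply).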
def run(jobNm, sc, sqlContext, inputFile, lPolygon, dictFile, nDataType=0, inputPartitions=-1, sNum=30, modelSavePath=None, bWriteMonitor=False, writeFileOutput=True, strStop=''):
    if bWriteMonitor:
        import plotting

    bc_lTargetPolygons = sc.broadcast(lPolygon)
    stopSet = set(strStop.split(',')) if strStop != '' else set()

    #Create monitoring plot and associated vectors
    mPX = range(7)
    mPY = [0.] * 7
    mSL = ["Initial Read", "Calculate IDF", "Partition for M.L.", "Create Training Vector", "Train Model", "Apply Model", "Prepare Output Data"]
    mInd = 0

    t0 = time.time()

    #Read in data and filter out entries with no valid words
    t1 = time.time()
    print 'inputFile ', inputFile
    print 'inputPartitions ', inputPartitions
    records = aggregatedComparison.loadPoint(sc, sqlContext, inputFile, inputPartitions)
    nGoodTweets = records.count()
    t2 = time.time()
    print "Number of good tweets:", nGoodTweets
    diff = t2 - t1
    print "Time to read in and filter nonscorable words", diff
    if bWriteMonitor:
        mPY[mInd] = diff
        mInd = mInd + 1
        plotting.updateMonitorPlot(mPX, mPY, mSL, jobNm)

    #Find the word document frequency for the corpus
    #this is used for an idf score used in feature vector formation
    t1 = time.time()
    revLookup = []
    lStop = []
    fDict = None
    if dictFile[:3] == 's3:' or dictFile[:5] == 'hdfs:':
        # read dict file from hdfs
        fDict = sc.textFile(dictFile).collect()
    else:
        # read from local file
        fDict = open(dictFile, "r")
    for line in fDict:
        terms = line.split("\t")
        revLookup.append(terms[0])
        if terms[0] in stopSet:
            lStop.append(terms[1])
    nVecLen = len(revLookup)
    t2 = time.time()
    diff = t2 - t1
    print "Time to read dict:", diff
    if bWriteMonitor:
        mPY[mInd] = diff
        mInd = mInd + 1
        plotting.updateMonitorPlot(mPX, mPY, mSL, jobNm)

    # Split the data into training and apply samples.
    # Training data has two parts: points inside the r.o.i., and a sample of the points outside the r.o.i.
    t1 = time.time()
    sqlContext.registerFunction("inRegionOfInterest", lambda lat, lon: fspLib.inROI(lat, lon, bc_lTargetPolygons), returnType=BooleanType())
    df1 = sqlContext.sql("SELECT * from records WHERE inRegionOfInterest(records.lat,records.lon)").cache()
    df1.registerTempTable("df1")
    nIn = df1.count()
    dfn1 = sqlContext.sql("SELECT * from records WHERE NOT inRegionOfInterest(records.lat,records.lon)").cache()
    dfn1.registerTempTable("dfn1")
    nOut = dfn1.count()
    modelDict = aggregatedComparison.exemplarDict(df1, revLookup)
    t2 = time.time()
    diff = t2 - t1
    print "Time to find in and out of ROI", diff
    print "N in:", nIn, ", N out:", nOut
    if bWriteMonitor:
        mPY[mInd] = diff
        mInd = mInd + 1
        plotting.updateMonitorPlot(mPX, mPY, mSL, jobNm)

    # Create training vectors from in region data, and sample of out region data
    t1 = time.time()
    #grouped = aggregatedComparison.createAggregatedLabledPoint(df1, False, fBinSize, bc_dIDF, True, bc_lStopWords, nGoodTweets, 1.0)
    #grouped2 = aggregatedComparison.createAggregatedLabledPoint(dfn1, False, fBinSize, bc_dIDF, True, bc_lStopWords, nGoodTweets, -1.0)
    #nSignal = float(grouped.count())
    #nBack = float(grouped2.count())
    groupedIn = df1.map(lambda x: (x.key, [LabeledPoint(1.0, x.vector), x.lat, x.lon, x.size, x.binSize])).cache()
    groupedOut = dfn1.map(lambda x: (x.key, [LabeledPoint(-1.0, x.vector), x.lat, x.lon, x.size, x.binSize])).cache()
    scaleFactor = (10. * nIn) / float(nOut)
    (mlApply, groupedUse) = groupedOut.randomSplit([1 - scaleFactor, scaleFactor])
    mlTrain = groupedIn.union(groupedUse).cache()
    if len(lStop) != 0:
        mlTrain = mlTrain.map(lambda x: aggregatedComparison.removeStopWords(x, lStop))
    nTotTrain = mlTrain.count()
    mlApply.cache()
    nApply = mlApply.count()
    t2 = time.time()
    print nTotTrain, "entries for training"
    diff = t2 - t1
    print "Time to get data ready for model by time", diff
    if bWriteMonitor:
        mPY[mInd] = diff
        mInd = mInd + 1
        plotting.updateMonitorPlot(mPX, mPY, mSL, jobNm)

    # train model
    t1 = time.time()
    model_Tree = RandomForest.trainRegressor(mlTrain.map(lambda x: x[1][0]),
                                             categoricalFeaturesInfo={},
                                             numTrees=100,
                                             featureSubsetStrategy="auto",
                                             impurity="variance",
                                             maxDepth=4,
                                             maxBins=32)
    if modelSavePath is not None:
        if modelSavePath[-1] != "/":
            modelSavePath = modelSavePath + "/"
        model_Tree.save(sc, modelSavePath + jobNm)
    t2 = time.time()
    diff = t2 - t1
    print "Time to train model", diff
    if bWriteMonitor:
        mPY[mInd] = diff
        mInd = mInd + 1
        plotting.updateMonitorPlot(mPX, mPY, mSL, jobNm)

    # apply model
    t1 = time.time()
    predictions_Tree = model_Tree.predict(mlApply.map(lambda x: x[1][0].features))
    vecAndPredictions = mlApply.zip(predictions_Tree)
    vecAndPredictions.cache()
    vecAndPredictions.count()
    t2 = time.time()
    diff = t2 - t1
    print "Time to apply model: ", diff
    if bWriteMonitor:
        mPY[mInd] = diff
        mInd = mInd + 1
        plotting.updateMonitorPlot(mPX, mPY, mSL, jobNm)

    #Get the results
    t1 = time.time()
    resultSet = clustering.locationBasedOutputV2(False, jobNm, vecAndPredictions, sNum, revLookup, writeFileOutput, modelDict)
    t2 = time.time()
    diff = t2 - t1
    print "Time to create json objects for output: ", diff
    if bWriteMonitor:
        mPY[mInd] = diff
        plotting.updateMonitorPlot(mPX, mPY, mSL, jobNm)

    diff = time.time() - t0
    print "<----------BOOM GOES THE DYNOMITE!---------->"
    print "< total number of tweets:", nGoodTweets
    print "< total process Time:", diff
    print "< total idf vector length:", nVecLen
    print "<------------------------------------------->"
    return resultSet
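
# --- Illustration (not part of the original module) --------------------------
# Dictionary-file format assumed by the dict-reading loops in each run() above:
# one term per line, tab separated, with the term text in column 0 and
# (presumably) its feature index in column 1; only those two columns are read.
# The sample content below is hypothetical.
def _exampleDictParse():
    sampleDict = ["coffee\t0", "beach\t1", "rt\t2"]
    stopSet = set("rt".split(","))          # as if strStop="rt"
    revLookup, lStop = [], []
    for line in sampleDict:
        terms = line.split("\t")
        revLookup.append(terms[0])          # term text, indexed by feature position
        if terms[0] in stopSet:
            lStop.append(terms[1])          # token later handed to removeStopWords
    return revLookup, lStop                 # (['coffee', 'beach', 'rt'], ['2'])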