import codecs
import json
from math import log10

from pyspark.mllib.linalg import SparseVector

import fspLib


def megaVector(ittRec, bc_dIDF, bUserStopFilter, bc_lStopWords, nTot):
    """Build a normalized TF-IDF SparseVector from an iterable of posts.

    bc_dIDF is a broadcast dictionary of usable words; the first element
    of each word's entry is used both as its document count (for the IDF
    weight) and as its position in the output vector. nTot is the total
    number of documents used for the IDF weight.
    """
    dThisPoint = {}
    dUsableWords = bc_dIDF.value
    norm = 0.0
    for post in ittRec:
        for w in fspLib.wordBreak(post.text, bUserStopFilter, bc_lStopWords):
            if w in dUsableWords and dUsableWords[w][0] != 0:
                # Accumulate the IDF weight for every occurrence of the word.
                val = log10((1. * nTot) / dUsableWords[w][0])
                norm += val
                dThisPoint[w] = dThisPoint.get(w, 0.) + val
    # No usable words: return an empty vector rather than dividing by zero.
    if norm == 0.0:
        return SparseVector(len(dUsableWords), [], [])
    # Pair each term's vector position with its normalized weight, then
    # sort by position since SparseVector requires ascending indices.
    scoreNPos = [(dUsableWords[term][0], num / norm)
                 for term, num in dThisPoint.iteritems()]
    scoreNPos.sort(key=lambda x: x[0])
    lPos, lVal = zip(*scoreNPos)
    return SparseVector(len(dUsableWords), list(lPos), list(lVal))
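# A minimal sketch of how megaVector might be wired into a pipeline,
# assuming posts are grouped by a spatial/temporal bin key first. The
# binKey helper, the grouping scheme, and the shape of dIDF below are
# illustrative assumptions, not taken from this file.
#
#   nTot = data.count()
#   bc_dIDF = sc.broadcast(dIDF)  # word -> tuple, indexed as above
#   vectors = (data.map(lambda p: (binKey(p), p))
#                  .groupByKey()
#                  .mapValues(lambda posts: megaVector(
#                      posts, bc_dIDF, bUseStopFilter,
#                      bc_lStopWords, nTot)))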
def locationBasedOutput(bByDate, jobNm, vecAndPredictions, sNum, fBinSize,
                        revLookup, bUseStopFilter, bc_lStopWords):
    """Write the top-scoring clusters to scoreFiles/<jobNm> as JSON.

    sNum <= 1. is interpreted as a fraction of the nonzero records to
    keep; sNum > 1. as an absolute count. fBinSize is the width of a
    lat/lon bin; revLookup maps vector positions back to words.
    """
    nNonZeros = vecAndPredictions.count()
    nToTake = int(nNonZeros * sNum)
    if sNum > 1.:
        nToTake = int(sNum)
    # Highest-scoring records first.
    outCol = vecAndPredictions.sortBy(lambda x: x[1], False).take(nToTake)
    if bByDate:
        # Keys look like "YYYYMMDD_lat_lon": group clusters by date.
        datesJson = {}
        for point in outCol:
            record = point[0][1]
            breakPoint = record[0].find("_", 9)
            offset = fBinSize / 2.
            # Shift the bin corner to the bin center, away from the origin.
            lat = float(record[0][9:breakPoint])
            lat = lat + offset if lat > 0 else lat - offset
            lon = float(record[0][breakPoint + 1:])
            lon = lon + offset if lon > 0 else lon - offset
            sdate = record[0][:8]
            sdate = sdate[0:4] + "-" + sdate[4:6] + "-" + sdate[6:]
            if sdate not in datesJson:
                datesJson[sdate] = {"clusters": []}
            thisCluster = {"nUnique": 5, "background": 100,
                           "nTotal": len(record[1]), "lon": lon, "lat": lat,
                           "date": sdate, "posts": [], "score": point[1]}
            # Recover the five highest-weighted terms for this cluster.
            labeledP = point[0][0]
            tups = zip(labeledP.features.values, labeledP.features.indices)
            thisDict = set(map(lambda x: revLookup[x[1]],
                               sorted(tups, key=lambda x: x[0],
                                      reverse=True)[:5]))
            thisCluster["dict"] = list(thisDict)
            # Keep only the posts that mention at least one top term.
            for post in record[1]:
                includePost = False
                for w in fspLib.wordBreak(post.text, bUseStopFilter,
                                          bc_lStopWords):
                    if w in thisDict:
                        includePost = True
                        break
                if includePost:
                    thisPost = {"sco": 1, "cap": post.text,
                                "lon": post.lon, "lat": post.lat,
                                "date": post.dt.strftime("%Y-%m-%d"),
                                "usr": post.user, "source": post.source,
                                "datetime": post.dt.strftime("%Y-%m-%d %H:%M:%S")}
                    thisCluster["posts"].append(thisPost)
            # Bounding box of the bin, corners listed clockwise.
            thisCluster["poly"] = [[lat + offset, lon + offset],
                                   [lat + offset, lon - offset],
                                   [lat - offset, lon - offset],
                                   [lat - offset, lon + offset]]
            datesJson[sdate]["clusters"].append(thisCluster)
        retDict = {"type": "event", "dates": datesJson}
    else:
        # Keys look like "lat_lon": one flat list of clusters.
        clusterList = []
        for point in outCol:
            record = point[0][1]
            breakPoint = record[0].find("_")
            offset = fBinSize / 2.
            lat = float(record[0][:breakPoint])
            lat = lat + offset if lat > 0 else lat - offset
            lon = float(record[0][breakPoint + 1:])
            lon = lon + offset if lon > 0 else lon - offset
            thisCluster = {"nUnique": 5, "background": 100,
                           "nTotal": len(record[1]), "lon": lon, "lat": lat,
                           "posts": [], "score": point[1]}
            labeledP = point[0][0]
            tups = zip(labeledP.features.values, labeledP.features.indices)
            thisDict = set(map(lambda x: revLookup[x[1]],
                               sorted(tups, key=lambda x: x[0],
                                      reverse=True)[:5]))
            thisCluster["dict"] = list(thisDict)
            # Cap the output at 100 matching posts per cluster.
            nPosts = 0
            for post in record[1]:
                includePost = False
                for w in fspLib.wordBreak(post.text, bUseStopFilter,
                                          bc_lStopWords):
                    if w in thisDict:
                        includePost = True
                        break
                if includePost:
                    nPosts += 1
                    thisPost = {"sco": 1, "cap": post.text,
                                "lon": post.lon, "lat": post.lat,
                                "date": post.dt.strftime("%Y-%m-%d"),
                                "usr": post.user, "source": post.source,
                                "datetime": post.dt.strftime("%Y-%m-%d %H:%M:%S")}
                    thisCluster["posts"].append(thisPost)
                if nPosts >= 100:
                    break
            thisCluster["poly"] = [[lat + offset, lon + offset],
                                   [lat + offset, lon - offset],
                                   [lat - offset, lon - offset],
                                   [lat - offset, lon + offset]]
            clusterList.append(thisCluster)
        retDict = {"type": "place", "clusters": clusterList}
    with codecs.open("scoreFiles/" + jobNm, encoding="utf-8",
                     mode="wb") as fOut:
        json.dump(retDict, fOut)
    return retDict
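# A minimal, self-contained sketch of the bin-key parsing used above,
# assuming the "YYYYMMDD_lat_lon" key format inferred from the slicing in
# locationBasedOutput. The helper name and the sample key are made up for
# illustration; they do not appear elsewhere in this file.
def _demoParseBinKey(sKey, fBinSize=0.05):
    breakPoint = sKey.find("_", 9)
    lat = float(sKey[9:breakPoint])
    lon = float(sKey[breakPoint + 1:])
    # Shift the bin corner to the bin center, away from the origin.
    offset = fBinSize / 2.
    lat = lat + offset if lat > 0 else lat - offset
    lon = lon + offset if lon > 0 else lon - offset
    sdate = sKey[0:4] + "-" + sKey[4:6] + "-" + sKey[6:8]
    return sdate, lat, lon

# _demoParseBinKey("20140704_38.85_-77.05")
#   -> ("2014-07-04", 38.875, -77.075)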
# Driver script: word-count pass used to build the IDF dictionary.
# jobNm, inputFile, dictFile, and args are expected to come from the
# script's argparse setup earlier in the file.
from datetime import date
from operator import add
import codecs

from pyspark import SparkConf, SparkContext

import fspLib

nDataType = 1
if args.datTyp:
    nDataType = args.datTyp
cNum = 1200
if args.cNum:
    cNum = args.cNum
bUseStopFilter = False

# Declare the Spark context.
conf = SparkConf().setAppName(jobNm)
sc = SparkContext(conf=conf)

# Accept only posts dated within this window.
lowerTime = date(2006, 3, 21)
upperTime = date(3000, 1, 1)
bc_lStopWords = None

# Parse the raw records, drop the unparseable, filter bad data, and
# count word occurrences across the remaining captions.
records = sc.textFile(inputFile, 100).coalesce(cNum)
data = records.map(lambda x: fspLib.loadRecord(x, nDataType)) \
              .filter(lambda x: x is not None)
df0 = data.filter(lambda x: fspLib.badData(x, bUseStopFilter,
                                           bc_lStopWords,
                                           lowerTime, upperTime))
wb = df0.flatMap(lambda x: fspLib.wordBreak(x['caption'],
                                            bUseStopFilter,
                                            bc_lStopWords))
wc = wb.map(lambda x: (x, 1)).reduceByKey(add)
wordAndCount = wc.collect()

# Write one "word<TAB>count" line per term.
with codecs.open("./idfFiles/" + dictFile, encoding="utf-8",
                 mode="w") as fDict:
    for cT in wordAndCount:
        fDict.write(u"\t".join([unicode(cT[0]), unicode(cT[1])]) + u"\n")
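# A minimal sketch of reading the word counts back in, assuming the
# tab-separated "word<TAB>count" layout written above. The loadWordCounts
# name is a hypothetical helper for illustration; how the rest of the
# pipeline actually rebuilds its IDF dictionary is not shown in this file.
def loadWordCounts(sPath):
    dCounts = {}
    with codecs.open(sPath, encoding="utf-8", mode="r") as fIn:
        for line in fIn:
            lFields = line.rstrip(u"\n").split(u"\t")
            if len(lFields) == 2:
                dCounts[lFields[0]] = int(lFields[1])
    return dCounts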