Example #1
from math import log10

from pyspark.mllib.linalg import SparseVector

import fspLib  # project-local text utilities (wordBreak, etc.)


def megaVector(ittRec, bc_dIDF, bUserStopFilter, bc_lStopWords, nTot):
    """Fold an iterator of posts into one IDF-weighted, normalized SparseVector."""
    dThisPoint = {}
    dUsableWords = bc_dIDF.value  # broadcast IDF dictionary
    norm = 0.0
    for post in ittRec:
        for w in fspLib.wordBreak(post.text, bUserStopFilter, bc_lStopWords):
            if w in dUsableWords and dUsableWords[w][0] != 0:
                val = log10((1. * nTot) / dUsableWords[w][0])  # IDF weight
                norm += val
                dThisPoint[w] = dThisPoint.get(w, 0.0) + val
    scoreNPos = []
    if norm == 0.0:
        return SparseVector(len(dUsableWords), [], [])
    for term, num in dThisPoint.items():
        scoreNPos.append((dUsableWords[term][0], num / norm))  # (position, weight)
    scoreNPos.sort(key=lambda x: x[0])  # SparseVector requires ascending indices
    lPos, lVal = zip(*scoreNPos)
    return SparseVector(len(dUsableWords), list(lPos), list(lVal))
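
megaVector consumes an iterator of posts, so it slots naturally onto grouped RDDs. A minimal driver sketch, assuming an RDD postsByBin of (binKey, post) pairs and an already-loaded dIDF dictionary; postsByBin, dIDF, and lStopWords are illustrative names, not from the source:

from pyspark import SparkConf, SparkContext

sc = SparkContext(conf=SparkConf().setAppName("vectorize"))
bc_dIDF = sc.broadcast(dIDF)              # dIDF: dict from the IDF job (assumed)
bc_lStopWords = sc.broadcast(lStopWords)  # lStopWords: stop-word list (assumed)
nTot = postsByBin.count()                 # total posts, for the IDF numerator
binVectors = postsByBin.groupByKey().mapValues(
    lambda posts: megaVector(posts, bc_dIDF, True, bc_lStopWords, nTot))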
Example #2
import codecs
import json

import fspLib  # project-local text utilities (wordBreak, etc.)


def locationBasedOutput(bByDate, jobNm, vecAndPredictions, sNum, fBinSize, revLookup, bUseStopFilter, bc_lStopWords):
    """Write the top-scoring bins to scoreFiles/<jobNm> as JSON and return the dict."""
    nNonZeros = vecAndPredictions.count()
    # sNum <= 1 is read as a fraction of all results, sNum > 1 as an absolute count.
    nToTake = int(nNonZeros * sNum)
    if sNum > 1.:
        nToTake = int(sNum)
    outCol = vecAndPredictions.sortBy(lambda x: x[1], False).take(nToTake)

    if bByDate:
        datesJson = {}
        for point in outCol:
            record = point[0][1]
            # Bin keys look like "YYYYMMDD_<lat>_<lon>"; breakPoint is the
            # separator between lat and lon.
            breakPoint = record[0].find("_", 9)
            offset = fBinSize / 2.  # shift from bin corner to bin center
            lat = float(record[0][9:breakPoint])
            if lat > 0:
                lat = lat + offset
            else:
                lat = lat - offset
            lon = float(record[0][breakPoint+1:])
            if lon > 0:
                lon = lon + offset
            else:
                lon = lon - offset
            sdate = record[0][:8]
            sdate = sdate[0:4]+"-"+sdate[4:6]+"-"+sdate[6:]
            if sdate not in datesJson:
                datesJson[sdate] = {"clusters": []}
            thisCluster = {"nUnique": 5, "background": 100, "nTotal": len(record[1]),
                           "lon": lon, "lat": lat, "date": sdate, "posts": [],
                           "score": point[1]}
            labeledP = point[0][0]
            # Recover the five highest-weighted terms from the sparse vector.
            tups = zip(labeledP.features.values, labeledP.features.indices)
            thisDict = set(map(lambda x: revLookup[x[1]],
                               sorted(tups, key=lambda x: x[0], reverse=True)[:5]))
            thisCluster["dict"] = list(thisDict)
            for post in record[1]:
                includePost = False
                for w in fspLib.wordBreak(post.text, bUseStopFilter, bc_lStopWords):
                    if w in thisDict:
                        includePost = True
                        break
                if includePost:
                    thisPost = {"sco":1,"cap":post.text,"lon":post.lon,"lat":post.lat,"date":post.dt.strftime("%Y-%m-%d"),"usr":post.user,"source":post.source, "datetime": post.dt.strftime("%Y-%m-%d %H:%M:%S")}
                    thisCluster["posts"].append(thisPost)
            thisCluster["poly"] = [[lat+offset,lon+offset],[lat+offset,lon-offset],[lat-offset,lon-offset],[lat-offset,lon+offset]]
            datesJson[sdate]["clusters"].append(thisCluster)

        retDict = {"type":"event", "dates":datesJson}
        with codecs.open("scoreFiles/"+jobNm, encoding="utf-8",mode="wb") as fOut:
            json.dump(retDict, fOut)
        return retDict
    else:
        clusterList = []
        for point in outCol:
            record = point[0][1]
            # Without dates, bin keys are just "<lat>_<lon>".
            breakPoint = record[0].find("_")
            offset = fBinSize / 2.
            lat = float(record[0][:breakPoint])
            if lat > 0:
                lat = lat + offset
            else:
                lat = lat - offset
            lon = float(record[0][breakPoint+1:])
            if lon > 0:
                lon = lon + offset
            else:
                lon = lon - offset
            thisCluster = {"nUnique":5,"background":100, "nTotal":len(record[1]), "lon":lon, "lat":lat, "posts":[], "score":point[1]}
            labeledP = point[0][0]
            tups = zip(labeledP.features.values, labeledP.features.indices)
            thisDict = set(map(lambda x: revLookup[x[1]], sorted(tups, key=lambda x: x[0], reverse=True)[:5]))
            thisCluster["dict"] = list(thisDict)
            nPosts = 0
            for post in record[1]:
                includePost = False
                for w in fspLib.wordBreak(post.text, bUseStopFilter, bc_lStopWords):
                    if w in thisDict:
                        includePost = True
                        break
                if includePost:
                    nPosts += 1
                    thisPost = {"sco": 1, "cap": post.text, "lon": post.lon,
                                "lat": post.lat, "date": post.dt.strftime("%Y-%m-%d"),
                                "usr": post.user, "source": post.source,
                                "datetime": post.dt.strftime("%Y-%m-%d %H:%M:%S")}
                    thisCluster["posts"].append(thisPost)
                if nPosts >= 100:  # cap the posts stored per cluster
                    break
            thisCluster["poly"] = [[lat+offset,lon+offset],[lat+offset,lon-offset],[lat-offset,lon-offset],[lat-offset,lon+offset]]
            clusterList.append(thisCluster)

        retDict = {"type":"place", "clusters":clusterList}
        with codecs.open("scoreFiles/"+jobNm, encoding="utf-8",mode="wb") as fOut:
            json.dump(retDict, fOut)

        return retDict
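
The element shape of vecAndPredictions is only implicit above. A minimal sketch of one element as the function consumes it; the literal values are made up for illustration:

from pyspark.mllib.linalg import SparseVector
from pyspark.mllib.regression import LabeledPoint

# Hypothetical element: ((LabeledPoint, (binKey, posts)), score), where
# binKey is "<lat>_<lon>", or "YYYYMMDD_<lat>_<lon>" when bByDate is True,
# and posts is the list of post records that fell into the bin.
lp = LabeledPoint(1.0, SparseVector(4, [1, 3], [0.6, 0.4]))
element = ((lp, ("40.75_-73.99", [])), 0.87)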
Example #3
File: findIDF.py Project: theseusyang/GEQE
# Module-level imports this excerpt relies on (shown here for completeness):
import codecs
from datetime import date
from operator import add

from pyspark import SparkConf, SparkContext

import fspLib  # project-local text utilities

    nDataType = 1
    if args.datTyp:
        nDataType = args.datTyp

    cNum = 1200
    if args.cNum:
        cNum = args.cNum

    bUseStopFilter = False
    #Declare Spark Context
    conf = SparkConf().setAppName(jobNm)
    sc = SparkContext(conf=conf)

    lowerTime = date(2006, 3, 21)  # date window applied by fspLib.badData
    upperTime = date(3000, 1, 1)
    bc_lStopWords = None

    rawText = sc.textFile(inputFile, 100).coalesce(cNum)  # avoid shadowing input()
    data = rawText.map(lambda x: fspLib.loadRecord(x, nDataType)).filter(lambda x: x is not None)
    df0 = data.filter(lambda x: fspLib.badData(x, bUseStopFilter, bc_lStopWords, lowerTime, upperTime))
    wb = df0.flatMap(lambda x: fspLib.wordBreak(x['caption'], bUseStopFilter, bc_lStopWords))
    wc = wb.map(lambda x: (x, 1)).reduceByKey(add)  # classic word count
    wordAndCount = wc.collect()
    with codecs.open("./idfFiles/" + dictFile, encoding="utf-8", mode="w") as fDict:
        for cT in wordAndCount:
            fDict.write("\t".join([str(cT[0]), str(cT[1])]) + "\n")
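
The tab-separated file written above is presumably what later jobs load as the broadcast IDF dictionary consumed by megaVector (Example #1). A minimal reader sketch; the tuple layout of the values is an assumption, not confirmed by this excerpt:

import codecs

def loadIDF(path):
    # Hypothetical loader: word -> (document count,), matching the
    # dUsableWords[w][0] access pattern in megaVector.
    dIDF = {}
    with codecs.open(path, encoding="utf-8", mode="r") as fIn:
        for line in fIn:
            word, count = line.rstrip("\n").split("\t")
            dIDF[word] = (int(count),)
    return dIDF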
