Example #1
#!/usr/bin/env python

import sys
from sets import Set

true_dups = Set()
reported_dups = Set()

if len(sys.argv) < 3:
    print "Usage: python check.py reported_duplicates true_duplicates"
    sys.exit(-1)

reported_duplicates_file = sys.argv[1]
true_duplicates_file = sys.argv[2]

with open(true_duplicates_file, "r") as inf:
    for line in inf:
        true_dups.add(line.strip())

with open(reported_duplicates_file, "r") as inf:
    for line in inf:
        reported_dups.add(line.strip())

tp, fp, fn = [0, 0, 0]

for pair in reported_dups:
    if pair in true_dups:
        tp += 1
    else:
        fp += 1
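The snippet is cut off after the true/false-positive count. A minimal sketch of how it could continue, assuming the remaining metrics are derived from the same two sets (a hypothetical continuation, not the original file); note that on modern Python the built-in set type replaces the long-deprecated sets.Set:

# Hypothetical continuation: false negatives are true duplicates that were never reported.
fn = len(true_dups - reported_dups)

precision = float(tp) / (tp + fp) if (tp + fp) > 0 else 0.0
recall = float(tp) / (tp + fn) if (tp + fn) > 0 else 0.0

print "tp=%d fp=%d fn=%d" % (tp, fp, fn)
print "precision=%.3f recall=%.3f" % (precision, recall)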
Example #2
#!/opt/bb/bin/python

import sys
from sets import Set

program = {}
counts = Set()
for line in sys.stdin:
    line = line.rstrip('\n').split(")")
    name = line[0].split()[0]
    weight = int(line[0].split("(")[1])
    words = line[1].split(", ")
    if words == ['']: words = []
    else: words[0] = words[0][4:]
    program[name] = {'weight': weight, 'words': words, 'total': 0}
    for w in words:
        counts.add(w)

base = ""

for p in program.keys():
    if p not in counts:
        base = p
        break

print base
problem = 0


def getWeight(name):
    p = program[name]
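The excerpt stops inside getWeight. A minimal sketch of a recursive total-weight helper over the program dictionary built above (an assumed continuation, not the original code):

# Assumed sketch: a program's total weight is its own weight plus the
# total weights of every program it supports.
def total_weight(name):
    p = program[name]
    return p['weight'] + sum(total_weight(child) for child in p['words'])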
Example #3
def core3(swLng, swLat, neLng, neLat, selectUid, rawkeywords, stime, etime):
    startTime = timer.time()
    timelist = []
    keywords = rawkeywords.split(',')

    def KM(k, p):
        category = POIDict[p]['category']
        if k == category:
            tf = 1
        else:
            return 0

        idf = FBCategoryPOICountDict[k]
        AT = FB_ATDict[k]
        score = (tf / float(idf)) * AT
        return score

    def cal_re_in(select_rid, tuples):
        Total_PATS = 0.0
        Total_Timescore = 0.0
        Total_SocialINF = 0.0
        Total_KM = 0.0
        AllPOI = []

        if len(tuples) <= POILength:
            return
        for t in tuples:
            Total_PATS += t[2]
            Total_Timescore += t[3]
            Total_SocialINF += t[4]
            pid = t[0]
            time = t[1]
            lat = POIDict[pid]['latitude']
            lng = POIDict[pid]['longitude']
            name = POIDict[pid]['name']
            category = POIDict[pid]['category']
            link = POIDict[pid]['link']
            likes = POIDict[pid]['likes']
            checkins = POIDict[pid]['checkins']

            for keyword in keywords:
                Total_KM += KM(keyword, pid)

            POITuple = {
                'pid': pid,
                'time': time,
                'coor': [lat, lng],
                'name': name,
                'category': category,
                'link': link,
                'likes': likes,
                'checkins': checkins
            }
            # POITuple = {
            # 'pid': pid, 'time': time, 'coor': [lat, lng], 'name': name}
            AllPOI.append(POITuple)

        rScore = 0.0

        rScore = Total_PATS + Total_Timescore + Total_SocialINF
        POICount = len(tuples)
        cover = 0.0

        avg_rScore = float(rScore) / POICount

        reconstructionFlag = True
        tup = uid, orignal_rid, select_rid, rScore, avg_rScore, 0, 0, 0, 0, 0, 0, 0, Total_PATS, Total_Timescore, Total_SocialINF, float(
            Total_PATS) / POICount, float(Total_Timescore) / POICount, float(
                Total_SocialINF) / POICount, 0, 0, AllPOI, Total_KM
        return tup

    def getMinAndMax(lats, longs):
        return str(min(lats)), str(min(longs)), str(max(lats)), str(max(longs))

    def splitIntoHeadDict(route):
        length = len(route)
        for i in xrange(length):
            if i == length - 1:
                break
            headID = route[i][0]
            tailID = route[i + 1][0]
            head = POIDataDict[headID]
            tail = POIDataDict[tailID]
            if head[0] == tail[0] or head[1] >= tail[1]:
                continue
            if head[0] not in splitDict:
                splitDict[head[0]] = Set([])
                splitDict[head[0]].add((head, tail))
            else:
                splitDict[head[0]].add((head, tail))

    def construct(pairSet):
        endflag = False

        global construct_c
        global reconstructionIdx

        for i in pairSet:

            if len(tempStack) == 0:
                tempStack.append(i[0])
                tempStack.append(i[1])
            else:
                idx = len(tempStack) - 1

                if i[1][1] > tempStack[idx][1] and i[1][0] != tempStack[idx][0]:
                    tempStack.append(i[0])
                    tempStack.append(i[1])
                else:
                    tempStack.append(i[0])
                    endflag = True

            headSet = splitDict.get(i[0][0], None)
            tailSet = splitDict.get(i[1][0], None)

            if tailSet != None and endflag == False:
                tempStack.pop()
                construct(tailSet)

                if len(tempStack) <= 1:
                    pass
                else:
                    x_p = []
                    tempStack_cpy = []
                    score = 0.0
                    for i in tempStack:
                        x_p.append(i[0])
                        tempStack_cpy.append(i)
                        score += (i[2] + i[3] + i[4])

                    if tuple(x_p) not in prefixSet:
                        #make prefix
                        prefix = []
                        length = len(x_p)
                        for i in xrange(length):
                            if i < length - 1:
                                prefix.append(x_p[i])
                                prefixSet.add(tuple(prefix))

                        if tuple(x_p) not in reconstructionInput_p:
                            if tuple(x_p) in orignalRouteScore:
                                if score > orignalRouteScore[tuple(x_p)]:
                                    r_rid = 'Reconstruct_' + str(
                                        reconstructionIdx)
                                    reconstructionOutput.append(
                                        cal_re_in(r_rid, tempStack))
                                    reconstructionIdx += 1
                                    reconstructionInput_p.add(tuple(x_p))
                            else:
                                r_rid = 'Reconstruct_' + str(reconstructionIdx)
                                reconstructionOutput.append(
                                    cal_re_in(r_rid, tempStack))
                                reconstructionIdx += 1
                                reconstructionInput_p.add(tuple(x_p))
                tempStack.pop()
            elif endflag == True:
                if len(tempStack) == 1:
                    pass
                else:
                    x_p = []
                    tempStack_cpy = []
                    score = 0.0
                    for i in tempStack:
                        x_p.append(i[0])
                        tempStack_cpy.append(i)
                        score += (i[2] + i[3] + i[4])

                    if tuple(x_p) not in prefixSet:
                        #make prefix
                        prefix = []
                        length = len(x_p)
                        for i in xrange(length):
                            if i < length - 1:
                                prefix.append(x_p[i])
                                prefixSet.add(tuple(prefix))

                        if tuple(x_p) not in reconstructionInput_p:
                            if tuple(x_p) in orignalRouteScore:
                                if score > orignalRouteScore[tuple(x_p)]:
                                    r_rid = 'Reconstruct_' + str(
                                        reconstructionIdx)
                                    reconstructionOutput.append(
                                        cal_re_in(r_rid, tempStack))
                                    reconstructionIdx += 1
                                    reconstructionInput_p.add(tuple(x_p))
                            else:
                                r_rid = 'Reconstruct_' + str(reconstructionIdx)
                                reconstructionOutput.append(
                                    cal_re_in(r_rid, tempStack))
                                reconstructionIdx += 1
                                reconstructionInput_p.add(tuple(x_p))

            else:
                if len(tempStack) == 1:
                    pass
                else:
                    x_p = []
                    tempStack_cpy = []
                    score = 0.0
                    for i in tempStack:
                        x_p.append(i[0])
                        tempStack_cpy.append(i)
                        score += (i[2] + i[3] + i[4])

                    if tuple(x_p) not in prefixSet:
                        #make prefix
                        prefix = []
                        length = len(x_p)
                        for i in xrange(length):
                            if i < length - 1:
                                prefix.append(x_p[i])
                                prefixSet.add(tuple(prefix))

                        if tuple(x_p) not in reconstructionInput_p:
                            if tuple(x_p) in orignalRouteScore:
                                if score > orignalRouteScore[tuple(x_p)]:
                                    r_rid = 'Reconstruct_' + str(
                                        reconstructionIdx)
                                    reconstructionOutput.append(
                                        cal_re_in(r_rid, tempStack))
                                    reconstructionIdx += 1
                                    reconstructionInput_p.add(tuple(x_p))
                            else:
                                r_rid = 'Reconstruct_' + str(reconstructionIdx)
                                reconstructionOutput.append(
                                    cal_re_in(r_rid, tempStack))
                                reconstructionIdx += 1
                                reconstructionInput_p.add(tuple(x_p))
                        for i in xrange(2):
                            tempStack.pop()
                            if len(tempStack) <= 1:
                                pass
                            else:
                                x_p = []
                                tempStack_cpy = []
                                score = 0.0
                                for i in tempStack:
                                    x_p.append(i[0])
                                    tempStack_cpy.append(i)
                                    score += (i[2] + i[3] + i[4])

                                if tuple(x_p) not in prefixSet:
                                    #make prefix
                                    prefix = []
                                    length = len(x_p)
                                    for i in xrange(length):
                                        if i < length - 1:
                                            prefix.append(x_p[i])
                                            prefixSet.add(tuple(prefix))

                                    if tuple(x_p) not in reconstructionInput_p:
                                        if tuple(x_p) in orignalRouteScore:
                                            if score > orignalRouteScore[tuple(
                                                    x_p)]:
                                                r_rid = 'Reconstruct_' + \
                                                    str(reconstructionIdx)
                                                reconstructionOutput.append(
                                                    cal_re_in(
                                                        r_rid, tempStack))
                                                reconstructionIdx += 1
                                                reconstructionInput_p.add(
                                                    tuple(x_p))
                                        else:
                                            r_rid = 'Reconstruct_' + \
                                                str(reconstructionIdx)
                                            reconstructionOutput.append(
                                                cal_re_in(r_rid, tempStack))
                                            reconstructionIdx += 1
                                            reconstructionInput_p.add(
                                                tuple(x_p))

    # def if_dominate(check, test):
    #     if check == test:
    #         return True
    #     for i in xrange(len(check)):
    #         if check[i] > test[i]:
    #             return True

    #     return False

    # def cal_dominate(input):
    #     check = input[0]
    #     inputd = input[1]
    #     all_dominate = True
    #     for test in inputd:
    #         if if_dominate(check[1], test[1]) == False:
    #             all_dominate = False
    #             return None

    #     return check

    pro = 0.1
    POILength = 2
    conn_string = "host='192.168.100.200' dbname='moonorblue' user='******' password='******'"
    conn = psycopg2.connect(conn_string)
    cur = conn.cursor()

    qByRegion = "SELECT poi,rid FROM fb_route WHERE geom && st_makeenvelope(" + str(
        swLng
    ) + "," + str(swLat) + "," + str(neLng) + "," + str(
        neLat
    ) + ",4326) AND st_area(geom) != 0 AND (st_area(st_intersection(geom,st_makeenvelope(" + str(
        swLng) + "," + str(swLat) + "," + str(neLng) + "," + str(
            neLat) + ",4326)))) != 0;"
    cur.execute(qByRegion)
    qByRegion_rows = [r for r in cur]
    timelist.append('Query:' + str(timer.time() - startTime))
    startTime = timer.time()
    orignalCategory = Set([])
    orignalPOI = Set([])

    minlong = float(swLng)
    minlat = float(swLat)
    maxlong = float(neLng)
    maxlat = float(neLat)

    uid = str(selectUid)
    orignal_rid = 0
    fids = RelationDict.get(uid, [])
    fids = Set(fids)

    reconstruction_start = timer.time()
    splitDict = {}

    reconstructionInput = []
    reconstructionOutput = []
    reconstructionOutputSet = Set([])
    global reconstructionIdx
    reconstructionIdx = 0

    scoreD = {}
    POIScoreDict = {}
    POIDataDict = {}
    c = 0
    qCount = 0
    for r in qByRegion_rows:
        # break if too much result
        if qCount > 10000:
            break
        POIs = eval(r[0])
        rid = r[1]
        localInput = []
        score = 0.0
        for POI in POIs:
            pid = POI['pid']

            latitude = POIDict[pid]['latitude']
            longitude = POIDict[pid]['longitude']
            if latitude >= minlat and latitude <= maxlat and longitude >= minlong and longitude <= maxlong:
                PATS = POI['PATS']
                timescore = POI['timeScore']
                socialINF = 0.0
                KMs = 0.0
                time = int(
                    datetime.datetime.fromtimestamp(float(
                        POI['time'])).strftime('%H')) + 8
                if time > 24:
                    time = time - 24
                category = POIDict[pid]['category']
                visiters = Set(POIDict[pid]['visiters'])
                # select social influence score
                for v in visiters:
                    if str(v) not in fids:
                        continue
                    new_u = FBsInfIdxDict[uid]
                    new_f = FBsInfIdxDict[v]
                    scores = FBsInfMatrix[new_u][new_f]
                    socialINF += float(scores)
                for keyword in keywords:
                    KMs += KM(keyword, pid)
                score += (PATS + timescore + socialINF + KMs)

                localInput.append(
                    (pid, time, PATS, timescore, socialINF, category))

                if pid not in POIScoreDict:
                    POIScoreDict[pid] = PATS + timescore + socialINF + KMs
                    POIDataDict[pid] = (pid, time, PATS, timescore, socialINF,
                                        category)
                else:
                    if (PATS + timescore + socialINF +
                            KMs) > POIScoreDict[pid]:
                        POIDataDict[pid] = (pid, time, PATS, timescore,
                                            socialINF, category)
            else:
                continue

        scoreD[c] = score
        c += 1
        qCount += 1
        reconstructionInput.append(tuple(localInput))
    timelist.append('POI:' + str(timer.time() - startTime))
    startTime = timer.time()
    sorted_scoreD = sorted(scoreD.items(),
                           key=operator.itemgetter(1),
                           reverse=True)
    limit = pro * len(sorted_scoreD)
    chosedInput = []
    reconstructionInput_p = []
    orignalRouteScore = {}

    for i in xrange(int(limit)):
        chosedInput.append(sorted_scoreD[i][0])

    for i in chosedInput:

        splitIntoHeadDict(reconstructionInput[i])
        t = []
        score = 0.0
        for x in reconstructionInput[i]:
            t.append(x[0])
            score += (x[2] + x[3] + x[4])
        orignalRouteScore[tuple(t)] = score

    reconstructionInput_p = Set(reconstructionInput_p)
    prefixSet = Set()

    for i in splitDict:
        tempStack = []
        construct_c = 0
        construct(splitDict[i])
    timelist.append('Construct:' + str(timer.time() - startTime))
    startTime = timer.time()
    routeList = []
    result = []
    qCoverTime = 0.0
    ScoringTime = 0.0
    ProcessTime = 0.0
    qCount = 0
    for row_r in qByRegion_rows:
        # break if too much result
        if qCount > 10000:
            break
        process_start_time = timer.time()
        select_rid = row_r[1]

        if orignal_rid == select_rid:
            continue

        cover = 0
        POIs = eval(row_r[0])
        if len(POIs) <= POILength:
            continue

        rScore = 0.0
        recommendCategory = Set([])
        socialFlag = False
        POICount = len(POIs)
        recommendPOI = Set([])
        scoring_start_time = timer.time()

        Total_PATS = 0.0
        Total_Timescore = 0.0
        Total_SocialINF = 0.0
        Total_KM = 0.0

        AllPOI = []
        for POI in POIs:
            pid = POI['pid']
            PATS = POI['PATS']
            timescore = POI['timeScore']
            time = int(
                datetime.datetime.fromtimestamp(float(
                    POI['time'])).strftime('%H')) + 8
            if time > 24:
                time = time - 24

            socialINF = 0.0

            latitude = POIDict[pid]['latitude']
            longitude = POIDict[pid]['longitude']

            if latitude >= minlat and latitude <= maxlat and longitude >= minlong and longitude <= maxlong:
                visiters = Set(POIDict[pid]['visiters'])
                # select social influence score
                for v in visiters:
                    if str(v) not in fids:
                        continue
                    new_u = FBsInfIdxDict[uid]
                    new_f = FBsInfIdxDict[v]
                    scores = FBsInfMatrix[new_u][new_f]
                    socialFlag = True
                    socialINF += float(scores)

                Total_PATS += PATS
                Total_Timescore += timescore
                Total_SocialINF += socialINF
                for keyword in keywords:
                    Total_KM += KM(keyword, pid)

                pScore = PATS + timescore + socialINF
                rScore += pScore

            name = POIDict[pid]['name']
            category = POIDict[pid]['category']
            link = POIDict[pid]['link']
            likes = POIDict[pid]['likes']
            checkins = POIDict[pid]['checkins']
            POITuple = {
                'pid': pid,
                'time': time,
                'coor': [latitude, longitude],
                'name': name,
                'category': category,
                'link': link,
                'likes': likes,
                'checkins': checkins
            }
            # POITuple = {'pid': pid, 'time': time , 'coor': [latitude, longitude], 'name': name}
            AllPOI.append(POITuple)

        avg_rScore = float(rScore) / POICount

        poi_hitCount = 0
        poiHit = 0
        editdistance = 0
        hitCount = 0
        categoryHit = 0
        ScoringTime = 0
        ProcessTime = 0

        reconstructionFlag = False
        tup = uid, orignal_rid, select_rid, rScore, avg_rScore, categoryHit, cover, poi_hitCount, poiHit, socialFlag, ScoringTime, ProcessTime, Total_PATS, Total_Timescore, Total_SocialINF, float(
            Total_PATS) / POICount, float(Total_Timescore) / POICount, float(
                Total_SocialINF
            ) / POICount, editdistance, reconstructionFlag, AllPOI, Total_KM
        result.append(tup)
        qCount += 1

    result += reconstructionOutput
    timelist.append('Scoring:' + str(timer.time() - startTime))
    startTime = timer.time()
    result = [d for d in result if d is not None]
    # pool_size = 8  # your "parallelness"

    # _pool = Pool(pool_size)
    # xxx = _pool.map(cal,qByRegion_rows)

    #remove subsequence
    # POISequence = []
    # for i in result:
    #     if i is not None:
    #         POIs = i[20]
    #         Seq = ''
    #         for POI in POIs:
    #             Seq += POI['pid']
    #             Seq += ','
    #         POISequence.append(Seq)

    # seqIdx = 0
    # for seq in POISequence:
    #     for seqq in POISequence:
    #         if seq == seqq:
    #             continue
    #         if seq in seqq:
    #             result[seqIdx] = None
    #             break
    #     seqIdx += 1

    # timelist.append('Remove subseq:'+str(timer.time()-startTime))
    # startTime = timer.time()
    result_new = []
    #time constraint here!!!
    if stime == 'Anytime' and etime == 'Anytime':
        for i in result:
            if i is not None:
                POIs = i[20]
                nTuple = i + (POIs, )
                result_new.append(nTuple)
        pass
    elif stime == 'Anytime':
        endTime = int(etime.replace(':00', ''))
        for i in result:
            if i is not None:
                POIs = i[20]
                newPOIs = []
                for POI in POIs:
                    time = POI['time']
                    if int(time) <= endTime:
                        newPOIs.append(POI)
                if len(newPOIs) > 1:
                    nTuple = i + (newPOIs, )
                    result_new.append(nTuple)
    elif etime == 'Anytime':
        startTime = int(stime.replace(':00', ''))
        for i in result:
            if i is not None:
                POIs = i[20]
                newPOIs = []
                for POI in POIs:
                    time = POI['time']
                    if int(time) >= startTime:
                        newPOIs.append(POI)
                if len(newPOIs) > 1:
                    nTuple = i + (newPOIs, )
                    result_new.append(nTuple)
    else:
        startTime = int(stime.replace(':00', ''))
        endTime = int(etime.replace(':00', ''))
        for i in result:
            if i is not None:
                POIs = i[20]
                newPOIs = []
                for POI in POIs:
                    time = POI['time']
                    if int(time) >= startTime and int(time) <= endTime:
                        newPOIs.append(POI)
                if len(newPOIs) > 1:
                    nTuple = i + (newPOIs, )
                    result_new.append(nTuple)
    timelist.append('Time:' + str(timer.time() - startTime))
    startTime = timer.time()

    skylineInputDict = {}
    skylineInputValue = []
    skylineInputDict_avg = {}
    skylineInputValue_avg = []
    idxCount = 0
    for t in result_new:
        rid = t[2]
        ScoringTime += t[10]
        ProcessTime += t[11]
        # skylineT = (t[12], t[13], t[14])
        skylineT_avg = (t[15], t[16], t[17], t[21])
        # skylineInputDict[(rid, skylineT)] = idxCount
        skylineInputDict_avg[(rid, skylineT_avg)] = idxCount
        # skylineInputValue.append((rid, skylineT))
        skylineInputValue_avg.append((rid, skylineT_avg))
        idxCount += 1

    # _pool = Pool(8)
    r = _pool.map(
        cal_dominate,
        itertools.izip(skylineInputValue_avg,
                       itertools.repeat(skylineInputValue_avg)))
    # r = [i for i in r if i is not None]
    resultData_avg = [
        result_new[skylineInputDict_avg[i]] for i in r if i is not None
    ]
    # pool_skyline.close()
    # pool_skyline.join()
    # r = []

    # for i in skylineInputValue_avg:
    #     all_dominate = True
    #     for j in skylineInputValue_avg:
    #         if if_dominate(i[1], j[1]) == False:
    #             all_dominate = False
    #             break
    #     if all_dominate:
    #         r.append(i)

    # resultData_avg = [result_new[skylineInputDict_avg[i]] for i in r]
    timelist.append('Skyline:' + str(timer.time() - startTime))
    startTime = timer.time()

    startTime = timer.time()
    #sorting by PATS
    sorted_by_PATS = sorted(result_new, reverse=True,
                            key=lambda tup: tup[12])[:len(resultData_avg)]
    #sorting by timescore
    sorted_by_timescore = sorted(result_new,
                                 reverse=True,
                                 key=lambda tup: tup[13])[:len(resultData_avg)]
    #sorting by socialINF
    sorted_by_socialINF = sorted(result_new,
                                 reverse=True,
                                 key=lambda tup: tup[14])[:len(resultData_avg)]
    #sorting by KM
    sorted_by_KM = sorted(result_new, reverse=True,
                          key=lambda tup: tup[21])[:len(resultData_avg)]

    resultPOI_skyline = [i[22] for i in resultData_avg]
    resultPOI_skyline_ori = [
        i[22] for i in resultData_avg if 'Reconstruct_' not in str(i[2])
    ]
    resultPOI_skyline_re = [
        i[22] for i in resultData_avg if 'Reconstruct_' in str(i[2])
    ]
    resultPOI_PATS = [i[22] for i in sorted_by_PATS]
    resultPOI_timescore = [i[22] for i in sorted_by_timescore]
    resultPOI_socialINF = [i[22] for i in sorted_by_socialINF]
    resultPOI_KM = [i[22] for i in sorted_by_KM]
    timelist.append('Sort:' + str(timer.time() - startTime))
    startTime = timer.time()
    return resultPOI_skyline, resultPOI_PATS, resultPOI_timescore, resultPOI_socialINF, resultPOI_skyline_ori, resultPOI_skyline_re, resultPOI_KM
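core3 calls cal_dominate through _pool.map even though both skyline helpers and the pool creation appear above only as commented-out code, so they are presumably defined at module level in the full source. A minimal module-level sketch reconstructed from those comments (an assumption about the surrounding module, not taken from it):

# Sketch of the module-level skyline helpers used by core3, following the
# commented-out definitions inside the function.
from multiprocessing import Pool

def if_dominate(check, test):
    # False only when check differs from test and is no better in any dimension.
    if check == test:
        return True
    for i in xrange(len(check)):
        if check[i] > test[i]:
            return True
    return False

def cal_dominate(input):
    # Keep a candidate only if it is not dominated by any other tuple.
    check, inputd = input
    for test in inputd:
        if not if_dominate(check[1], test[1]):
            return None
    return check

_pool = Pool(8)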
Example #4
def compareIndividualsNodeWise(truthList, testList, model1s, model2s, covs,
                               equivs):

    modeler = model1s[0]
    SDs = [0. for q in truthList]
    nodeSDs = []
    edgeSens, inDegrees, edgePPVs = [], [], []
    inCoV = []
    TPsum, TNsum, FPsum, FNsum = 0, 0, 0, 0
    for node in range(0, len(modeler.nodeList)):
        tempSD = 0.
        FP, TP, TN, FN = 0, 0, 0, 0
        # simplify rules at the node and find the edge-wise PPV, sens, and SDs
        inCovTemper = []
        for k in range(len(truthList)):
            inCovtemp = []
            # find start and end of this node in each model
            start1, end1 = findEnds(model1s[k], node, truthList[k])
            start2, end2 = findEnds(model2s[k], node, testList[k])

            # find the shadow and nodes for each model
            truthInEdges = findInEdges(model1s[k], node)
            testInEdges = findInEdges(model2s[k], node)

            # find the bitstring for just this node
            truth = truthList[k][start1:end1]
            test = testList[k][start2:end2]

            # simplify ground truth and recovered rules
            truth = simplifyRule(truth, truthInEdges)
            test = simplifyRule(test, testInEdges)

            # edit overall rule list with simplified rules
            testList[k][start2:end2] = test
            truthList[k][start1:end1] = truth

            # find SD, PPV, etc....
            truthSet = Set([])  # edges in correct rule
            testSet = Set([])  # edges in rule found
            baseSet = Set([])  # edges possible across all rules

            # find edges in true rule (and edges possible), average incoming coefficient of variation
            for i in range(0, len(truth)):
                if truth[i] == 1:
                    for nodeToAdd in model1s[k].andNodeList[node][i]:
                        truthSet.add(nodeToAdd)
                        inCovtemp.append(covs[k][node])
                for nodeToAdd in model1s[k].andNodeList[node][i]:
                    baseSet.add(nodeToAdd)
            # find edges in test (recovered) rule
            for i in range(0, len(test)):
                if test[i] == 1:
                    for nodeToAdd in model2s[k].andNodeList[node][i]:
                        testSet.add(nodeToAdd)
            # find structural distance at this node.
            SDs[k] = SDs[k] + len(truthSet.difference(testSet)) + len(
                testSet.difference(truthSet))
            tempSD = tempSD + len(truthSet.difference(testSet)) + len(
                testSet.difference(truthSet))
            # save edge-wise statistics for this node
            FP += 1. * len(testSet.difference(truthSet))
            TP += 1. * len(testSet.intersection(truthSet))
            FN += 1. * len(truthSet.difference(testSet))
            inCovTemper.append(numpy.mean(inCovtemp))
        # calculate and save overall edge-wise statistics
        if (TP + FN) > 0:
            sensitivity = 1. * TP / (TP + FN)
        else:
            sensitivity = 100
        if TP + FP > 0:
            PPV = 1. * TP / (TP + FP)
        else:
            PPV = 100
        nodeSDs.append(tempSD / len(truthList))
        edgeSens.append(sensitivity)
        edgePPVs.append(PPV)
        TPsum += TP
        FNsum += FN
        FPsum += FP
        inDegrees.append(len(baseSet))
        inCoV.append(numpy.mean(inCovTemper))
    if (TPsum + FNsum) > 0:
        edgeSens = 1. * TPsum / (TPsum + FNsum)
    else:
        edgeSens = 100
    if (FPsum + TPsum) > 0:
        edgePPV = 1. * TPsum / (FPsum + TPsum)
    else:
        edgePPV = 100

    nodeSens = []  # sensitivity by node
    nodePPV = []  # PPV by node
    nodeRTs = []  # rules true by node
    nodePsens = []
    nodepPPV = []
    nodelister = model1s[
        0].nodeList  # gives a node List (should be the same across all trials in a network...)
    sampleRTs = [[] for item in truthList]  # Rules True for each trial
    samplePPVs = [[] for item in truthList]  # PPV for each trial
    sampleSenss = [[] for item in truthList]  # Sens for each trial
    equivRTsens = [[] for item in truthList
                   ]  # RT sensitivity of equivalents for each trial
    equivSens = [[] for item in truthList
                 ]  # sensitivity for equivalents for each trial
    equivNodeRTsens = []
    equivNodeSens = []

    # iterate over all nodes in the network
    for node in range(len(nodelister)):
        rtTemp = []  # stores rules true for this node across all networks
        ones = []  # stores the number of false negative and rules
        zeros = []  # stores the number of  correct and rules
        negones = []  # stores the number of false positive and rules
        equivOnes = [
        ]  # stores the min number of false negatives across equivs
        equivZeros = []  # stores the max correct across equivs
        equivNegOnes = []  # stores the min false positives across equivs
        sumindividual = []  # total number true positive and rules
        equivRTsensNode = []
        equivSensNode = []

        #loop over individuals provided and calculate sens, PPV, rules true
        for i in range(len(truthList)):

            # find start and end of this node in each model
            start1, end1 = findEnds(model1s[i], node, truthList[i])
            start2, end2 = findEnds(model2s[i], node, testList[i])

            # find the values for just this node
            truth = truthList[i][start1:end1]
            test = testList[i][start2:end2]

            # set up empty lists for ands, edges, and the shadow and nodes associated with this node in each model
            truthAnds = []
            testAnds = []

            # get the set of all shadow and nodes that are actually used in each rule
            for j in range(len(model1s[i].andNodeList[node])):
                if truth[j] > 0:
                    truthAnds.append(tuple(model1s[i].andNodeList[node][j]))
            for j in range(len(model2s[i].andNodeList[node])):
                if test[j] > 0:
                    testAnds.append(tuple(model2s[i].andNodeList[node][j]))
            truthAnd = tuple(truthAnds)
            truthAnd = set(truthAnd)
            testAnd = set(tuple(testAnds))

            # get the set of all shadow and nodes used in each equivalent rule
            equivAnds = []
            # print(equivs[i])
            for test1 in equivs[i][node]:
                tempEquivAnd = []
                for j in range(len(model2s[i].andNodeList[node])):
                    if test1[j] > 0:
                        tempEquivAnd.append(
                            tuple(model2s[i].andNodeList[node][j]))
                testAnd1 = set(tuple(tempEquivAnd))
                equivAnds.append(testAnd1)
            RTequiv = 0.
            possibilityOnes = []
            possibilityZeros = []
            possibilityZNetones = []
            for testAnder1 in equivAnds:
                if (truthAnd == testAnder1):
                    RTequiv = 1.
                possibilityOnes.append(len(truthAnd.difference(testAnd)))
                possibilityZeros.append(len(truthAnd.intersection(testAnd)))
                possibilityZNetones.append(len(testAnd.difference(truthAnd)))
            # append results for this trial to all results
            maxpossibilityZeros = max(possibilityZeros)
            minpossiblityOnes = min(possibilityOnes)
            minpossibilityNegOnes = min(possibilityZNetones)
            equivOnes.append(minpossiblityOnes)
            equivZeros.append(maxpossibilityZeros)
            equivNegOnes.append(minpossibilityNegOnes)
            equivRTsensNode.append(RTequiv)
            equivRTsens[i].append(RTequiv)
            # calculate true positives, false positives, false negatives, and total slots for this node, trial and save
            onetemp = len(truthAnd.difference(testAnd))
            zerotemp = len(truthAnd.intersection(testAnd))
            negonetemp = len(testAnd.difference(truthAnd))
            sumindtemp = len(truthAnd)
            ones.append(onetemp)
            zeros.append(zerotemp)
            negones.append(negonetemp)
            sumindividual.append(sumindtemp)
            # add Rules true first sample-wise then node-wise
            if len(model1s[i].andNodeList[node]) > 1:
                if (truthAnd == testAnd):
                    sampleRTs[i].append(1.)
                else:
                    sampleRTs[i].append(0.)
                if (sumindtemp - onetemp + negonetemp) > 0:
                    samplePPVs[i].append(1. * (sumindtemp - onetemp) /
                                         (sumindtemp - onetemp + negonetemp))
                else:
                    samplePPVs[i].append(100)
                if (sumindividual[i]) > 0:
                    sampleSenss[i].append(1. * (sumindtemp - onetemp) /
                                          (sumindtemp))
                else:
                    sampleSenss[i].append(100)
            if (truthAnd == testAnd):
                rtTemp.append(1.)
            else:
                rtTemp.append(0.)

        nodeRTs.append(numpy.mean(rtTemp))  # node-wise Rules true added
        equivNodeRTsens.append(numpy.mean(equivRTsensNode))

        # calculate sensitivity for the node
        temp = [
            100 if sumindividual[i] == 0 else 1. *
            (sumindividual[i] - ones[i]) / (sumindividual[i])
            for i in range(0, len(ones))
        ]
        temp = filter(lambda a: a != 100, temp)
        if len(temp) == 0:
            sensitivity = 100
        else:
            sensitivity = (1. * numpy.sum(temp) / len(temp))

        # calculate max sensitivity for the node
        temp = [
            100 if sumindividual[i] == 0 else 1. *
            (sumindividual[i] - equivOnes[i]) / (sumindividual[i])
            for i in range(0, len(equivOnes))
        ]
        temp = filter(lambda a: a != 100, temp)
        if len(temp) == 0:
            psensitivity = 100
        else:
            psensitivity = (1. * numpy.sum(temp) / len(temp))
        nodePsens.append(psensitivity)

        # calculate PPV for the node
        temp = [
            100 if (sumindividual[i] - ones[i] +
                    negones[i]) == 0 else 1. * (sumindividual[i] - ones[i]) /
            (sumindividual[i] - ones[i] + negones[i])
            for i in range(0, len(ones))
        ]
        temp = filter(lambda a: a != 100, temp)
        if len(temp) == 0:
            PPV = 100
        else:
            PPV = (1. * numpy.sum(temp) / len(temp))

        # calculate PPV for the node
        temp = [
            100 if
            (sumindividual[i] - equivOnes[i] + equivNegOnes[i]) == 0 else 1. *
            (sumindividual[i] - equivOnes[i]) /
            (sumindividual[i] - equivOnes[i] + equivNegOnes[i])
            for i in range(0, len(equivOnes))
        ]
        temp = filter(lambda a: a != 100, temp)
        if len(temp) == 0:
            pPPV = 100
        else:
            pPPV = (1. * numpy.sum(temp) / len(temp))
        nodepPPV.append(pPPV)

        # add to list of sensitivity and PPV by
        nodeSens.append(sensitivity)
        nodePPV.append(PPV)
    sampleEquivRT = [
        1. * numpy.mean(filter(lambda a: a != 100, sampler))
        for sampler in equivRTsens
    ]  # Rules True for each trial
    sampleRT = [
        1. * numpy.mean(filter(lambda a: a != 100, sampler))
        for sampler in sampleRTs
    ]  # Rules True for each trial
    samplePPV = [
        1. * numpy.mean(filter(lambda a: a != 100, sampler))
        for sampler in samplePPVs
    ]  # PPV for each trial
    sampleSens = [
        1. * numpy.mean(filter(lambda a: a != 100, sampler))
        for sampler in sampleSenss
    ]  # Sens for each trial
    return sampleEquivRT, equivNodeRTsens, nodePsens, nodepPPV, sampleSens, samplePPV, nodeSens, nodePPV, sampleRT, nodeRTs, edgeSens, edgePPV, SDs, nodeSDs, len(
        modeler.nodeList), inDegrees, inCoV
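Throughout this function the value 100 is used as a sentinel for an undefined sensitivity or PPV (zero denominator) and is filtered back out before averaging. A tiny standalone sketch of that pattern with made-up numbers:

import numpy

def safe_ratio(num, denom):
    # 100 is the sentinel used above for "undefined" (zero denominator).
    return 1. * num / denom if denom > 0 else 100

values = [safe_ratio(3, 4), safe_ratio(0, 0), safe_ratio(1, 2)]
defined = filter(lambda a: a != 100, values)   # drop the sentinels
print (numpy.mean(defined) if len(defined) > 0 else 100)   # 0.625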
Example #5
    def write_data_matrix(self,
                          data_matrix,
                          output_fname,
                          strain_id_list,
                          snp_id_list,
                          snp_id2acc,
                          with_header_line,
                          nt_alphabet,
                          strain_id2acc=None,
                          strain_id2category=None,
                          rows_to_be_tossed_out=Set(),
                          strain_id2other_info=None,
                          discard_all_NA_strain=0,
                          predefined_header_row=[
                              'strain', 'duplicate', 'latitude', 'longitude',
                              'nativename', 'stockparent', 'site', 'country'
                          ]):
        """
		2008-05-08
			defunct use write_data_matrix from pymodule
		2007-02-19
			if strain_id2acc is available, translate strain_id into strain_acc,
			if strain_id2category is available, add 'category'
		2007-02-25
			if one strain's SNP row is all NA, it'll be skipped
		2007-02-25
			add argument rows_to_be_tossed_out
		2007-09-23
			add discard_all_NA_strain
		2007-10-22
			add no_of_all_NA_rows
		2007-12-13
			add predefined_header_row
		2007-12-16
			add 'duplicate' into predefined_header_row
		"""
        sys.stderr.write("Writing data_matrix ...")
        no_of_all_NA_rows = 0
        writer = csv.writer(open(output_fname, 'w'), delimiter='\t')
        if with_header_line:
            header_row = [predefined_header_row[0]]
            if strain_id2category:
                header_row.append(predefined_header_row[1])
            if strain_id2other_info:
                no_of_fields = len(
                    strain_id2other_info.values()[0])  #2007-12-13
                for i in range(no_of_fields):
                    header_row.append(predefined_header_row[2 + i])
            for snp_id in snp_id_list:
                header_row.append(snp_id2acc[snp_id])
            writer.writerow(header_row)
        for i in range(len(data_matrix)):
            if strain_id2acc:
                new_row = [strain_id2acc[strain_id_list[i]]]
            else:
                new_row = [strain_id_list[i]]
            if strain_id2category:
                new_row.append(strain_id2category[strain_id_list[i]])
            if strain_id2other_info:
                new_row += strain_id2other_info[strain_id_list[i]]
            if discard_all_NA_strain and sum(
                    data_matrix[i] == 0) == data_matrix.shape[1]:
                no_of_all_NA_rows += 1
                continue
            elif i not in rows_to_be_tossed_out:  #2007-02-25
                for j in data_matrix[i]:
                    if nt_alphabet:
                        j = number2nt[j]
                    new_row.append(j)
                writer.writerow(new_row)
        del writer
        sys.stderr.write("%s all NA rows ." % no_of_all_NA_rows)
        sys.stderr.write("Done.\n")
Example #6
        self.href = Href(str(item.uri))
        subject = str(model.get_target(item, dc.subject))
        self.tags = Set([])
        if subject is not None:
            self.tags = Set([Tag(x) for x in subject.split(" ")])
        if tag is not None:
            self.tags.add(tag)


if __name__ == '__main__':
    import sys
    from sets import Set

    username = sys.argv[1]
    user = User(username)
    tags = Set()
    users = {}

    print("Reading " + username + " posts...")
    for post in user:
        for tag in post.tags:
            tags.add(tag)

        other_tags = Set()
        count = 0
        for other_post in post.href:
            u = other_post.user
            if not u == user:
                count += 1
                if u not in users:
                    users[u] = []
Example #7
import re
from json_utils import load_json
import operator
import os
import pprint
from sets import Set
from itertools import islice

def take(n, iterable):
    return list(islice(iterable, n))

allowed_chars = Set('0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ_')

master_count = {}

def add_hit(channel, hit_type, hit):
    if not master_count.get(channel):
        master_count[channel] = {
            'emojis': {},
            'emojis_reactions': {},
        }

    if not master_count.get(channel).get(hit_type).get(hit):
        master_count[channel][hit_type][hit] = 0

    master_count[channel][hit_type][hit] += 1


def filter_emojis(text):
    return Set(text.replace(':', '')).issubset(allowed_chars) and len(text) > 2
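A short usage sketch of the helpers above (assuming they are in scope; the channel and emoji names are made up):

# Hypothetical usage of filter_emojis and add_hit defined above.
print filter_emojis(':thumbsup:')   # True  - only allowed characters, long enough
print filter_emojis(':-)')          # False - '-' and ')' are not allowed
print filter_emojis('::')           # False - too short

add_hit('general', 'emojis', 'thumbsup')
add_hit('general', 'emojis', 'thumbsup')
print master_count['general']['emojis']['thumbsup']   # 2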
Example #8
def puzzle(grid_size, old_grid_size, nr_words):
    print "NR WORDS", nr_words
    iter = 0
    png_nr = 0
    # log_file = open(output_directory + r"\log.txt", "w")
    #sample words to follow
    follow_inds = Set()
    while len(follow_inds) < nr_words_to_follow:
        follow_inds.add(random.randrange(nr_words))
    follow_inds = list(follow_inds)

    # log_file.write("Indexes:")
    # for key, value in global_index.iteritems():
    # log_file.write(str(key)+" " + value.name + " " + str(value.id) + "\n")

    print "\n========\nSTART PUZZLING\n===========\n"
    log_file = open(log_file_n, 'a')
    log_file.write("\n\n========\nSTART PUZZLING\n===========\n\n")
    log_file.close()
    trial_nr = 0
    nr_inits = 0
    elem_indexes = range(nr_words)
    grid_size = int(grid_size)
    while not stop_condition(trial_nr):
        for i in range(nr_trials_check):
            if trial_nr % 5 == 0:
                print "\nTRIAL", trial_nr, datetime.datetime.now()
                log_file = open(log_file_n, 'a')
                log_file.write("TRIAL " + str(trial_nr) + " " +
                               str(datetime.datetime.now()) + "\n")
                log_file.close()
            # check if you need to reinitialize
            if iter % (nr_words * nr_trials_re_init) == 0 or iter == 0:
                nr_inits += 1
                print "\ninit closest trial", trial_nr, "     start:", datetime.datetime.now(
                )
                log_file = open(log_file_n, 'a')
                log_file.write("init closest at " +
                               str(datetime.datetime.now()) + "\n")
                log_file.close()
                init_closest(grid_size, old_grid_size, iter == 0)
                print "init closest stop:", datetime.datetime.now()
                log_file = open(log_file_n, 'a')
                log_file.write("stop init closest at " +
                               str(datetime.datetime.now()) + "\n")
                log_file.close()
                # log_file.write("INITIALIZED\n\n")
                # print_all_lists(str(trial_nr))
            if iter == 0:
                stats_to_file("FIRST", trial_nr, follow_inds, nr_inits,
                              grid_size, png_nr)
                png_nr += 1
            # pick random element
            random.shuffle(elem_indexes)
            for elem_i in elem_indexes:
                [x, y] = list(global_index[elem_i].pos)

                if iter % 5000 == 0:
                    print "iter", iter,

                swap_value = float("-inf")
                # check with which neighbor it wants to swap
                for dx in range(neighbor_range_swap[0],
                                neighbor_range_swap[1]):
                    for dy in range(neighbor_range_swap[0],
                                    neighbor_range_swap[1]):
                        # print x, dx, y, dy
                        if x + dx >= 0 and x + dx < grid_size and y + dy >= 0 and y + dy < grid_size:
                            # check grid elem != none
                            # print "in check"
                            if grid_f[x + dx][y + dy] != None:
                                v = grid_f[x][y].get_improvement(
                                    x + dx, y + dy) + grid_f[x + dx][
                                        y + dy].get_improvement(x, y)
                            else:
                                v = grid_f[x][y].get_improvement(
                                    x + dx, y + dy)
                            if v > swap_value:
                                # process swap value
                                swap_value = v
                                swap_x = x + dx
                                swap_y = y + dy

                if swap_value > 0:
                    xy = grid_f[x][y]
                    xy_swap = grid_f[swap_x][swap_y]
                    grid_f[x][y] = xy_swap
                    grid_f[swap_x][swap_y] = xy
                    xy.change_pos(swap_x, swap_y)
                    if xy_swap != None:
                        xy_swap.change_pos(x, y)
                elif swap_value == float("-inf"):
                    print "-inf"

                iter += 1

            # figures and stats to file
            if trial_nr % to_file_trials == 0 and trial_nr != 0:
                stats_to_file(iter, trial_nr, follow_inds, nr_inits, grid_size,
                              png_nr)
                png_nr += 1
            trial_nr += 1
    stats_to_file("LAST", trial_nr, follow_inds, nr_inits, grid_size, png_nr)
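The core of puzzle() is a greedy local search: for every element it scores a swap with each neighbour and performs the best swap only when the improvement is positive. A self-contained toy sketch of that step, with a made-up objective instead of the word-placement score used above:

import random

# Toy grid: each cell holds a number; the made-up objective is for values to be
# close to their row index, and a swap happens only when it lowers total cost.
grid_size = 4
grid = [[random.randrange(grid_size) for _ in range(grid_size)]
        for _ in range(grid_size)]

def improvement(x, y, nx, ny):
    before = abs(grid[x][y] - x) + abs(grid[nx][ny] - nx)
    after = abs(grid[x][y] - nx) + abs(grid[nx][ny] - x)
    return before - after

for x in range(grid_size):
    for y in range(grid_size):
        best_value, best_pos = 0, None
        for dx in (-1, 0, 1):
            for dy in (-1, 0, 1):
                nx, ny = x + dx, y + dy
                if (dx, dy) != (0, 0) and 0 <= nx < grid_size and 0 <= ny < grid_size:
                    v = improvement(x, y, nx, ny)
                    if v > best_value:
                        best_value, best_pos = v, (nx, ny)
        if best_pos is not None:
            nx, ny = best_pos
            grid[x][y], grid[nx][ny] = grid[nx][ny], grid[x][y]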
Example #9
            hg.interval_list([i[0]]).intersection(amplist) +
            hg.interval_list([i[0]]).intersection(rdList)) > 0
    ])
    rdList = hg.interval_list([
        hg.interval(i.chrom, max(0, i.start - 10000),
                    min(i.end + 10000, hg.chrLen[hg.chrNum(i.chrom)]))
        for i in rdList
    ])
    iout = open(outName + '.integration_search.out', 'w')
    iout.write(mystdout.getvalue())
    iout.close()
    sys.stdout = old_stdout

irdhops = []
irddict = {}
irdSets = Set([Set([ird]) for ird in rdList])
irdgroupdict = {ird: Set([ird]) for ird in rdList}
if args.extendmode == 'EXPLORE' or args.extendmode == 'VIRAL':
    for ird in rdList:
        logging.info("#TIME " + '%.3f\t' % (time() - TSTART) +
                     "Exploring interval: " + str(ird))
        old_stdout = sys.stdout
        sys.stdout = mystdout = StringIO()
        ilist = bamFileb2b.interval_hops(ird)
        irdhops.append((ird, ilist))
        for i in ilist:
            irddict[i] = ird
        iout = open(
            outName + '.' + ird.chrom + ":" + str(ird.start) + '-' +
            str(ird.end) + '.out', 'w')
        iout.write(mystdout.getvalue())
Example #10
def check_one_file(a, d1, dx1, FS, threshold, file_input, view_strings=False, new=True, library=True):
    d2 = None
    ret_type = androconf.is_android( file_input )
    if ret_type == "APK":
        a = apk.APK( file_input )
        d2 = dvm.DalvikVMFormat( a.get_dex() )
    elif ret_type == "DEX":
        d2 = dvm.DalvikVMFormat( read(file_input) )

    if d2 is None:
        return
    dx2 = analysis.VMAnalysis( d2 )

    el = elsim.Elsim( ProxyDalvik(d1, dx1), ProxyDalvik(d2, dx2), FS, threshold, options.compressor, libnative=library )
    el.show()
    print "\t--> methods: %f%% of similarities" % el.get_similarity_value(new)


    if options.dump:
        print '\nDumping smali code...'
        tmp1 = options.input[1].split('/')
        jarname = tmp1[len(tmp1)-1]
        if not os.path.exists('smali'):
            os.makedirs('smali')
        os.system('apktool d ' + options.input[1])
        if jarname[len(jarname)-4:len(jarname)] == '.apk':
            os.system('mv -f ' + jarname[0:len(jarname)-4] + ' smali')
        else:
            os.system('mv -f ' + jarname + '.out ' + 'smali')


        classes = Set([])
        diff_methods = el.get_similar_elements()
        for i in diff_methods:
            x = el.show_similar_class_name( i )
            for j in range(0, len(x)):
                classes.add(x.pop())

        new_methods = el.get_new_elements()
        for i in new_methods:
            y = el.show_new_class_name( i )
            classes.add(y)

        if not os.path.exists('codedump'):
            os.makedirs('codedump')
        os.chdir('codedump')

        if os.path.exists(jarname):
            os.system('rm -rf ' + jarname)
        os.makedirs(jarname)
        os.chdir('..')
        for i in range(0,len(classes)):
            #os.makedirs('codedump/' + jarname)
            filepath = classes.pop()
            filename = filepath.replace('/','.')
            shutil.copy2('smali/' + jarname + '.out/smali/' + filepath, 'codedump/' + jarname + '/' + filename)
        os.system('rmdir codedump/' + jarname)

        classes1 = Set([])
        for i in diff_methods:
            x = el.show_similar_method_name( i )
            for j in range(0, len(x)):
                classes1.add(x.pop())
        for i in new_methods:
            y = el.show_new_method_name( i )
            classes1.add(y)
        start = ''
        end = '.end method'
        if not os.path.exists('methoddump'):
            os.makedirs('methoddump')
        
        for i in range(0,len(classes1)):
            x1 = classes1.pop()
            xx = x1.split(' ', 1)
            if not os.path.exists('methoddump/' + jarname):
                os.makedirs('methoddump/' + jarname)
            with open('codedump/' + jarname + '/' + xx[0]) as infile:
                for line in infile:
                    if xx[1] in line:
                        start = line.replace('\n','')
                        break
            med = xx[1].split('(', 1)[0]
            with open('codedump/' + jarname + '/' + xx[0]) as infile, open('methoddump/' + jarname + '/' + xx[0] + '.' + med + '.method', 'w+') as outfile:
                copy = False
                outfile.write(start + '\n')
                for line1 in infile:                    
                    if line1.strip() == start:
                        copy = True
                    elif line1.strip() == end:
                        copy = False
                    elif copy:
                        outfile.write(line1)
                outfile.write(end)

        print 'DUMP SMALI CODE SUCCESSFULLY.'


    if options.display:
        print "SIMILAR methods:"
        diff_methods = el.get_similar_elements()
        for i in diff_methods:
            el.show_element( i )

        print "IDENTICAL methods:"
        new_methods = el.get_identical_elements()
        for i in new_methods:
            el.show_element( i )

        print "NEW methods:"
        new_methods = el.get_new_elements()
        for i in new_methods:
            el.show_element( i, False )

        print "DELETED methods:"
        del_methods = el.get_deleted_elements()
        for i in del_methods:
            el.show_element( i )

        print "SKIPPED methods:"
        skipped_methods = el.get_skipped_elements()
        for i in skipped_methods:
            el.show_element( i )

    if view_strings:
        els = elsim.Elsim( ProxyDalvikStringMultiple(d1, dx1),
                           ProxyDalvikStringMultiple(d2, dx2),
                           FILTERS_DALVIK_SIM_STRING,
                           threshold,
                           options.compressor,
                           libnative=library )
        #els = elsim.Elsim( ProxyDalvikStringOne(d1, dx1),
        #    ProxyDalvikStringOne(d2, dx2), FILTERS_DALVIK_SIM_STRING, threshold, options.compressor, libnative=library )
        els.show()
        print "\t--> strings: %f%% of similarities" % els.get_similarity_value(new)

        if options.display:
          print "SIMILAR strings:"
          diff_strings = els.get_similar_elements()
          for i in diff_strings:
            els.show_element( i )

          print "IDENTICAL strings:"
          new_strings = els.get_identical_elements()
          for i in new_strings:
            els.show_element( i )

          print "NEW strings:"
          new_strings = els.get_new_elements()
          for i in new_strings:
            els.show_element( i, False )

          print "DELETED strings:"
          del_strings = els.get_deleted_elements()
          for i in del_strings:
            els.show_element( i )

          print "SKIPPED strings:"
          skipped_strings = els.get_skipped_elements()
          for i in skipped_strings:
            els.show_element( i )
Example #11
from math import sqrt


def sod(n):
    # Sum of the proper divisors of n.
    total = 1
    for i in range(2, int(sqrt(n)) + 1):
        if n % i == 0:
            total += i
            if i != n / i:
                total += n / i
    return total


l = [0 for i in range(10000)]
ans = []
for i in range(1, 10000):
    l[i] = sod(i)
    if (l[i] < i):
        if (l[l[i]] == i):
            ans.append(i)
            ans.append(l[i])
from sets import Set
ans = Set(ans)
print ans
x = sum(ans)
print x
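As a quick sanity check, 220 and 284 form the classic amicable pair, so both should appear in ans (well-known values, not output copied from a run):

print sod(220), sod(284)      # 284 220
print 220 in ans, 284 in ans  # True True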
Example #12
    def get_years(self):
        years = Set()
        incidents = Incident.objects.all()
        for inc in incidents:
            years.add(inc.year)
        return years
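Since Incident looks like a Django model, the same set of distinct years can usually be pulled straight from the database instead of looping over every row (a sketch assuming year is a regular model field; the loop above is the original author's approach):

    def get_years_db(self):
        # Hypothetical alternative: same result, computed by the database.
        return set(Incident.objects.values_list('year', flat=True).distinct())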
Example #13
def deaccent(s):
    return s \
      .replace(u'ά', u'α') \
      .replace(u'έ', u'ε') \
      .replace(u'ή', u'η') \
      .replace(u'ί', u'ι') \
      .replace(u'ό', u'ο') \
      .replace(u'ύ', u'υ') \
      .replace(u'ώ', u'ω') \
      .replace(u'ς', u'σ')


import os
crawlerdir = os.environ['CRAWLERDIR']

expletives = Set()
with open(crawlerdir + "greekdata/expletives", "r") as f:
    for line in f:
        expletives.add(deaccent(unicode(line, 'utf-8').strip().lower()))
articles = Set()
with open(crawlerdir + "greekdata/articles", "r") as f:
    for line in f:
        articles.add(deaccent(unicode(line, 'utf-8').strip().lower()))
pronouns = Set()
with open(crawlerdir + "greekdata/pronouns", "r") as f:
    for line in f:
        pronouns.add(deaccent(unicode(line, 'utf-8').strip().lower()))
locations = Set()
with open(crawlerdir + "greekdata/locations", "r") as f:
    for line in f:
        locations.add(deaccent(unicode(line, 'utf-8').strip().lower()))
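A brief usage sketch of deaccent together with the loaded word sets (the Greek word is only an illustration):

# Illustrative only: words are deaccented and lowercased before any lookup.
word = deaccent(u'καλημέρα'.strip().lower())
print word                # καλημερα
print word in expletives  # membership is tested on the normalized form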
Example #14
def feat_extr_ngram(row_id_str, hdfs_dir_list, hdfs_feat_dir, model_data_folder
    , sp_master, spark_rdd_compress, spark_driver_maxResultSize, sp_exe_memory, sp_core_max
    , zipout_dir, zipcode_dir, zip_file_name 
    , mongo_tuples, fromweb, label_arr, metadata_count,label_idx,data_idx, pattern_str, ln_delimitor, data_field_list, jkey_dict
    , jobname, num_gram, feature_count_threshold, token_dict=None, HDFS_RETR_DIR=None, remove_duplicated="N"
    , cust_featuring=None, cust_featuring_params=None, local_out_dir=None, filter_ratio=None
    ): 

    # zip func in other files for Spark workers ================= ================
    zip_file_path=ml_util.ml_build_zip_file(zipout_dir, zipcode_dir, zip_file_name, user_custom=cust_featuring)
    # get_spark_context
    sc=ml_util.ml_get_spark_context(sp_master
        , spark_rdd_compress
        , spark_driver_maxResultSize
        , sp_exe_memory
        , sp_core_max
        , jobname
        , [zip_file_path]) 
    # log time ================================================================ ================
    t0 = time()

    # input filename
    input_filename="*"
    ext_type='.gz'
    gz_list=None
    convert2dirty="N"
    if not ',' in hdfs_dir_list: # single dir having *.gz ==== =========
        # read raw data from HDFS as .gz format ========== 
        rdd_files=os.path.join(hdfs_dir_list, input_filename+ext_type)
        # check if gz files in hdfs ============
        try:
            gz_list=hdfs.ls(hdfs_dir_list)
            print "INFO: check hdfs folder=",hdfs_dir_list

        except IOError as e:
            print "WARNING: I/O error({0}): {1}".format(e.errno, e.strerror)
        except:
            print "WARNING: Error at checking HDFS file:", sys.exc_info()[0]     
        # use whole folder
        if gz_list is None or len(gz_list)==0:
            rdd_files=hdfs_dir_list
            print "ERROR: No file found by ",input_filename+ext_type #,", use",hdfs_dir_list,"instead"    
            return -2
    else: # multiple dirs ==== =========
        rdd_files=""
        cnt=0
        temp_lbl_list=[]
        comma=""
        print "INFO: before label_arr=",label_arr
        
        # check each folder
        for dr in hdfs_dir_list.split(','):
            print "****=",dr
            if not len(dr)>0:
                continue
            try:
                # remove space etc.
                dr=dr.strip()
                fdr=os.path.join(HDFS_RETR_DIR, dr)
                print "fdr=",fdr
                # ls didn't like "*"
                if '*' in fdr:
                    #gz_list=hdfs.ls(fdr.replace("*",""))
                    dn=os.path.dirname(fdr).strip()
                    bn=os.path.basename(fdr).strip()
                    #print "dn=",dn,",bn=",bn
                    # get all names under folder and do filtering
                    gz_list=fnmatch.filter(hdfs.ls(dn), '*'+bn)
                    #print "gz_list=",gz_list
                else:
                    gz_list=hdfs.ls(fdr)
                cnt=cnt+len(gz_list)
                
                if len(gz_list)>0:
                    rdd_files=rdd_files+comma+fdr
                    comma=","
            except IOError as e:
                print "WARNING: I/O error({0}): {1}".format(e.errno, e.strerror)
            except:
                print "WARNING: Error at checking HDFS file:", sys.exc_info()[0]     
        # use whole folder
        if cnt is None or cnt==0:
            print "ERROR: No file found at",rdd_files
            return -2
        else:
            print "INFO: total file count=",cnt
        # set convert flag only when multiple dir and label_arr has dirty label
        #if label_arr is None: # create label arr if None
        #    label_arr=temp_lbl_list
        if not label_arr is None and len(label_arr)==2 and label_arr[1]=="dirty":
            convert2dirty="Y"
    print "INFO: rdd_files=",rdd_files

    txt_rdd=sc.textFile(rdd_files)#, use_unicode=False
    
    total_input_count=txt_rdd.count()
    print "INFO: Total input sample count=",total_input_count
    # debug only
    #for x in txt_rdd.collect():
    #    print "t=",x
    print "INFO: hdfs_dir_list=",hdfs_dir_list
    print "INFO: label_arr=",label_arr
    print "INFO: feature_count_threshold=",feature_count_threshold
    
    #jkey_dict={"meta_list":["label","md5","mdate"], "data_key":"logs"}
    #   this dict depends on the format of input data
    if not data_field_list is None:
        jkey_dict=json.loads(jkey_dict)
        
        data_key=jkey_dict["data_key"]
        meta_list=jkey_dict["meta_list"]
        
        metadata_count=len(meta_list)
        data_idx=metadata_count
        print "INFO: jkey_dict=",jkey_dict
        print "INFO: meta_list=",meta_list
        print "INFO: data_key=",data_key
        print "INFO: data_field_list=",data_field_list
        print "INFO: metadata_count=",metadata_count

        featured_rdd = txt_rdd \
            .map(lambda x: preprocess_json(x,meta_list,data_key,data_field_list)) \
            .filter(lambda x: len(x) > metadata_count) \
            .filter(lambda x: type(x[metadata_count]) is list) \
            .map(lambda x: feature_extraction_ngram(x, data_idx, MAX_FEATURES, num_gram)) \
            .filter(lambda x: len(x) > metadata_count) \
            .filter(lambda x: type(x[metadata_count]) is dict) \
            .filter(lambda x: type(x[metadata_count+1]) is dict) \
            .filter(lambda x: len(x[metadata_count])> int(feature_count_threshold) ) \
            .cache()
            
        #print "INFO: featured_rdd="
        #for x in featured_rdd.collect():
        #    print "INFO: **** f=",x
    # user custom code for featuring  ============================================= ==========
    #   input txt_rdd format (string):  each text row for each sample 
    #   output featured_rdd format (list):[meta-data1,meta-data2,..., hash_cnt_dic, hash_str_dic]
    elif not cust_featuring is None and len(cust_featuring)>0:
        user_module=None
        user_func=None
        user_func_dnn=None
        # load user module =======
        try:
            modules = map(__import__, [CUSTOM_PREFIX+cust_featuring])
            user_module=modules[0]
            user_func=getattr(user_module,CUSTOM_FUNC)
        except Exception as e:
            print "ERROR: user module error.", e.__doc__, e.message
            return -101
        # prepare for dnn, output as feat in an array    
        tmp_rdd = txt_rdd.map(lambda x: user_func(x, cust_featuring_params)) \
            .filter(lambda x: len(x) > metadata_count) \
            .filter(lambda x: type(x[metadata_count]) is list).cache()
        # for traditional ML, feat in a dict 
        featured_rdd = tmp_rdd \
            .map(lambda x: feature_extraction_ngram(x, data_idx, MAX_FEATURES, num_gram)) \
            .filter(lambda x: len(x) > metadata_count) \
            .filter(lambda x: type(x[metadata_count]) is dict) \
            .filter(lambda x: type(x[metadata_count+1]) is dict) \
            .filter(lambda x: len(x[metadata_count])> int(feature_count_threshold) ) \
            .cache()
                
        all_hashes_cnt_dic=None
        all_hash_str_dic=None
        all_hashes_seq_dic = None
    else:
        print "INFO: pattern_str=",pattern_str+"<--"
        print "INFO: ln_delimitor=",ln_delimitor+"<--"
        print "INFO: label_idx=",label_idx
        print "INFO: data_idx=",data_idx
        print "INFO: metadata_count=",metadata_count
        print "INFO: filter_ratio=",filter_ratio        
        
        # filter top and least percentage of feature
        if not filter_ratio is None and filter_ratio > 0 and filter_ratio <1:
            # check total count here before continue
            upper_cnt=total_input_count*(1-filter_ratio)
            lower_cnt=total_input_count*filter_ratio
            # set limit for lower bound. if total count is large, lower_cnt may exclude all features...
            # max lower count =  min( MAX_FILTER_LOWER_CNT, total_input_count/100 ) 
            if not MAX_FILTER_LOWER_CNT is None and lower_cnt > MAX_FILTER_LOWER_CNT:
                if MAX_FILTER_LOWER_CNT > total_input_count/100:
                    lower_cnt=total_input_count/100
                else:
                    lower_cnt=MAX_FILTER_LOWER_CNT


            print "INFO: filtering by count, upper bound=",upper_cnt,",lower bound=",lower_cnt
            # find unique feature, count them, remove them if in highest and lowest % and then create a dict 
            f_feat_set = Set (txt_rdd.map(lambda x:x.split(ln_delimitor)).flatMap(lambda x:Set(x[metadata_count:])) \
                .map(lambda x:(x,1)).reduceByKey(lambda a, b: a + b) \
                .filter(lambda x:x[1]<= upper_cnt and x[1]>= lower_cnt) \
                .map(lambda x:x[0]).collect() )
                
            print "INFO: f_feat_set len=",len(f_feat_set)
            broadcast_f_set = sc.broadcast(f_feat_set)

            #txt_rdd=txt_rdd.map(lambda x: filter_by_list(x, metadata_count,ln_delimitor, broadcast_f_list.value ))
            txt_rdd=txt_rdd.map(lambda x: x.split(ln_delimitor)) \
                        .map(lambda x: x[:metadata_count]+ [w for w in x[metadata_count:] if w and w in broadcast_f_set.value]) \
                        .map(lambda x: ln_delimitor.join(x))
        
        
        # preprocess by pattern matching and then extract n-gram features   #.encode('UTF8')
        #   input txt_rdd format (string):  meta-data1\tmeta-data2\t...\tdataline1\tdataline2\t...datalineN\n
        #   output featured_rdd format (list):[meta-data1,meta-data2,..., hash_cnt_dic, hash_str_dic]
        #       hash_cnt_dic: {hash,hash:count,...}  hash_str_dic: {hash: 'str1',... }
        tmp_rdd = txt_rdd \
            .map(lambda x: preprocess_pattern(x, metadata_count, pattern_str, ln_delimitor \
                                                , label_idx, label_arr, convert2dirty )) \
            .filter(lambda x: len(x) > metadata_count) \
            .filter(lambda x: type(x[metadata_count]) is list) #.cache() memory issue...
        #tmp_rdd_count=tmp_rdd.count()
        #print "INFO: After preprocessing count=",tmp_rdd_count
        featured_rdd = tmp_rdd \
            .map(lambda x: feature_extraction_ngram(x, data_idx, MAX_FEATURES, num_gram)) \
            .filter(lambda x: len(x) > metadata_count) \
            .filter(lambda x: type(x[metadata_count]) is dict) \
            .filter(lambda x: type(x[metadata_count+1]) is dict) \
            .filter(lambda x: len(x[metadata_count])> int(feature_count_threshold) ) \
            .cache()
        #feat_rdd_count=featured_rdd.count()
        #print "INFO: After featuring count=",feat_rdd_count

        all_hashes_cnt_dic=None
        all_hash_str_dic=None
        all_hashes_seq_dic = None
    
    #get all hashes and total occurring count ===============
    #   all_hashes_cnt_dic: {'hash,hash': total count,... }
    if all_hashes_cnt_dic is None:
        #all_hashes_cnt_dic = featured_rdd.map(lambda x: x[metadata_count]).reduce(lambda a, b: combine_dic_cnt(a, b))
        all_hashes_cnt_dic = dict(featured_rdd.flatMap(lambda x: x[metadata_count].items()).reduceByKey(lambda a, b: a + b).collect())
    
    #get all hashes and their extracted string  ===============
    #   all_hash_str_dic: {hash:'str1', ...
    if all_hash_str_dic is None:
        #all_hash_str_dic = featured_rdd.map(lambda x: x[metadata_count+1]).reduce(lambda a, b: combine_dic(a, b))
        all_hash_str_dic = dict(featured_rdd.flatMap(lambda x: x[metadata_count+1].items()).distinct().collect())
    
    # get all labels into an array  =============== provided by parameter?
    if label_arr is None:
        # will force "clean" be 0 here
        label_arr=sorted(featured_rdd.map(lambda x: x[label_idx].lower()).distinct().collect())
        # debug only
        print "INFO: label_arr=",json.dumps(sorted(label_arr))
    
    # save labels to hdfs as text file==================================== ============
    hdfs_folder = hdfs_feat_dir #+ "/"   # "/" is needed to create the folder correctly
    print "INFO: hdfs_folder=", hdfs_folder
    try:
        hdfs.mkdir(hdfs_folder)
    except IOError as e:
        print "WARNING: I/O error({0}): {1}".format(e.errno, e.strerror)
    except:
        print "WARNING: Unexpected error at mkdir:", sys.exc_info()[0]     
    
    # clean up metadata_file
    metadata_file = os.path.join(hdfs_folder , metadata) #"metadata"
    print "INFO: metadata_file=", metadata_file
    try:
        hdfs.rmr(metadata_file)
    except IOError as e:
        print "WARNING: I/O error({0}): {1}".format(e.errno, e.strerror)
    except:
        print "WARNING: Unexpected error at rmr():", sys.exc_info()[0]     
    sc.parallelize(label_arr,1).saveAsTextFile(metadata_file)
    
    #remap all hash values to continuous key/feature number ==============
    #     all_hashes_seq_dic: { hash : sequential_numb }
    if all_hashes_seq_dic is None:
        all_hashes_seq_dic={}
        remap2seq(all_hashes_cnt_dic, all_hashes_seq_dic)   #all_hashes_seq_dic has continuous key number
    #print "all_hashes_seq_dic=",all_hashes_seq_dic
    total_feature_numb=len(all_hashes_seq_dic)
    print "INFO: Total feature count=", len(all_hashes_seq_dic)

    # featured_rdd (list):    [meta-data1,meta-data2,..., hash_cnt_dic, hash_str_dic]
    # seq_featured_rdd(list): [meta-data1,meta-data2,..., hash_cnthsh_dict, hash_str_dic] (feat id in sorted sequence)
    # hash_cnt_dic: {hash: count}  hash_str_dic: {hash: 'str1,str2...' }
    #     set binary_flag to True, all feature:value will be 1
    broadcast_dic = sc.broadcast(all_hashes_seq_dic)
    seq_featured_rdd = featured_rdd.map(lambda x: convert2seq(x,label_idx,data_idx,broadcast_dic.value,binary_flag= True)).cache() 
    
    # get hash_cnthsh_dict then flatMap and reduce to (feat id, count)
    ct_rdd=seq_featured_rdd.flatMap(lambda x: [(i[0],i[1]) for i in x[data_idx].iteritems()]).reduceByKey(lambda a, b: a + b)
    # sorted by feature id as int
    feat_sample_count_arr=ct_rdd.sortBy(lambda x:int(x[0])).map(lambda x:x[1]).collect()
    # sort after collect may fail when rdd is huge
    #feat_sample_count_arr=[]
    #for i in sorted(ct_rdd.collect(), key=lambda t: int(t[0])):
    #    feat_sample_count_arr.append(i[1])
    print "INFO: feat_sample_count_arr len=",len(feat_sample_count_arr)
    
    # save feat_sample_count_arr data ==================================== ============
    filter='{"rid":'+row_id_str+',"key":"feat_sample_count_arr"}'
    upsert_flag=True
    jo_insert={}
    jo_insert["rid"]=eval(row_id_str)
    jo_insert["key"]="feat_sample_count_arr"
    jo_insert["value"]=feat_sample_count_arr
    jstr_insert=json.dumps(jo_insert)
    ret=query_mongo.upsert_doc_t(mongo_tuples,filter,jstr_insert,upsert_flag)
    print "INFO: Upsert count for feat_sample_count_arr=",ret
    # insert failed, save to local
    if ret==0:
        # drop old record in mongo
        ret=query_mongo.delete_many(mongo_tuples,None,filter)
        if not os.path.exists(local_out_dir):
            os.makedirs(local_out_dir)
        fsca_hs=os.path.join(local_out_dir,row_id_str,row_id_str+"_feat_sample_count_arr.pkl")
        print "WARNING: save feat_sample_count_arr to local"
        ml_util.ml_pickle_save(feat_sample_count_arr, fsca_hs)
   
    # save feature data; TBD. not used. ==================================== ============
    
    #libsvm_rdd=seq_featured_rdd.map(lambda x: convert2libsvm(x,label_idx,data_idx,label_arr))
    # put hash to the front of each row, assume hash is after label
    libsvm_rdd=seq_featured_rdd.map(lambda x: x[label_idx+1]+" "+convert2libsvm(x,label_idx,data_idx,label_arr))
    # debug only
    #print "libsvm_rdd="
    #for i in libsvm_rdd.collect():
    #    print i

    # get rdd statistics info
    stats= featured_rdd.map(lambda p: len(p[metadata_count])).stats()
    feat_count_max=stats.max()
    feat_count_stdev=stats.stdev()
    feat_count_mean=stats.mean()
    sample_count=stats.count()
    print "INFO: libsvm data: sample count=",sample_count,",Feat count mean=",feat_count_mean,",Stdev=",feat_count_stdev
    print "INFO:   ,max feature count=",feat_count_max
    # find sample count
    lbl_arr=featured_rdd.map(lambda x: (x[label_idx],1)).reduceByKey(add).collect()
    print "INFO: Sample count by label=",lbl_arr

    
    # remove duplicated libsvm string; only keep the first duplicated item, assume space following key_idx
    if remove_duplicated=="Y":
        libsvm_rdd=libsvm_rdd \
            .map(lambda x: ( ','.join(x.split(' ')[metadata_count:]), x)) \
            .groupByKey().map(lambda x: list(x[1])[0] ) \
            .cache()        
        cnt_list= libsvm_rdd.map(lambda x: (x.split(' ')[1],1)).reduceByKey(add).collect()
        stats= libsvm_rdd.map(lambda x: len(x.split(' ')[metadata_count:])).stats()
        feat_count_max=stats.max()
        feat_count_stdev=stats.stdev()
        feat_count_mean=stats.mean()
        sample_count=stats.count()
        print "INFO: Non-Duplicated libsvm data: sample count=",sample_count,",Feat count mean=",feat_count_mean,",Stdev=",feat_count_stdev
        print "INFO:   ,max feature count=",feat_count_max
        print "INFO: Non-Duplicated Label count list=",cnt_list
        
    # save libsvm data ==================================== ============
    libsvm_data_file = os.path.join(hdfs_folder , libsvm_alldata_filename) #"libsvm_data"
    print "INFO: libsvm_data_file=", libsvm_data_file
    try:
        #hdfs.ls(save_dir)
        #print "find hdfs folder"
        hdfs.rmr(libsvm_data_file)
        if num_gram == 1:
            # dnn_data_file is defined later; build the same path here for the clean-up
            hdfs.rmr(os.path.join(hdfs_folder, dnn_alldata_filename))
        #print "all files removed"
    except IOError as e:
        print "WARNING: I/O error({0}): {1} at libsvm_data_file clean up".format(e.errno, e.strerror)
    except:
        print "WARNING: Unexpected error at libsvm file clean up:", sys.exc_info()[0]     
    
    #codec = "org.apache.hadoop.io.compress.GzipCodec"
    #libsvm_rdd.saveAsTextFile(libsvm_data_file, codec)  
    libsvm_rdd.saveAsTextFile(libsvm_data_file) # TBD encrypted
    
    feat_count_file = libsvm_data_file+"_feat_count"
    print "INFO: feat_count_file=", feat_count_file
    try:
        hdfs.rmr(feat_count_file)
    except IOError as e:
        print "WARNING: I/O error({0}): {1} at feat_count clean up".format(e.errno, e.strerror)
    except:
        print "WARNING: Unexpected error at libsvm feature count clean up:", sys.exc_info()[0]     
    sc.parallelize([total_feature_numb],1).saveAsTextFile(feat_count_file)

    label_dic = {}
    # assign label a number
    for idx, label in enumerate(sorted(label_arr)):
        if not label in label_dic:
            label_dic[label] = idx      #starting from 0, value = idx, e.g., clean:0, dirty:1
    
    # output text for DNN:[meta-data1,meta-data2,..., [feature tokens]] ================= DNN ===========
    if num_gram == 1: # special flag to tokenize and keep input orders
        print "INFO: processing data for DNN..."
        # create token dict
        # str_hash_dict: string to hash
        # all_hashes_seq_dic: hash to seq id
        if token_dict is None or len(token_dict)==0:
            token_dict={}
            str_hash_dict={v: k for k, v in all_hash_str_dic.iteritems()}
            for k,v in str_hash_dict.iteritems():
                token_dict[k]=int(all_hashes_seq_dic[str(v)])
            #print "token_dict=",len(token_dict),token_dict
        
        dnn_rdd = tmp_rdd \
            .map(lambda x: tokenize_by_dict(x, data_idx, token_dict,label_idx, label_dic)) \
            .filter(lambda x: len(x) > metadata_count) \
            .filter(lambda x: type(x[metadata_count]) is list) 
            #.cache()
            # filter duplication here
        #print dnn_rdd.take(3)
        dnn_data_file = os.path.join(hdfs_folder , dnn_alldata_filename) #"dnn_data"
        print "INFO: dnn_data_file=", dnn_data_file
        try:
            hdfs.rmr(dnn_data_file)
        except IOError as e:
            print "WARNING: I/O error({0}): {1} at dnn_data_file clean up".format(e.errno, e.strerror)
        except:
            print "WARNING: Unexpected error at libsvm file clean up:", sys.exc_info()[0]
        try:
            dnn_rdd.saveAsTextFile(dnn_data_file)
        except:
            print "WARNING: Unexpected error at saving dnn data:", sys.exc_info()[0]
                
        try:
            stats= dnn_rdd.map(lambda p: len(p[metadata_count])).stats()
            feat_count_max=stats.max()
            feat_count_stdev=stats.stdev()
            feat_count_mean=stats.mean()
            sample_count=stats.count()
            print "INFO: DNN data: sample count=",sample_count,",Feat count mean=",feat_count_mean,",Stdev=",feat_count_stdev
            print "INFO:   ,max feature count=",feat_count_max
        except:
            print "WARNING: Unexpected error at getting stats of dnn_rdd:", sys.exc_info()[0]

        
    
    # clean up pca data in hdfs ============ ========================
    pca_files= '*'+libsvm_alldata_filename+"_pca_*"
    #print "INFO: pca_files=", pca_files
    try:
        f_list=hdfs.ls(hdfs_folder)
        if len(f_list)>0:
            df_list=fnmatch.filter(f_list,pca_files)
            for f in df_list:
                print "INFO: rm ",f
                hdfs.rmr(f)
    except IOError as e:
        print "WARNING: I/O error({0}): {1}".format(e.errno, e.strerror)
    except:
        print "WARNING: Unexpected error at libsvm pca file clean up:", sys.exc_info()[0]     

    # clean up pca data in web local ============ ========================       
    pca_fname=os.path.join(model_data_folder , row_id_str+'_pca_*.pkl*')
    print "INFO: pca_fname=", pca_fname
    
    try:
        for fl in glob.glob(pca_fname):
            print "INFO: remove ", fl
            os.remove(fl)
    except OSError as e:
        print ("Error: %s - %s." % (pca_fname, e.strerror))
Example #15
0
'''
Prior Audience Sample, to compare levels
0 = Weak
1 = Medium
2 = Strongest
'''


def _getSample():
    return random.randrange(MIN_VAL, MAX_VAL + 1)


prior = 1  #initialize the richard
jokesAndResponses = {}  #set of told jokes, and their response
jokesTold = 0
heuristics = Set()
'''
Main Function Here
'''


class heuristic(object):
    def __init__(self, t):
        self.type = t
        self.fails = 0
        self.prob = 100  #out of a hundred percent

    def getInfo(self):
        return [self.type, self.fails, self.prob]

    def getType(self):
Example #16
0
def readGraph(file, n, p, mean, std_dev, PView, PShare, content_count):
    G1 = LoadEdgeList(PUNGraph, file, 0, 1)
    n = G1.GetNodes()
    CmtyVt = TCnComV()

    #Getting the Community
    CommunityCNM(G1, CmtyVt)
    nodes = TIntV()
    for N in G1.GetNI(0).GetOutEdges():
        nodes.Add(N)

    G1 = GetSubGraph(G1, nodes)

    #Drawing the original Community Graph
    #DrawGViz(G1, gvlDot, "graph1.png", "graph 1")

    Graph = {}
    for u in G1.Nodes():
        for v in u.GetOutEdges():
            if Graph.has_key(u.GetId()):
                Graph[u.GetId()].add(v)
            else:
                Graph.update({u.GetId(): Set([v])})

    #Initialize the probability Vectors.
    for i in range(0, n):
        PView += [[0 for j in range(0, n)]]
        PShare += [[0 for j in range(0, n)]]

    #Populating the probability vectors.
    for v in G1.Nodes():
        for u in v.GetOutEdges():
            id_src = v.GetId()
            id_dst = u
            view_prob = np.random.binomial(n, p, 1)[0] / (n * 1.0)
            share_prob = np.random.binomial(n, p, 1)[0] / (n * 1.0)
            PView[id_dst][id_src] = view_prob
            PShare[id_dst][id_src] = share_prob

    #Content forest each entry in this array is a forest for some content.
    content_forest = []

    # 4039 * 0.148 #Number of content introduction points.
    content_intro_count = 60

    #Generating forest for each content.
    for i in range(0, content_count):

        #Generating the random introduction points i.e. the users who introduce the content.
        random_sample = random.sample(range(0, n), content_intro_count)

        #Generating the forest for a content
        content_forest.append(BFS(Graph, random_sample, PView, PShare))

    new_Graph = {}
    weight = {}
    G2 = TNGraph.New()
    for i in range(0, n):
        G2.AddNode(i)

    #Generating the inferred graph
    for cf in content_forest:
        for e in cf.Edges():
            if new_Graph.has_key(e.GetSrcNId()) and e.GetDstNId() in new_Graph[
                    e.GetSrcNId()]:
                weight[str(e.GetSrcNId()) + ',' +
                       str(e.GetDstNId())] = weight[str(e.GetSrcNId()) + ',' +
                                                    str(e.GetDstNId())] + 1
            elif new_Graph.has_key(e.GetSrcNId()):
                new_Graph[e.GetSrcNId()].add(e.GetDstNId())
                weight.update(
                    {str(e.GetSrcNId()) + ',' + str(e.GetDstNId()): 1})
                G2.AddEdge(e.GetSrcNId(), e.GetDstNId())
            else:
                new_Graph.update({e.GetSrcNId(): Set([e.GetDstNId()])})
                weight.update(
                    {str(e.GetSrcNId()) + ',' + str(e.GetDstNId()): 1})
                G2.AddEdge(e.GetSrcNId(), e.GetDstNId())

    #Sum of Weights of all the neighbours of a vertex
    TWeight = {}
    for u in new_Graph.keys():
        sum = 0
        for v in new_Graph[u]:
            sum += weight[str(u) + ',' + str(v)]
            TWeight.update({u: sum})

#Calculating Edge Confidence
    for u in new_Graph.keys():
        for v in new_Graph[u]:
            weight[str(u) + ',' +
                   str(v)] = weight[str(u) + ',' + str(v)] / (TWeight[u] * 1.0)

    #calculating conf_threshold
    conf_thershold = mean + 0 * std_dev

    #Generating the graph whose edges have conf_value greate than conf_threshold
    G3 = TUNGraph.New()
    nodes = []
    for u in new_Graph.keys():
        for v in new_Graph[u]:
            if weight[str(u) + ',' + str(v)] >= conf_thershold:
                if u not in nodes:
                    G3.AddNode(u)
                    nodes += [u]
                if v not in nodes:
                    G3.AddNode(v)
                    nodes += [v]
                G3.AddEdge(u, v)

    #Drawing the inferred Graph
    DrawGViz(G3, gvlDot, "graph3.png", "graph 2")
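The confidence computation above is a per-source normalisation of edge weights; a tiny self-contained sketch with hypothetical numbers:

# Sketch: edge confidence = edge weight / total outgoing weight of its source node.
adj = {0: set([1, 2])}             # hypothetical inferred adjacency
w = {'0,1': 3, '0,2': 1}           # raw propagation counts per edge
total = dict((u, sum(w['%d,%d' % (u, v)] for v in adj[u])) for u in adj)
conf = dict((e, w[e] / (total[int(e.split(',')[0])] * 1.0)) for e in w)
print(conf)                        # 0,1 -> 0.75 and 0,2 -> 0.25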
Example #17
0
def __init__(self, h):
    self.allH = Set()
    for elm in h:
        self.allH.add(heuristic(elm))
class ExternalProgramTestSuite:
    """ A Class for creating Test Suites with
    test cases which call external programs 
    """
    # internal static variables
    _test_suites = {}
    _num_formatting_chars = 100
    _all_log_files = Set()
    _has_run = False
    _framework_output_file = None
    # public static variables
    color_output_text = True
    suite_header_color = Fore.MAGENTA
    case_header_color = Fore.CYAN
    suite_result_header_color = Fore.YELLOW

    def __init__(self, **kwargs):
        # reset the suite variables
        self._set_suite_defaults()
        # test suite name is the name of the suite class
        # or the suite_name arg if passed
        if 'suite_name' in kwargs:
            if assert_variable_type(kwargs['suite_name'], str):
                self.suite_name = kwargs['suite_name']
        else:
            # suite name defaults to class name
            self.suite_name = self.__class__.__name__
        # add test suite to class static total list
        try:
            # make sure a suite with the same
            # name does not already exist
            if self.suite_name not in ExternalProgramTestSuite._test_suites:
                ExternalProgramTestSuite._test_suites[self.suite_name] = {
                    'self': self,
                    'name': self.suite_name,
                    'description': self.suite_description,
                    'args': kwargs,
                    'num_passed': 0,
                    'num_tests': 0,
                    'num_checks': 0,
                    'num_checks_passed': 0,
                    'execution_time': 0,
                    'has_run': False,
                    'pass_threshold': 100,
                    'passed': False
                }
            else:
                raise ValueError(
                    'A suite with the name "%s" already exists. '
                    'Please rename one of the suite classes or pass a unique "suite_name" argument to one or both of the constructors.'
                    % self.suite_name
                )
        except ValueError as e:
            raise Exception('[%s] %s' % (type(e).__name__, e))

    def log(self, print_string, error=False, color=Fore.RESET):
        """Wrapper over print function to allow writing
        test framework output to file if desired.
        """
        # write the print output to the log files
        if self.log_framework_output:
            if error and self.stderr_file is not None:
                with open(self.stderr_file, 'a') as f:
                    f.write(print_string + "\r\n")
            elif self.stdout_file is not None:
                with open(self.stdout_file, 'a') as f:
                    f.write(print_string + "\r\n")
        # print the output and color appropriately
        if ExternalProgramTestSuite.color_output_text:
            print(color + print_string + Fore.RESET + Back.RESET +
                  Style.RESET_ALL)
        # Aptana's interactive console doesn't accept ANSI escape
        # characters but at least it colors the stderr red so
        # separate normal output from error output appropriately
        else:
            if error:
                sys.stderr.write(print_string + "\r\n")
                sys.stderr.flush()
            else:
                sys.stdout.write(print_string + "\r\n")
                sys.stdout.flush()

    def _set_suite_defaults(self):
        """Set the suite variables to their defaults
        """
        # set the default suite variables
        # default suite name and description
        self.suite_name = None
        self.suite_description = None
        # number of passed test cases
        self._num_tests_passed = 0
        # num checks and failures
        self._total_checks_passed = 0
        self._total_checks = 0
        # threshold in percentage of tests
        # passed to decide status of suite
        self.suite_pass_threshold = 100
        # whether to truncate the log file
        # before writing to it
        self.overwrite_log_file = True
        # whether to print process output
        # or just write it to the log file
        self.print_process_output = True
        self.log_framework_output = False
        # default log path
        self._default_log_file = "run.log"
        self.stdout_file = self.stderr_file = self._default_log_file
        # setup and teardown function
        self._suite_setup = None
        self._suite_teardown = None
        # timelimit values
        self._suite_timelimit_met = True
        self.suite_timelimit = None
        self.suite_case_timelimit = None
        # invalid args list
        self._invalid_args = []

    def _set_case_defaults(self):
        """ 
        Set the case variables to their defaults
        """
        # default test case variables
        self._name = None
        # whether to print process output
        # or just write it to the log file
        self.print_case_output = self.print_process_output
        # default log path
        self.stdout_file = self.stderr_file = self._default_log_file
        # default case description
        self._description = None
        # num checks and failures
        self._num_checks_passed = 0
        self._num_checks = 0
        # threshold in percentage of checks
        # passed to decide status of case
        self.case_pass_threshold = 100
        # test case time limit
        self._timelimit = self.suite_case_timelimit
        # wait to print case header
        self._wait_sem = 0
        # fixture, setup, teardown
        self._fixture = None
        self._case_setup = None
        self._case_teardown = None

    def _setup_suite(self, **kwargs):
        """ 
        Set the suite variables
        """
        # if a test suite requires common variables across all test cases,
        # they can be passed through kwargs and are set here
        for key, value in kwargs.items():
            # set each keyword argument as an attribute on the suite instance
            setattr(self, str(key), value)
        # each function in a test suite class is a test case
        # so get the cases and add them to the testSuites list
        test_names = {
            key: value
            for key, value in self.__class__.__dict__.items()
            if isinstance(value, FunctionType)
        }
        self.test_cases = []
        for name in test_names:
            if name == "setup":
                self._suite_setup = getattr(self, name)
            elif name == "teardown":
                self._suite_teardown = getattr(self, name)
            elif 'fixture' not in name.lower():
                self.test_cases.append(name)

    def _setup_case(self):
        # if a suite has started running and the overwrite log file
        # flag was set to True, truncate the log files
        if self.overwrite_log_file and (
                not ExternalProgramTestSuite._has_run or len([
                    (x) for x in [self.stdout_file, self.stderr_file]
                    if x not in ExternalProgramTestSuite._all_log_files
                ]) > 0):
            for log_file in [self.stdout_file, self.stderr_file]:
                ExternalProgramTestSuite._all_log_files.add(log_file)
                with open(log_file, 'w') as f:
                    f.truncate(0)

    def _end_case(self):
        # call fixture teardown if set
        if self._case_teardown is not None:
            if isinstance(self._case_teardown, MethodType):
                self._case_teardown(self)
            else:
                self._case_teardown()

    def _validate_argument(self, argument, types):
        key = argument.keys()[0]
        valid, message = assert_variable_type(argument[key], types, False)
        if not valid:
            self._invalid_args.append("%s: %s" % (key, message))

    def _validate_suite_arguments(self):
        """ 
        Validate test suite argument types
        """
        # reset invalid args list
        self._invalid_args = []
        #string
        string_vars = [{
            "suite_description": self.suite_description
        }, {
            "stdout_file": self.stdout_file
        }, {
            "stderr_file": self.stderr_file
        }]
        [self._validate_argument(x, [str, NoneType]) for x in string_vars]
        # bool
        bool_vars = [{
            "overwrite_log_file": self.overwrite_log_file
        }, {
            "print_process_output": self.print_process_output
        }, {
            "log_framework_output": self.log_framework_output
        }]
        [self._validate_argument(x, bool) for x in bool_vars]
        # float
        float_vars = [{
            "suite_timelimit": self.suite_timelimit
        }, {
            "suite_case_timelimit": self.suite_case_timelimit
        }]
        [
            self._validate_argument(x, [int, float, NoneType])
            for x in float_vars
        ]
        # functions
        function_vars = [{
            "suite setup": self._suite_setup
        }, {
            "suite teardown": self._suite_teardown
        }]
        [
            self._validate_argument(x, [MethodType, NoneType])
            for x in function_vars
        ]
        # raise exception if any invalid args
        if len(self._invalid_args) > 0:
            raise InvalidArgument(('\r\n').join(self._invalid_args))

    def _validate_test_arguments(self):
        """ 
        Validate test case argument types
        """
        # reset invalid args list
        self._invalid_args = []
        #string
        string_vars = [{
            'description': self._description
        }, {
            'name': self._name
        }]
        [self._validate_argument(x, [str, NoneType]) for x in string_vars]
        # float
        float_vars = [{'timelimit': self._timelimit}]
        [
            self._validate_argument(x, [int, float, NoneType])
            for x in float_vars
        ]
        # fixture
        try:
            if self._fixture is not None:
                self._case_setup, self._case_teardown = self._fixture()
        except Exception:
            self._invalid_args.append(
                'a proper fixture returning a setup and teardown function was not provided'
            )
        # functions (fixture override)
        function_vars = [{
            'case setup': self._case_setup
        }, {
            'case teardown': self._case_teardown
        }]
        [
            self._validate_argument(x, [FunctionType, MethodType, NoneType])
            for x in function_vars
        ]
        # raise exception if any invalid args
        if len(self._invalid_args) > 0:
            raise InvalidArgument(('\r\n').join(self._invalid_args))

    def run(self, suite_name=None):
        """
        Run the test suite
        """
        # capture start time
        suite_start_time = timeit.default_timer()
        # setup suite
        if suite_name is None:
            suite_name = self.suite_name
        self._setup_suite(
            **ExternalProgramTestSuite._test_suites[suite_name]['args'])
        # validate suite args
        try:
            self._validate_suite_arguments()
        except Exception as e:
            ExternalProgramTestSuite._test_suites[
                self.suite_name]['has_run'] = True
            raise SuiteError('Error in test suite "%s" [%s] %s' %
                             (suite_name, type(e).__name__, e))
        # run all the test cases
        for index, case in enumerate(sorted(self.test_cases)):
            self.test_case = getattr(self, case)
            if not self.test_case:
                raise Exception("Test Case %s does not exist" %
                                str(self.test_case))
            # reset the default suite/case variables
            self._set_case_defaults()
            # set test case name to case
            self._name = case
            # suite setup routine
            self._setup_case()
            # print test suite name and description if any
            # and if first loop through cases
            if index == 0:
                self.log("=" * ExternalProgramTestSuite._num_formatting_chars)
                self.log("TEST SUITE: %s" % suite_name, False,
                         ExternalProgramTestSuite.suite_header_color)
                if self.suite_description:
                    self.log("Description: %s" % (self.suite_description))
                    ExternalProgramTestSuite._test_suites[suite_name][
                        'description'] = self.suite_description
                # call suite setup function if set
                if self._suite_setup is not None:
                    self._suite_setup()
            # run the test case
            try:
                self._run_test_case()
            except Exception as e:
                self.log('[%s] %s' % (type(e).__name__, e), True, Fore.RED)
            # set has_run flags
            ExternalProgramTestSuite._has_run = True
            # set suite attributes for static _test_suites list
            ExternalProgramTestSuite._test_suites[
                self.suite_name]['has_run'] = True
            ExternalProgramTestSuite._test_suites[
                self.suite_name]['pass_threshold'] = self.suite_pass_threshold
            # end case routine
            self._end_case()
        # capture suite end time
        suite_end_time = timeit.default_timer()
        suite_time_taken = suite_end_time - suite_start_time
        ExternalProgramTestSuite._test_suites[
            self.suite_name]['execution_time'] = suite_time_taken
        # if a timelimit was set
        # check if it was met
        if self.suite_timelimit is not None:
            self.log("_" * ExternalProgramTestSuite._num_formatting_chars)
            if suite_time_taken <= self.suite_timelimit:
                self.log(
                    'CHECK PASS: suite completed before time limit of %.4f' %
                    self.suite_timelimit, False, Back.GREEN)
                self._total_checks_passed += 1
            else:
                self.log(
                    'CHECK FAIL: suite did not complete before time limit of %.4f'
                    % self.suite_timelimit, True, Back.RED)
                self._suite_timelimit_met = False
            self._total_checks += 1
        # call suite teardown function if set
        if self._suite_teardown is not None:
            self._suite_teardown()
        # print test result
        self._print_suite_results()

    def case_header(self):
        """ Test case header output 
        """
        # print case name
        self.log("-" * ExternalProgramTestSuite._num_formatting_chars)
        self.log("CASE: %s" % self._name, False,
                 ExternalProgramTestSuite.case_header_color)
        # print description if any
        if self._description is not None:
            self.log("Description: %s" % (str(self._description)))
        self.log("-" * ExternalProgramTestSuite._num_formatting_chars)
        # validate args
        try:
            self._validate_test_arguments()
        except Exception as e:
            self.log('[%s] %s' % (type(e).__name__, e), True, Fore.RED)
        # call fixture setup if set
        if self._case_setup is not None:
            if isinstance(self._case_setup, MethodType):
                self._case_setup(self)
            else:
                self._case_setup()

    def _run_test_case(self):
        """
        Run an individual test case
        """
        # read source file to see decorators and
        # call case_header at the right time
        test_function = self._name
        suite_class = str(self.__class__).rpartition('.')[2]
        lines = []
        save_lines = False
        with open(inspect.getmodule(self.__class__).__file__) as f:
            for line in f:
                if suite_class in line:
                    save_lines = True
                if save_lines and test_function in line:
                    break
                if save_lines and 'def ' in line:
                    lines = []
                if (save_lines and len(line.strip()) > 0
                        and line.strip()[0] == "@"):
                    lines.append(line.strip().rpartition('(')[0])
        # semaphore to wait for calling
        # case_header after all decorators
        self._wait_sem = len(lines)
        # if semaphore is 0 print case header immediately
        if self._wait_sem == 0:
            self.case_header()
        # run test case
        execution_time = timeit.timeit(self.test_case, number=1)
        # if a timelimit was set
        # check if it was met
        if self._timelimit is not None:
            if execution_time <= self._timelimit:
                self.log(
                    'CHECK PASS: test completed before time limit of %.4f' %
                    self._timelimit, False, Back.GREEN)
                self._num_checks_passed += 1
            else:
                self.log(
                    'CHECK FAIL: test did not complete before time limit of %.4f'
                    % self._timelimit, True, Back.RED)
            self._num_checks += 1
        # print pass/fail, execution time
        if self._num_checks > 0:
            percentage_passed = (self._num_checks_passed * 1.0 /
                                 self._num_checks) * 100
        else:
            percentage_passed = 0
        output_string = ("%d/%d (%.2f%%) CHECKS in %.4f seconds" %
                         (self._num_checks_passed, self._num_checks,
                          percentage_passed, execution_time))
        if percentage_passed >= self.case_pass_threshold or self._num_checks == 0:
            output_string += " TEST PASS"
            if self.case_pass_threshold != 100:
                output_string += " with %.2f%% threshold" % self.case_pass_threshold
            self.log(output_string, False, Back.GREEN)
            self._num_tests_passed += 1
        else:
            output_string += " TEST FAIL"
            self.log(output_string, False, Back.RED)
        self._total_checks += self._num_checks
        self._total_checks_passed += self._num_checks_passed

    def _print_suite_results(self):
        self.log("*" * ExternalProgramTestSuite._num_formatting_chars)
        self.log("SUITE RESULT", False,
                 ExternalProgramTestSuite.suite_result_header_color)
        self.log("*" * ExternalProgramTestSuite._num_formatting_chars)
        passed = self._print_info_and_status()
        self.log("=" * ExternalProgramTestSuite._num_formatting_chars)
        # add test result to class static suite list
        ExternalProgramTestSuite._test_suites[
            self.suite_name]['num_tests'] = len(self.test_cases)
        ExternalProgramTestSuite._test_suites[
            self.suite_name]['num_passed'] = self._num_tests_passed
        ExternalProgramTestSuite._test_suites[
            self.suite_name]['passed'] = passed
        ExternalProgramTestSuite._test_suites[
            self.suite_name]['num_checks'] = self._total_checks
        ExternalProgramTestSuite._test_suites[
            self.suite_name]['num_checks_passed'] = self._total_checks_passed

    def _print_info_and_status(self, suite_name=""):
        num_tests = len(self.test_cases)
        passed = False
        try:
            if num_tests > 0:
                percentage_tests_passed = (self._num_tests_passed * 1.0 /
                                           num_tests) * 100
            else:
                percentage_tests_passed = 0
            if self._total_checks > 0:
                percentage_checks_passed = (self._total_checks_passed * 1.0 /
                                            self._total_checks) * 100
            else:
                percentage_checks_passed = 0
            output_string = (
                "%s%d/%d (%.2f%%) TESTS with %d/%d (%.2f%%) CHECKS in %.4f seconds"
                % (suite_name, self._num_tests_passed, num_tests,
                   percentage_tests_passed, self._total_checks_passed,
                   self._total_checks, percentage_checks_passed,
                   ExternalProgramTestSuite._test_suites[
                       self.suite_name]['execution_time']))
            if percentage_tests_passed >= self.suite_pass_threshold and self._suite_timelimit_met:
                output_string += " OK"
                if self.suite_pass_threshold != 100:
                    output_string += " with %.2f%% threshold" % self.suite_pass_threshold
                self.log(output_string, False, Back.GREEN)
                passed = True
            else:
                output_string += " NOT OK"
                self.log(output_string, False, Back.RED)
        except Exception as e:
            self.log('[%s] %s' % (type(e).__name__, e), True, Fore.RED)
        return passed

    def check_subprocess(self,
                         executable_command,
                         command_arguments,
                         expected_returncode,
                         timeout=None,
                         print_process_output=True,
                         stdout_file=None,
                         stderr_file=None,
                         poll_seconds=.100):
        process = None
        try:
            process, execution_time = run_subprocess(
                executable_command, command_arguments, timeout,
                print_process_output, stdout_file, stderr_file, poll_seconds)
        except OSError as e:
            self.log('[%s] %s' % (type(e).__name__, e), True, Fore.RED)
        except ValueError as e:
            self.log('[%s] %s' % (type(e).__name__, e), True, Fore.RED)
        except TimeoutError as e:
            self.log('[%s] %s' % (type(e).__name__, e), True, Fore.RED)
        # print pass/fail, execution time
        if process is not None:
            if process.returncode == expected_returncode:
                self.log('CHECK PASS', False, Back.GREEN)
                self._num_checks_passed += 1
            else:
                self.log('CHECK FAIL', True, Back.RED)
            self.log("%.4f seconds" % (execution_time))
        else:
            self.log('CHECK FAIL', True, Back.RED)
        self._num_checks += 1

    @staticmethod
    def run_all():
        """
        Run all registered test suites that have run
        """
        ExternalProgramTestSuite._has_run = False
        for suite, properties in ExternalProgramTestSuite._test_suites.items():
            try:
                ExternalProgramTestSuite.run(properties['self'],
                                             properties['name'])
            except Exception as e:
                properties['self'].log('[%s] %s' % (type(e).__name__, e), True,
                                       Fore.RED)
                # print test result
                properties['self']._print_suite_results()
        ExternalProgramTestSuite.print_total_results()

    @staticmethod
    def print_total_results():
        """
        Print the cumulative results from all suites registered and run
        """
        # print results for each suite on one line
        # keep track of test results info for totals
        total_num_tests = 0
        total_num_passed = 0
        total_checks = 0
        total_checks_passed = 0
        total_suites_passed = 0
        total_num_suites = 0
        total_execution_time = 0
        percentage_passed = 0
        try:
            for index, (suite, results) in enumerate(
                    ExternalProgramTestSuite._test_suites.items()):
                self = results['self']
                if index == 0:
                    self.log("*" *
                             ExternalProgramTestSuite._num_formatting_chars)
                    self.log(
                        "ALL SUITE RESULTS", False,
                        ExternalProgramTestSuite.suite_result_header_color)
                    self.log("*" *
                             ExternalProgramTestSuite._num_formatting_chars)
                if results['has_run']:
                    self._print_info_and_status(suite + ": ")
                    total_num_tests += results['num_tests']
                    total_num_passed += results['num_passed']
                    if total_num_tests > 0 and results['passed']:
                        total_suites_passed += 1
                    total_checks += results['num_checks']
                    total_checks_passed += results['num_checks_passed']
                    total_execution_time += results['execution_time']
                    self.log("_" *
                             ExternalProgramTestSuite._num_formatting_chars)
                    total_num_suites += 1
            # print cumulative total pass/fail
            if total_num_tests > 0:
                if total_checks > 0:
                    percentage_checks_passed = (total_checks_passed * 1.0 /
                                                total_checks) * 100
                else:
                    percentage_checks_passed = 0
                self.log("TOTALS")
                self.log("." * ExternalProgramTestSuite._num_formatting_chars)
                percentage_passed = (total_suites_passed * 1.0 /
                                     total_num_suites) * 100
                self.log(
                    "%d/%d (%.2f%%) SUITES\n%d/%d (%.2f%%) TESTS\n%d/%d (%.2f%%) CHECKS\nin %.4f seconds"
                    % (total_suites_passed, total_num_suites,
                       percentage_passed, total_num_passed, total_num_tests,
                       (total_num_passed * 1.0 / total_num_tests) * 100,
                       total_checks_passed, total_checks,
                       percentage_checks_passed, total_execution_time))
            if percentage_passed == 100:
                self.log("OK", False, Back.GREEN)
            else:
                self.log("NOT OK", False, Back.RED)
            self.log("." * ExternalProgramTestSuite._num_formatting_chars)
        except Exception as e:
            print(Fore.RED + '[%s] %s' % (type(e).__name__, e) + Fore.RESET +
                  Back.RESET + Style.RESET_ALL)
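A minimal usage sketch for the suite class above; the command, arguments and expected return code are hypothetical, and the module's own imports (colorama, run_subprocess, assert_variable_type, etc.) are assumed to be available:

# Hypothetical usage sketch, not part of the original module.
class EchoSuite(ExternalProgramTestSuite):
    def test_echo_runs(self):
        # expect `echo hello` to exit with return code 0
        self.check_subprocess("echo", ["hello"], 0)

EchoSuite(suite_description="smoke test for the echo command")
ExternalProgramTestSuite.run_all()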
Example #19
0
def diffList(left, right, path, result):
    for x in range(len(left)):
        path2 = path + '[' + str(x) + ']'
        if x >= len(right):
            result['missingOnRight'].append(path2)
        else:
            diffValue(left[x], right[x], path2, result)
    for x in range(len(left), len(right)):
        path2 = path + '[' + str(x) + ']'
        result['missingOnLeft'].append(path2)


# ---------------------------------------------------------------------------------

ALLOWED_MISSING_ON_RIGHT = Set([".version", ".policyType", ".guid"])


def isPolicyIdentical(old, new):
    result = digdiff(old, new)
    #misc.ppprint(old)
    #misc.ppprint(new)
    debug("missingOnLeft:{}".format(result['missingOnLeft']))
    debug("missingOnRight:{}".format(result['missingOnRight']))
    debug("differsByType:{}".format(result['differsByType']))
    debug("differsByValue:{}".format(result['differsByValue']))
    if len(result['missingOnLeft']) > 0 or len(
            result['differsByType']) > 0 or len(result['differsByValue']) > 0:
        return False
    else:
        for missing in result["missingOnRight"]:
Example #20
0
from collections import defaultdict
from sets import Set

pos_seedlist=["good", "nice", "love", "excellent", "fortunate", "correct", "superior"]
neg_seedlist=["bad", "nasty", "poor", "hate", "unfortunate", "wrong", "inferior"]

sentences=open('/home/twinkle/NLP/hw3/tweets.txt').read().strip().split("\n")
#one row
#condition to ignore
wordcounts=defaultdict(int)
pair_counts=defaultdict(int)
seedsum=defaultdict(float)
pmi=defaultdict(float)
polarity=defaultdict(float)
total=0
bow=Set()
words=[]
allwords=[]
#make a set, wordsum, break loop for both, 
i=0
#bla=[]
for sent in sentences:
    print i
    i+=1
    temp=sent.split(' ')
    words.append(list(Set(temp)))
 
    
    
for row in words:
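The snippet is cut off here; judging from the seed lists and co-occurrence counters above, it appears to be building a Turney-style SO-PMI polarity score. A hedged sketch of those formulas (the tuple key layout of pair_counts is an assumption):

# Assumed continuation sketch, not the original code.
from math import log

def pmi(pair_count, word_count, seed_count, total):
    # pointwise mutual information between a word and a seed word
    if pair_count == 0 or word_count == 0 or seed_count == 0:
        return 0.0
    return log(pair_count * float(total) / (word_count * seed_count), 2)

def polarity_score(word, wordcounts, pair_counts, total, pos_seeds, neg_seeds):
    # sum of PMI with positive seeds minus sum of PMI with negative seeds
    pos = sum(pmi(pair_counts[(word, s)], wordcounts[word], wordcounts[s], total)
              for s in pos_seeds)
    neg = sum(pmi(pair_counts[(word, s)], wordcounts[word], wordcounts[s], total)
              for s in neg_seeds)
    return pos - neg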
Example #21
0
def filter_emojis(text):
    return Set(text.replace(':', '')).issubset(allowed_chars) and len(text) > 2
import sys
from sets import Set
import numpy

fdata = open(sys.argv[1])
fcluster = open(sys.argv[2])
fout = open(sys.argv[3], "w")
output_type = int(sys.argv[4])

uid_cid = {}
for line in fcluster:
    uid = int(line.split(" ")[0])
    cid = int(line.split(" ")[1])
    uid_cid[uid] = cid

doc_set = Set()

cluster_ctr = {}
for line in fdata:
    line = line[:-1]
    line_arr = line.split("|")
    user_id = int(line_arr[1])
    if user_id not in uid_cid:
        continue
    cluster_id = uid_cid[user_id]
    shown_doc = line_arr[0].split(" ")[1]
    clicked = int(line_arr[0].split(" ")[2])
    doc_set.add(shown_doc)

    if cluster_id not in cluster_ctr:
        cluster_ctr[cluster_id] = {}
Example #23
0
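# Note: this class assumes wxPython (classic), aggdraw and math are imported
# elsewhere in the module, along with aggBrush/aggPen helpers that convert
# wx brushes/pens into their aggdraw equivalents.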
class AggDC:
    PassThrough = Set(
                      '''
                         BeginDrawing
                         EndDrawing
                         GetBackground
                         GetSize
                         GetSizeTuple
                         SetBrush
                         SetPen
                      '''.split()
                     )

    def __init__(self, dc):
        self.dc = dc
        self.dc.BeginDrawing()
        w, h = self.dc.GetSizeTuple()
        self.draw = aggdraw.Draw('RGB', (w, h))
        self.draw.rectangle((0, 0, w, h), None, aggBrush(dc.GetBackground()))

    def __del__(self):
        w, h = self.dc.GetSizeTuple()
        if w and h:
            image = wx.EmptyImage(w, h)
            image.SetData(self.draw.tostring())
            self.dc.DrawBitmap(image.ConvertToBitmap(), 0, 0)
        self.dc.EndDrawing()

    def __getattr__(self, attr):
        if attr in self.PassThrough:
            return getattr(self.dc, attr)
        else:
            raise AttributeError("%s instance has no attribute '%s'" % (self.__class__.__name__, attr))


    def CrossHair(self, x, y):
        #self.dc.CrossHair(x, y)

        w, h = self.dc.GetSizeTuple()
        p = aggPen(self.dc.GetPen())
        self.draw.line((0, y, w, y), p)
        self.draw.line((x, 0, x, h), p)


    def DrawArc(self, x1, y1, x2, y2, xc, yc):
        #self.dc.DrawArc(x1, y1, x2, y2, xc, yc)

        b = aggBrush(self.dc.GetBrush())
        p = aggPen(self.dc.GetPen())
        radius = ((xc-x1)**2 + (yc-y1)**2)**0.5
        self.draw.pieslice(
                           (xc-radius, yc-radius, xc+radius, yc+radius),
                           math.degrees(math.atan2(yc-y1, x1-xc)),
                           math.degrees(math.atan2(yc-y2, x2-xc)),
                           p, b
                          )


    def DrawCircle(self, x, y, radius):
        #self.dc.DrawCircle(x, y, radius)

        b = aggBrush(self.dc.GetBrush())
        p = aggPen(self.dc.GetPen())
        self.draw.ellipse((x-radius, y-radius, x+radius, y+radius), p, b)


    def DrawEllipse(self, x, y, width, height):
        #self.dc.DrawEllipse(x, y, width, height)

        b = aggBrush(self.dc.GetBrush())
        p = aggPen(self.dc.GetPen())
        self.draw.ellipse((x, y, x+width, y+height), p, b)


    def DrawLine(self, x1, y1, x2, y2):
        #self.dc.DrawLine(x1, y1, x2, y2)

        p = aggPen(self.dc.GetPen())
        self.draw.line((x1, y1, x2, y2), p)


    def DrawRectangle(self, x, y, width, height):
        #self.dc.DrawRectangle(x, y, width, height)

        b = aggBrush(self.dc.GetBrush())
        p = aggPen(self.dc.GetPen())
        self.draw.rectangle((x, y, x+width, y+height), p, b)


    def DrawPolygon(self, points):
        #self.dc.DrawPolygon(points)
        n = []
        for p in points:
            n += p

        b = aggBrush(self.dc.GetBrush())
        p = aggPen(self.dc.GetPen())
        self.draw.polygon(n, p, b)
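
AggDC looks intended as a short-lived, antialiased stand-in for a wx device context: you draw through it and its __del__ blits the aggdraw buffer back onto the real DC. A hedged usage sketch, assuming wxPython classic (the wrapper relies on GetSizeTuple and EmptyImage, which Phoenix removed):

import wx

class Canvas(wx.Panel):
    def __init__(self, parent):
        wx.Panel.__init__(self, parent)
        self.Bind(wx.EVT_PAINT, self.on_paint)

    def on_paint(self, event):
        dc = wx.PaintDC(self)   # the real device context
        agg = AggDC(dc)         # antialiased wrapper defined above
        agg.DrawCircle(60, 60, 40)
        agg.DrawLine(0, 0, 120, 120)
        del agg                 # __del__ blits the aggdraw buffer back onto dc
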
Example #24
def find_certain_child(certain, uncertain, possibles):
    li = []
    for name in Set(certain):
        min_ = minCount(possibles, name) - uncertain.count(name)
        li.extend(min_ * [name])
    return li
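
minCount is not part of this fragment; judging from how its result is used, one plausible reading is the minimum number of times `name` occurs across the candidate lists in `possibles`. A sketch under that assumption (hypothetical helper, not the original):

def minCount(possibles, name):
    # Minimum number of times `name` appears across all candidate lists.
    return min(candidates.count(name) for candidates in possibles)

print(minCount([['a', 'a', 'b'], ['a', 'b']], 'a'))  # 1
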
Example #25
    def run(self):
        """
		2008-05-08
			transpose everything if output_matrix_type=1 (bjarni's SNP matrix format)
		2007-02-19
			--db_connect
			--get_snp_id2index()
			--get_strain_id2index()
			--get_strain_id_info()
			--get_snp_id_info()
			--get_data_matrix()
			if self.toss_out_rows:
				--toss_rows_to_make_distance_matrix_NA_free()
					--find_smallest_vertex_set_to_remove_all_edges()
			--write_data_matrix()
			#--sort_file()
		2007-09-22
			for mysql_connection
				add get_nativename_snpid2call_m()
				add fill_in_resolved_duplicated_calls()
		"""
        if self.debug:
            import pdb
            pdb.set_trace()
        if self.db_connection_type == 1:
            import MySQLdb
            #conn = MySQLdb.connect(db="stock",host='natural.uchicago.edu', user='******', passwd='iamhereatusc')
            conn = MySQLdb.connect(db=self.dbname,
                                   host=self.hostname,
                                   user=self.user,
                                   passwd=self.passwd)
            curs = conn.cursor()
            snp_id2index, snp_id_list, snp_id2info = self.get_snp_id2index_m(
                curs, self.input_table, self.snp_locus_table)
            strain_id2index, strain_id_list, nativename2strain_id, strain_id2acc, strain_id2category = self.get_strain_id2index_m(curs, \
                         self.input_table, self.strain_info_table, self.only_include_strains_with_GPS, \
                         self.resolve_duplicated_calls, toss_contaminants=self.toss_contaminants)

            #strain_id2acc, strain_id2category = self.get_strain_id_info_m(curs, strain_id_list, self.strain_info_table)
            #snp_id2info = self.get_snp_id_info_m(curs, snp_id_list, self.snp_locus_table)
            if self.input_table == 'dbsnp.calls':
                from variation.src.FigureOut384IlluminaABMapping import get_snps_id2mapping
                snps_id2mapping = get_snps_id2mapping(self.hostname,
                                                      dbname='dbsnp',
                                                      user=self.user,
                                                      passwd=self.passwd)
            else:
                snps_id2mapping = None
            data_matrix = self.get_data_matrix_m(curs, strain_id2index,
                                                 snp_id2index, nt2number,
                                                 self.input_table,
                                                 self.need_heterozygous_call,
                                                 snps_id2mapping)
            """
			if self.resolve_duplicated_calls:
				nativename_snpid2call = self.get_nativename_snpid2call_m(curs, self.strain_info_table, self.input_table)
				data_matrix = self.fill_in_resolved_duplicated_calls(data_matrix, strain_id2index, snp_id2index, nativename2strain_id, nativename_snpid2call)
			"""
            if self.include_other_strain_info:
                strain_id2other_info = self.get_strain_id2other_info(
                    curs, strain_id_list, self.strain_info_table,
                    self.input_table)
            else:
                strain_id2other_info = {}
        elif self.db_connection_type == 2:
            (conn, curs) = db_connect(self.hostname, self.dbname, self.schema)
            snp_id2index, snp_id_list = self.get_snp_id2index(
                curs, self.input_table, self.snp_locus_table)
            strain_id2index, strain_id_list = self.get_strain_id2index(
                curs, self.input_table)

            strain_id2acc, strain_id2category = self.get_strain_id_info(
                curs, strain_id_list, self.strain_info_table)
            snp_id2info = self.get_snp_id_info(curs, snp_id_list,
                                               self.snp_locus_table)
            data_matrix = self.get_data_matrix(curs, strain_id2index,
                                               snp_id2index, nt2number,
                                               self.input_table,
                                               self.need_heterozygous_call)
            strain_id2other_info = {}

        if self.toss_out_rows:
            rows_to_be_tossed_out = self.toss_rows_to_make_distance_matrix_NA_free(
                data_matrix)
            rows_to_be_tossed_out = Set(rows_to_be_tossed_out)
        else:
            rows_to_be_tossed_out = Set()

        #05/08/08
        if self.discard_all_NA_strain:
            from variation.src.FilterStrainSNPMatrix import FilterStrainSNPMatrix
            remove_rows_data = FilterStrainSNPMatrix.remove_rows_with_too_many_NAs(
                data_matrix, row_cutoff=1)
            rows_with_too_many_NAs_set = remove_rows_data.rows_with_too_many_NAs_set
            #row_index2no_of_NAs = remove_rows_data.row_index2no_of_NAs
            rows_to_be_tossed_out.update(rows_with_too_many_NAs_set)

        strain_acc_list = [
            strain_id2acc[strain_id] for strain_id in strain_id_list
        ]
        category_list = [
            strain_id2category[strain_id] for strain_id in strain_id_list
        ]

        strain_acc2other_info = {}
        for strain_id in strain_id2other_info:
            strain_acc2other_info[
                strain_id2acc[strain_id]] = strain_id2other_info[strain_id]

        if self.output_matrix_type == 1:
            #transpose everything
            data_matrix = num.array(data_matrix)
            data_matrix = num.transpose(data_matrix)

            header = ['Chromosomes', 'Positions'] + strain_acc_list
            chromosome_ls = []
            position_ls = []
            for snp_id in snp_id_list:
                snp_name, chromosome, position = snp_id2info[snp_id]
                chromosome_ls.append(chromosome)
                position_ls.append(position)

            strain_acc_list = chromosome_ls
            category_list = position_ls
            cols_to_be_tossed_out = rows_to_be_tossed_out
            rows_to_be_tossed_out = None
            strain_id2other_info = None  #make up one
        else:
            header = ['strain', 'category']
            for snp_id in snp_id_list:
                snp_name, chromosome, position = snp_id2info[snp_id]
                header.append(snp_name)
            cols_to_be_tossed_out = None

        write_data_matrix(data_matrix, self.output_fname, header, strain_acc_list, category_list, rows_to_be_tossed_out=rows_to_be_tossed_out, \
           cols_to_be_tossed_out=cols_to_be_tossed_out, nt_alphabet=self.nt_alphabet,\
           strain_acc2other_info=strain_acc2other_info, delimiter=self.delimiter)
Example #26
import numpy as np
from warnings import warn  # assumed source of `warn` used below

# response_strs, num_list_strs, remove_MNIST_strs and mass_str_replace come
# from the surrounding module and are not shown in this fragment.


def process_line(task_str, task_data_str):
    # Ignore any responses that make it into the task string
    task_str = mass_str_replace(task_str, response_strs, '')

    # Process task_data_str into component bits
    # For all tasks except learning task, extract spaun's answer
    if task_str in ['A0', 'A1', 'A3', 'A4', 'A5', 'A6', 'A7']:
        # Split the task data string into before and after the question mark
        task_data_split = task_data_str.split('?', 1)

        # The task information is before the question mark
        task_info = task_data_split[0].replace("'", '')
        # Filter out the MNIST digits
        task_info = remove_MNIST_strs(task_info)

        # Record special characters
        has_F = 'F' in task_info
        has_R = 'R' in task_info
        has_P = 'P' in task_info
        has_K = 'K' in task_info

        # Split up the different components of the task info
        task_info_split = task_info.split(']')

        if task_info_split[-1] == '':
            task_info_split = task_info_split[:-1]

        # Remove [ ]'s and special characters from each part of task_info_split
        for i in range(len(task_info_split)):
            task_info_split[i] = \
                mass_str_replace(task_info_split[i],
                                 ['[', ']', 'F', 'R', 'P', 'K', '-'], '')

        # Spaun's answer is after the question mark
        task_answer_spaun = \
            np.array(list(mass_str_replace(task_data_split[1],
                                           response_strs, num_list_strs)))

        if len(task_answer_spaun) == 0:
            return (None, None)

    # ------ Reference answer generation ------
    if task_str in ['A0', 'A1', 'A3']:
        # For copy-draw, classification, memory task
        task_info = np.array(list(task_info_split[0]))
        if has_R:
            task_answer_ref = task_info[-1::-1]
        else:
            task_answer_ref = task_info
    elif task_str == 'A4':
        # For counting tasks
        start_num = int(task_info_split[0])
        count_num = int(task_info_split[1])
        ans_num = start_num + count_num

        # Ignore invalid task options
        if ans_num > 9:
            task_str = 'INVALID'
            warn('A4: Computed answer > 9')

        task_answer_ref = np.array([str(ans_num)])
    elif task_str == 'A5':
        # QA task
        num_list = map(int, list(task_info_split[0]))
        probe_num = int(task_info_split[1])

        if has_P:
            task_answer_ref = np.array([str(num_list[probe_num - 1])])
        elif has_K:
            task_answer_ref = np.array([str(num_list.index(probe_num) + 1)])
        else:
            task_str = 'INVALID'
            warn('A5: No valid P/K for QA task')
    elif task_str == 'A6':
        from sets import Set
        # RVC task
        if len(task_info_split) % 2:
            match_list = None
            for i in range(len(task_info_split) / 2):
                list1 = np.array(list(task_info_split[i * 2]))
                list2 = np.array(list(task_info_split[i * 2 + 1]))
                if match_list is None:
                    match_list = [
                        Set(np.where(list1 == item)[0]) for item in list2
                    ]
                else:
                    # TODO: Check for inconsistencies across pairs
                    if len(list2) != len(match_list):
                        warn('A6: Inconsistent RVC ref answer lengths.')
                        task_str = 'INVALID'
                    else:
                        match_list = [
                            match_list[j]
                            & Set(np.where(list1 == list2[j])[0])
                            for j in range(len(match_list))
                        ]
            list1 = np.array(list(task_info_split[-1]))
            task_answer_ref = np.array(
                [list1[list(set_list)[0]] for set_list in match_list])
        else:
            task_str = 'INVALID'
            warn('A6: Invalid RVC task. No question list given.')
    elif task_str == 'A7':
        # Raven's induction task
        # Induction task comes in three forms: changing list length, changing
        # number relation, and identical lists
        col_count = 1
        induction_diff = None
        induction_len_change = None
        induction_identity = None

        for i in range(1, len(task_info_split)):
            if col_count % 3 == 0:
                col_count += 1
                continue
            list1 = map(int, np.array(list(task_info_split[i - 1])))
            list2 = map(int, np.array(list(task_info_split[i])))

            # Handle the following cases:
            # 1. Unchanging list lengths of len 1
            if len(list1) == len(list2) == 1:
                diff = list2[0] - list1[0]
                if induction_diff is None:
                    induction_diff = diff
                if induction_diff != diff:
                    warn('A7: Inconsistent change between induction items')
                    task_str = 'INVALID'
            # 2. Changing list lengths, but containing identical items
            elif (list1[0] == list2[0]) and (len(list1) != len(list2)):
                len_change = len(list2) - len(list1)
                if induction_len_change is None:
                    induction_len_change = len_change
                if induction_len_change != len_change:
                    warn('A7: Inconsistent change between list lengths')
                    task_str = 'INVALID'
            elif (len(list1) == len(list2)) and (list1 == list2):
                induction_identity = True
            else:
                warn('A7: Unhandled induction task type')
                task_str = 'INVALID'

            # Handle transition to next row
            col_count += 1

        def spaun_response_to_int(c):
            return int(c) if c.isdigit() else -1

        list1 = map(spaun_response_to_int, list(task_info_split[-1]))
        if induction_diff is not None and induction_len_change is None and \
           induction_identity is None:
            task_answer_ref = np.array(map(str, [list1[0] + induction_diff]))
        elif (induction_len_change is not None and induction_diff is None
              and induction_identity is None):
            task_answer_ref = np.array(
                map(str, [list1[0]] * (len(list1) + len_change)))
        elif (induction_len_change is None and induction_diff is None
              and induction_identity is not None):
            task_answer_ref = np.array(map(str, list1))
        else:
            warn('A7: Multiple induction types encountered?')
            task_str = 'INVALID'

    # Format the task answer list (make the same length as the reference
    # answer list). Applies to all but learning task
    if task_str == 'INVALID':
        return task_str, np.array([0])

    if task_str in ['A0', 'A1', 'A3', 'A4', 'A5', 'A6', 'A7']:
        task_answer = np.chararray(task_answer_ref.shape)
        task_answer[:] = ''
        task_answer_len = min(len(task_answer_ref), len(task_answer_spaun))
        task_answer[:task_answer_len] = task_answer_spaun[:task_answer_len]

        # DEBUG
        # print task_data_str, task_answer, task_answer_ref
    else:
        print task_data_str

    if task_str in ['A0', 'A1', 'A3']:
        # For memory, recognition, copy drawing tasks, check recall accuracy
        # per item
        return ('_'.join([task_str, str(len(task_answer_ref))]),
                map(int, task_answer == task_answer_ref))

    if task_str in ['A4', 'A5', 'A6', 'A7']:
        # For other non-learning tasks, check accuracy as wholesale correct /
        # incorrect
        if task_answer[0] == '-':
            return (None, None)
        return ('_'.join([task_str, str(len(task_answer_ref))]),
                [int(np.all(task_answer == task_answer_ref))])
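
mass_str_replace is called above both with a single replacement string and with a list of per-pattern replacements, but it isn't shown. A minimal stand-in that satisfies both call sites (hypothetical, not the original helper):

def mass_str_replace(text, patterns, replacement):
    # Replace each pattern, either with one shared string or element-wise
    # with a parallel list of replacements.
    if isinstance(replacement, str):
        replacement = [replacement] * len(patterns)
    for old, new in zip(patterns, replacement):
        text = text.replace(old, new)
    return text

print(mass_str_replace('A1[12]?', ['[', ']', '?'], ''))    # A112
print(mass_str_replace('ab', ['a', 'b'], ['ONE', 'TWO']))  # ONETWO
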
Example #27
import operator

import commentjson
from sets import Set


def read_hyperion_config(file_path):
    """
    Parses hyperion config file.
    """
    with open(file_path) as hyperion_config_json:
        config = commentjson.load(hyperion_config_json)

        leds = []

        x_coords = []
        y_coords = []

        for led in config.get('leds', []):
            hscan = led['hscan']
            vscan = led['vscan']
            hmin = hscan['minimum']
            hmax = hscan['maximum']
            vmin = vscan['minimum']
            vmax = vscan['maximum']
            h_center = round(((hmin + hmax) / 2) * 100, 2)
            v_center = round(((vmin + vmax) / 2) * 100, 2)
            x_coords.append(h_center)
            y_coords.append(v_center)
            leds.append({'x': h_center, 'y': v_center})

        xcounts = []
        left = None
        right = None

        for x in Set(x_coords):
            xcounts.append({'x': x, 'count': x_coords.count(x)})

        if len(dict(
            (xcount['count'], xcount) for xcount in xcounts).values()) > 1:
            # Position might not be minimum for TV setups
            xcounts.sort(key=operator.itemgetter('count'))
            right = xcounts[len(xcounts) - 2]
            left = xcounts[len(xcounts) - 1]
        else:
            # Position should be minimum for matrix setups
            xcounts.sort(key=operator.itemgetter('x'))
            right = xcounts[len(xcounts) - 1]
            left = xcounts[0]

        if right['x'] < left['x']:
            left, right = right, left

        ycounts = []
        top = None
        bottom = None

        for y in Set(y_coords):
            ycounts.append({'y': y, 'count': y_coords.count(y)})

        if len(dict(
            (ycount['count'], ycount) for ycount in ycounts).values()) > 1:
            # Position might not be minimum for TV setups
            ycounts.sort(key=operator.itemgetter('count'))
            bottom = ycounts[len(ycounts) - 2]
            top = ycounts[len(ycounts) - 1]
        else:
            # Position should be minimum for matrix setups
            ycounts.sort(key=operator.itemgetter('y'))
            bottom = ycounts[len(ycounts) - 1]
            top = ycounts[0]

        if bottom['y'] < top['y']:
            top, bottom = bottom, top

        leds_left = []
        leds_right = []
        leds_top = []
        leds_bottom = []

        for i, led in enumerate(leds):
            x = led['x']
            y = led['y']
            if x == left['x']:
                leds_left.append(i)
            elif x == right['x']:
                leds_right.append(i)
            elif y == top['y']:
                leds_top.append(i)
            elif y == bottom['y']:
                leds_bottom.append(i)

        # Sort the lists
        leds_top.sort(key=lambda i: leds[i]['x'], reverse=False)
        leds_right.sort(key=lambda i: leds[i]['y'], reverse=False)
        leds_bottom.sort(key=lambda i: leds[i]['x'], reverse=True)
        leds_left.sort(key=lambda i: leds[i]['y'], reverse=True)

        # Now the lists run like this:

        #  >>>>>>> TOP >>>>>>>
        #  ^                 v
        #  ^                 v
        # LEFT              RIGHT
        #  ^                 v
        #  ^                 v
        #  <<<<< BOTTOM <<<<<<

        # print 'leds_top: {}'.format(leds_top)
        # print 'leds_right: {}'.format(leds_right)
        # print 'leds_bottom: {}'.format(leds_bottom)
        # print 'leds_left: {}'.format(leds_left)

        return (leds, leds_top, leds_right, leds_bottom, leds_left)
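
Going by the diagram in the comments, a natural way to consume the return value is to chain the four edge lists into one clockwise ordering. A hedged usage sketch; the config path is just a placeholder:

leds, leds_top, leds_right, leds_bottom, leds_left = read_hyperion_config(
    '/etc/hyperion/hyperion.config.json')  # placeholder path
clockwise = leds_top + leds_right + leds_bottom + leds_left
for i in clockwise:
    print(i, leds[i]['x'], leds[i]['y'])
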
Example #28
effectivenessMap = {
    -2 : 0.51,
    -1 : 0.714,
    0 : 1,
    1 : 1.4
}

attributeNum = Set([
    "height",
    "weight",
    "number",
    "maxcp",
    "attack",
    "defense",
    "stamina",
    "damage",
    "energy",
    "energy gain",
    "dps",
    "eps",
    "cooldown",
    "activation",
    "bars"
])

def parsePokemon(attributes, poke):
    global pokemon
    current = {}
    delta = 0
    for i in range(0, len(attributes)):
        attr = attributes[i]
Example #29
# Flask view fragment; `models`, `gtfs_parser`, `decode`, `array_from_query`
# and `unique_array` are app-local helpers not shown in this fragment.
from flask import jsonify, request
from sets import Set


def get_routes():
    routes = None
    valid_trips = None
    n = 1
    try:
        n = int(request.args.get('next', 1))
    except ValueError:
        return jsonify({ '404' : 'Cannot parse \'next\' parameter'}), 404
    if len(request.args.keys()) > 0:
        # filter routes by the provided URL parameters
        lat1 = request.args.get('lat1', 999)
        lon1 = request.args.get('lon1', 999)
        lat2 = request.args.get('lat2', 999)
        lon2 = request.args.get('lon2', 999)
        if lat1 == 999 or lon1 == 999 or lat2 == 999 or lon2 == 999:
            # the parameters provided cannot be used to filter, so return error
            return jsonify({ '404' : 'Bad URL Parameters'}), 404
        else:
            stop_times = []
            start = decode(request.args.get('start', ''))
            stop = decode(request.args.get('stop', ''))
            
            if len(stop) > 0 and len(start) == 0:
                # the parameters provided cannot be used to filter, so return error
                return jsonify({ '404' : 'Cannot have end time without start time'}), 404
            elif len(start) == 0 and len(stop) == 0:
                # filter by latitude and longitude only
                stop_times = models.StopTime.query.filter(models.StopTime.stop_lon >= lon1, models.StopTime.stop_lon <= lon2, models.StopTime.stop_lat >= lat1, models.StopTime.stop_lat <= lat2)
            else:
                start_time = None
                stop_time = None
                try:
                    start_time = gtfs_parser.datetime_from_string(start)
                    if len(stop) > 0:
                        stop_time = gtfs_parser.datetime_from_string(stop)
                except:
                    return jsonify({ '404' : 'Cannot parse time'}), 404
                if not stop_time is None:
                    # filter within a range of time
                    stop_times = models.StopTime.query.filter(models.StopTime.stop_lon >= lon1, models.StopTime.stop_lon <= lon2, models.StopTime.stop_lat >= lat1, models.StopTime.stop_lat <= lat2, models.StopTime.arrival_time >= start_time, models.StopTime.departure_time <= stop_time)
                else:
                    # filter from initial time only
                    stop_times = models.StopTime.query.filter(models.StopTime.stop_lon >= lon1, models.StopTime.stop_lon <= lon2, models.StopTime.stop_lat >= lat1, models.StopTime.stop_lat <= lat2, models.StopTime.arrival_time >= start_time)
                
            stop_times = array_from_query(stop_times)
            stop_times.sort(key = lambda st: st.arrival_time, reverse = False)
            trips = []
            for stop_time in stop_times:
                trips.append(stop_time.trip)
            trips = unique_array(trips)
            
            filtered_routes = Set()
            for trip in trips:
                filtered_routes.add(trip.route)
            routes = filtered_routes
            valid_trips = trips
    else:
        # otherwise, no URL parameters are provided, so return all routes
        routes = models.Route.query.all()
        
    return jsonify({ 'routes' : [r.serialize(valid_trips, n) for r in routes] })
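
A hedged example of exercising this view with requests, assuming it is registered at a /routes URL (the route decorator isn't shown, and the accepted time format depends on gtfs_parser.datetime_from_string):

import requests

resp = requests.get('http://localhost:5000/routes', params={
    'lat1': 40.70, 'lon1': -74.02,  # south-west corner of the bounding box
    'lat2': 40.80, 'lon2': -73.93,  # north-east corner of the bounding box
    'start': '08:00:00',            # format depends on gtfs_parser.datetime_from_string
    'next': 2,                      # upcoming trips per route
})
print(resp.status_code, resp.json())
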
Example #30
    def initialize(self, opt):
        BaseModel.initialize(self, opt)
        if opt.resize_or_crop != 'none' or not opt.isTrain:  # when training at full res this causes OOM
            torch.backends.cudnn.benchmark = True
        self.isTrain = opt.isTrain
        self.use_features = opt.instance_feat or opt.label_feat
        self.gen_features = self.use_features and not self.opt.load_features
        input_nc = opt.label_nc if opt.label_nc != 0 else opt.input_nc

        ##### define networks
        # Generator network
        netG_input_nc = input_nc + opt.otherInfo_nc
        if not opt.no_instance:
            netG_input_nc += 1
        if self.use_features:
            netG_input_nc += opt.feat_num
        self.netG = networks.define_G(netG_input_nc,
                                      opt.output_nc,
                                      opt.ngf,
                                      opt.netG,
                                      opt.n_downsample_global,
                                      opt.n_blocks_global,
                                      opt.n_local_enhancers,
                                      opt.n_blocks_local,
                                      opt.norm,
                                      gpu_ids=self.gpu_ids)

        # Discriminator network
        if self.isTrain:
            use_sigmoid = opt.no_lsgan
            netD_input_nc = input_nc + opt.output_nc
            if not opt.no_instance:
                netD_input_nc += 1
            self.netD = networks.define_D(netD_input_nc,
                                          opt.ndf,
                                          opt.n_layers_D,
                                          opt.norm,
                                          use_sigmoid,
                                          opt.num_D,
                                          not opt.no_ganFeat_loss,
                                          gpu_ids=self.gpu_ids)

        ### Encoder network
        if self.gen_features:
            self.netE = networks.define_G(opt.output_nc,
                                          opt.feat_num,
                                          opt.nef,
                                          'encoder',
                                          opt.n_downsample_E,
                                          norm=opt.norm,
                                          gpu_ids=self.gpu_ids)
        if self.opt.verbose:
            print('---------- Networks initialized -------------')

        # Preprocessor network
        self.netP = networks.define_P(opt.otherInfoTotalSize, opt.otherInfo_nc)

        device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
        self.netP.to(device)

        # load networks
        if not self.isTrain or opt.continue_train or opt.load_pretrain:
            pretrained_path = '' if not self.isTrain else opt.load_pretrain
            self.load_network(self.netG, 'G', opt.which_epoch, pretrained_path)
            self.load_network(self.netP, 'P', opt.which_epoch, pretrained_path)
            if self.isTrain:
                self.load_network(self.netD, 'D', opt.which_epoch,
                                  pretrained_path)
            if self.gen_features:
                self.load_network(self.netE, 'E', opt.which_epoch,
                                  pretrained_path)

        # set loss functions and optimizers
        if self.isTrain:
            if opt.pool_size > 0 and (len(self.gpu_ids)) > 1:
                raise NotImplementedError(
                    "Fake Pool Not Implemented for MultiGPU")
            self.fake_pool = ImagePool(opt.pool_size)
            self.old_lr = opt.lr

            # define loss functions
            self.loss_filter = self.init_loss_filter(not opt.no_ganFeat_loss,
                                                     not opt.no_vgg_loss,
                                                     not opt.no_smooth_loss,
                                                     not opt.no_nonzero_loss)

            self.criterionGAN = networks.GANLoss(use_lsgan=not opt.no_lsgan,
                                                 tensor=self.Tensor)
            self.criterionFeat = torch.nn.L1Loss()
            if not opt.no_vgg_loss:
                self.criterionVGG = networks.VGGLoss(self.gpu_ids)

            if not opt.no_smooth_loss:
                self.criterionSmooth = networks.SmoothLoss(self.gpu_ids)

            if not opt.no_nonzero_loss:
                self.criterionNonzero = networks.NonzeroLoss(self.gpu_ids)

            # Names so we can breakout loss
            self.loss_names = self.loss_filter('G_GAN', 'G_GAN_Feat', 'G_VGG',
                                               'G_smooth', 'G_nonzero',
                                               'D_real', 'D_fake')

            # initialize optimizers
            # optimizer G
            if opt.niter_fix_global > 0:
                import sys
                if sys.version_info >= (3, 0):
                    finetune_list = set()
                else:
                    from sets import Set
                    finetune_list = Set()
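                # The sets module only exists on Python 2 (it was removed in
                # Python 3.0), hence the version check; on Python 3 alone this
                # whole branch reduces to the built-in set().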

                params_dict = dict(self.netG.named_parameters())
                params = []
                for key, value in params_dict.items():
                    if key.startswith('model' + str(opt.n_local_enhancers)):
                        params += [value]
                        finetune_list.add(key.split('.')[0])
                print(
                    '------------- Only training the local enhancer network (for %d epochs) ------------'
                    % opt.niter_fix_global)
                print('The layers that are finetuned are ',
                      sorted(finetune_list))
            else:
                params = list(self.netG.parameters())
            if self.gen_features:
                params += list(self.netE.parameters())
            self.optimizer_G = torch.optim.Adam(params,
                                                lr=opt.lr,
                                                betas=(opt.beta1, 0.999))

            # optimizer D
            params = list(self.netD.parameters())
            self.optimizer_D = torch.optim.Adam(params,
                                                lr=opt.lr,
                                                betas=(opt.beta1, 0.999))