#!/usr/bin/env python
import sys
from sets import Set

true_dups = Set()
reported_dups = Set()

if len(sys.argv) < 3:
    print "Usage: python check.py reported_duplicates true_duplicates"
    exit(-1)

reported_duplicates_file = sys.argv[1]
true_duplicates_file = sys.argv[2]

with open(true_duplicates_file, "r") as inf:
    for line in inf:
        true_dups.add(line.strip())

with open(reported_duplicates_file, "r") as inf:
    for line in inf:
        reported_dups.add(line.strip())

tp, fp, fn = 0, 0, 0
for pair in reported_dups:
    if pair in true_dups:
        tp += 1
    else:
        fp += 1
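# Hedged completion (the original snippet ends after the tp/fp loop):
# count false negatives and report precision/recall.
for pair in true_dups:
    if pair not in reported_dups:
        fn += 1

precision = float(tp) / (tp + fp) if (tp + fp) > 0 else 0.0
recall = float(tp) / (tp + fn) if (tp + fn) > 0 else 0.0
print "precision=%f recall=%f (tp=%d fp=%d fn=%d)" % (precision, recall, tp, fp, fn)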
#!/opt/bb/bin/python
import sys
from sets import Set

program = {}
counts = Set()

# input lines look like "name (weight) -> child1, child2, ..."
for line in sys.stdin:
    line = line.rstrip('\n').split(")")
    name = line[0].split()[0]
    weight = int(line[0].split("(")[1])
    words = line[1].split(", ")
    if words == ['']:
        words = []
    else:
        words[0] = words[0][4:]
    program[name] = {'weight': weight, 'words': words, 'total': 0}
    for w in words:
        counts.add(w)

# the base of the tower is the one program nobody else carries
base = ""
for p in program.keys():
    if p not in counts:
        base = p
        break
print base

problem = 0


def getWeight(name):
    p = program[name]
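    # Hedged completion of the truncated function (the original stops after
    # "p = program[name]"): recursively total a program's own weight plus the
    # weights of everything it carries, caching the result in 'total'.
    total = p['weight']
    for w in p['words']:
        total += getWeight(w)
    p['total'] = total
    return total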
import datetime
import itertools
import operator
import time as timer

import psycopg2
from sets import Set

# NOTE: this function is an excerpt; it also relies on module-level state not
# shown in the snippet: POIDict, FBCategoryPOICountDict, FB_ATDict,
# RelationDict, FBsInfIdxDict, and FBsInfMatrix.


def core3(swLng, swLat, neLng, neLat, selectUid, rawkeywords, stime, etime):
    startTime = timer.time()
    timelist = []
    keywords = rawkeywords.split(',')

    def KM(k, p):
        # keyword-match score: tf-idf against the POI's category, weighted
        # by the category's attractiveness score
        category = POIDict[p]['category']
        if k == category:
            tf = 1
        else:
            return 0
        idf = FBCategoryPOICountDict[k]
        AT = FB_ATDict[k]
        return (tf / float(idf)) * AT

    def cal_re_in(select_rid, tuples):
        # build a result tuple for a reconstructed route
        Total_PATS = 0.0
        Total_Timescore = 0.0
        Total_SocialINF = 0.0
        Total_KM = 0.0
        AllPOI = []
        if len(tuples) <= POILength:
            return
        for t in tuples:
            Total_PATS += t[2]
            Total_Timescore += t[3]
            Total_SocialINF += t[4]
            pid = t[0]
            time = t[1]
            lat = POIDict[pid]['latitude']
            lng = POIDict[pid]['longitude']
            name = POIDict[pid]['name']
            category = POIDict[pid]['category']
            link = POIDict[pid]['link']
            likes = POIDict[pid]['likes']
            checkins = POIDict[pid]['checkins']
            for keyword in keywords:
                Total_KM += KM(keyword, pid)
            POITuple = {
                'pid': pid, 'time': time, 'coor': [lat, lng], 'name': name,
                'category': category, 'link': link, 'likes': likes,
                'checkins': checkins
            }
            AllPOI.append(POITuple)
        rScore = Total_PATS + Total_Timescore + Total_SocialINF
        POICount = len(tuples)
        avg_rScore = float(rScore) / POICount
        tup = (uid, orignal_rid, select_rid, rScore, avg_rScore,
               0, 0, 0, 0, 0, 0, 0,
               Total_PATS, Total_Timescore, Total_SocialINF,
               float(Total_PATS) / POICount,
               float(Total_Timescore) / POICount,
               float(Total_SocialINF) / POICount,
               0, 0, AllPOI, Total_KM)
        return tup

    def getMinAndMax(lats, longs):
        return str(min(lats)), str(min(longs)), str(max(lats)), str(max(longs))

    def splitIntoHeadDict(route):
        # index consecutive (head, tail) POI pairs of a route by head pid
        length = len(route)
        for i in xrange(length):
            if i == length - 1:
                break
            headID = route[i][0]
            tailID = route[i + 1][0]
            head = POIDataDict[headID]
            tail = POIDataDict[tailID]
            if head[0] == tail[0] or head[1] >= tail[1]:
                continue
            if head[0] not in splitDict:
                splitDict[head[0]] = Set([])
            splitDict[head[0]].add((head, tail))

    def flushStack():
        # The original repeated this block verbatim in four places inside
        # construct(); it is factored out here without changing behavior.
        # Emit the route currently on tempStack if it is long enough, is not
        # already covered, and scores better than the original route with the
        # same POI sequence.
        global reconstructionIdx
        if len(tempStack) <= 1:
            return
        x_p = []
        score = 0.0
        for t in tempStack:
            x_p.append(t[0])
            score += (t[2] + t[3] + t[4])
        if tuple(x_p) in prefixSet:
            return
        # make prefix: everything but the last POI
        prefixSet.add(tuple(x_p[:-1]))
        if tuple(x_p) in reconstructionInput_p:
            return
        if tuple(x_p) in orignalRouteScore and \
                score <= orignalRouteScore[tuple(x_p)]:
            return
        r_rid = 'Reconstruct_' + str(reconstructionIdx)
        reconstructionOutput.append(cal_re_in(r_rid, tempStack))
        reconstructionIdx += 1
        reconstructionInput_p.add(tuple(x_p))

    def construct(pairSet):
        # depth-first extension of (head, tail) pairs into candidate routes
        endflag = False
        global construct_c
        global reconstructionIdx
        for i in pairSet:
            if len(tempStack) == 0:
                tempStack.append(i[0])
                tempStack.append(i[1])
            else:
                idx = len(tempStack) - 1
                if i[1][1] > tempStack[idx][1] and i[1][0] != tempStack[idx][0]:
                    tempStack.append(i[0])
                    tempStack.append(i[1])
                else:
                    tempStack.append(i[0])
                    endflag = True
            headSet = splitDict.get(i[0][0], None)  # computed but unused in the original
            tailSet = splitDict.get(i[1][0], None)
            if tailSet is not None and endflag == False:
                tempStack.pop()
                construct(tailSet)
                flushStack()
                tempStack.pop()
            elif endflag == True:
                flushStack()
            else:
                if len(tempStack) > 1:
                    flushStack()
                    for _ in xrange(2):
                        tempStack.pop()
                    flushStack()

    # Restored from the original's commented-out helper: it is needed by the
    # skyline step below.
    def if_dominate(check, test):
        if check == test:
            return True
        for i in xrange(len(check)):
            if check[i] > test[i]:
                return True
        return False

    pro = 0.1
    POILength = 2

    conn_string = ("host='192.168.100.200' dbname='moonorblue' "
                   "user='******' password='******'")
    conn = psycopg2.connect(conn_string)
    cur = conn.cursor()
    qByRegion = ("SELECT poi,rid FROM fb_route WHERE geom && st_makeenvelope("
                 + str(swLng) + "," + str(swLat) + "," + str(neLng) + ","
                 + str(neLat) + ",4326) AND st_area(geom) != 0 AND "
                 "(st_area(st_intersection(geom,st_makeenvelope("
                 + str(swLng) + "," + str(swLat) + "," + str(neLng) + ","
                 + str(neLat) + ",4326)))) != 0;")
    cur.execute(qByRegion)
    qByRegion_rows = [r for r in cur]
    timelist.append('Query:' + str(timer.time() - startTime))
    startTime = timer.time()

    orignalCategory = Set([])
    orignalPOI = Set([])
    minlong = float(swLng)
    minlat = float(swLat)
    maxlong = float(neLng)
    maxlat = float(neLat)
    uid = str(selectUid)
    orignal_rid = 0
    fids = Set(RelationDict.get(uid, []))

    reconstruction_start = timer.time()
    splitDict = {}
    reconstructionInput = []
    reconstructionOutput = []
    reconstructionOutputSet = Set([])
    global reconstructionIdx
    reconstructionIdx = 0
    scoreD = {}
    POIScoreDict = {}
    POIDataDict = {}
    c = 0
    qCount = 0
    for r in qByRegion_rows:
        # break if there are too many results
        if qCount > 10000:
            break
        POIs = eval(r[0])
        rid = r[1]
        localInput = []
        score = 0.0
        for POI in POIs:
            pid = POI['pid']
            latitude = POIDict[pid]['latitude']
            longitude = POIDict[pid]['longitude']
            if latitude >= minlat and latitude <= maxlat and \
                    longitude >= minlong and longitude <= maxlong:
                PATS = POI['PATS']
                timescore = POI['timeScore']
                socialINF = 0.0
                KMs = 0.0
                time = int(datetime.datetime.fromtimestamp(
                    float(POI['time'])).strftime('%H')) + 8
                if time > 24:
                    time = time - 24
                category = POIDict[pid]['category']
                visiters = Set(POIDict[pid]['visiters'])
                # sum the social influence score over visiting friends
                for v in visiters:
                    if str(v) not in fids:
                        continue
                    new_u = FBsInfIdxDict[uid]
                    new_f = FBsInfIdxDict[v]
                    scores = FBsInfMatrix[new_u][new_f]
                    socialINF += float(scores)
                for keyword in keywords:
                    KMs += KM(keyword, pid)
                score += (PATS + timescore + socialINF + KMs)
                localInput.append(
                    (pid, time, PATS, timescore, socialINF, category))
                if pid not in POIScoreDict:
                    POIScoreDict[pid] = PATS + timescore + socialINF + KMs
                    POIDataDict[pid] = (pid, time, PATS, timescore,
                                        socialINF, category)
                elif (PATS + timescore + socialINF + KMs) > POIScoreDict[pid]:
                    POIDataDict[pid] = (pid, time, PATS, timescore,
                                        socialINF, category)
            else:
                continue
        scoreD[c] = score
        c += 1
        qCount += 1
        reconstructionInput.append(tuple(localInput))
    timelist.append('POI:' + str(timer.time() - startTime))
    startTime = timer.time()

    sorted_scoreD = sorted(scoreD.items(), key=operator.itemgetter(1),
                           reverse=True)
    limit = pro * len(sorted_scoreD)
    chosedInput = []
    reconstructionInput_p = []
    orignalRouteScore = {}
    for i in xrange(int(limit)):
        chosedInput.append(sorted_scoreD[i][0])
    for i in chosedInput:
        splitIntoHeadDict(reconstructionInput[i])
        t = []
        score = 0.0
        for x in reconstructionInput[i]:
            t.append(x[0])
            score += (x[2] + x[3] + x[4])
        orignalRouteScore[tuple(t)] = score
    reconstructionInput_p = Set(reconstructionInput_p)
    prefixSet = Set()
    for i in splitDict:
        tempStack = []
        construct_c = 0
        construct(splitDict[i])
    timelist.append('Construct:' + str(timer.time() - startTime))
    startTime = timer.time()

    routeList = []
    result = []
    qCoverTime = 0.0
    ScoringTime = 0.0
    ProcessTime = 0.0
    qCount = 0
    for row_r in qByRegion_rows:
        # break if there are too many results
        if qCount > 10000:
            break
        process_start_time = timer.time()
        select_rid = row_r[1]
        if orignal_rid == select_rid:
            continue
        cover = 0
        POIs = eval(row_r[0])
        if len(POIs) <= POILength:
            continue
        rScore = 0.0
        recommendCategory = Set([])
        socialFlag = False
        POICount = len(POIs)
        recommendPOI = Set([])
        scoring_start_time = timer.time()
        Total_PATS = 0.0
        Total_Timescore = 0.0
        Total_SocialINF = 0.0
        Total_KM = 0.0
        AllPOI = []
        for POI in POIs:
            pid = POI['pid']
            PATS = POI['PATS']
            timescore = POI['timeScore']
            time = int(datetime.datetime.fromtimestamp(
                float(POI['time'])).strftime('%H')) + 8
            if time > 24:
                time = time - 24
            socialINF = 0.0
            latitude = POIDict[pid]['latitude']
            longitude = POIDict[pid]['longitude']
            if latitude >= minlat and latitude <= maxlat and \
                    longitude >= minlong and longitude <= maxlong:
                visiters = Set(POIDict[pid]['visiters'])
                # sum the social influence score over visiting friends
                for v in visiters:
                    if str(v) not in fids:
                        continue
                    new_u = FBsInfIdxDict[uid]
                    new_f = FBsInfIdxDict[v]
                    scores = FBsInfMatrix[new_u][new_f]
                    socialFlag = True
                    socialINF += float(scores)
                Total_PATS += PATS
                Total_Timescore += timescore
                Total_SocialINF += socialINF
                for keyword in keywords:
                    Total_KM += KM(keyword, pid)
                pScore = PATS + timescore + socialINF
                rScore += pScore
                name = POIDict[pid]['name']
                category = POIDict[pid]['category']
                link = POIDict[pid]['link']
                likes = POIDict[pid]['likes']
                checkins = POIDict[pid]['checkins']
                POITuple = {
                    'pid': pid, 'time': time, 'coor': [latitude, longitude],
                    'name': name, 'category': category, 'link': link,
                    'likes': likes, 'checkins': checkins
                }
                AllPOI.append(POITuple)
        avg_rScore = float(rScore) / POICount
        poi_hitCount = 0
        poiHit = 0
        editdistance = 0
        hitCount = 0
        categoryHit = 0
        ScoringTime = 0
        ProcessTime = 0
        reconstructionFlag = False
        tup = (uid, orignal_rid, select_rid, rScore, avg_rScore, categoryHit,
               cover, poi_hitCount, poiHit, socialFlag, ScoringTime,
               ProcessTime, Total_PATS, Total_Timescore, Total_SocialINF,
               float(Total_PATS) / POICount,
               float(Total_Timescore) / POICount,
               float(Total_SocialINF) / POICount,
               editdistance, reconstructionFlag, AllPOI, Total_KM)
        result.append(tup)
        qCount += 1
    result += reconstructionOutput
    timelist.append('Scoring:' + str(timer.time() - startTime))
    startTime = timer.time()
    result = [d for d in result if d is not None]

    # remove subsequence (disabled in the original)
    # POISequence = []
    # for i in result:
    #     if i is not None:
    #         POIs = i[20]
    #         Seq = ''
    #         for POI in POIs:
    #             Seq += POI['pid']
    #             Seq += ','
    #         POISequence.append(Seq)
    # seqIdx = 0
    # for seq in POISequence:
    #     for seqq in POISequence:
    #         if seq == seqq:
    #             continue
    #         if seq in seqq:
    #             result[seqIdx] = None
    #             break
    #     seqIdx += 1
    # timelist.append('Remove subseq:' + str(timer.time() - startTime))
    # startTime = timer.time()

    result_new = []
    # time constraint here!!!
    if stime == 'Anytime' and etime == 'Anytime':
        for i in result:
            if i is not None:
                POIs = i[20]
                nTuple = i + (POIs, )
                result_new.append(nTuple)
    elif stime == 'Anytime':
        endTime = int(etime.replace(':00', ''))
        for i in result:
            if i is not None:
                POIs = i[20]
                newPOIs = []
                for POI in POIs:
                    time = POI['time']
                    if int(time) <= endTime:
                        newPOIs.append(POI)
                if len(newPOIs) > 1:
                    result_new.append(i + (newPOIs, ))
    elif etime == 'Anytime':
        startTime = int(stime.replace(':00', ''))
        for i in result:
            if i is not None:
                POIs = i[20]
                newPOIs = []
                for POI in POIs:
                    time = POI['time']
                    if int(time) >= startTime:
                        newPOIs.append(POI)
                if len(newPOIs) > 1:
                    result_new.append(i + (newPOIs, ))
    else:
        startTime = int(stime.replace(':00', ''))
        endTime = int(etime.replace(':00', ''))
        for i in result:
            if i is not None:
                POIs = i[20]
                newPOIs = []
                for POI in POIs:
                    time = POI['time']
                    if int(time) >= startTime and int(time) <= endTime:
                        newPOIs.append(POI)
                if len(newPOIs) > 1:
                    result_new.append(i + (newPOIs, ))
    timelist.append('Time:' + str(timer.time() - startTime))
    startTime = timer.time()

    skylineInputDict = {}
    skylineInputValue = []
    skylineInputDict_avg = {}
    skylineInputValue_avg = []
    idxCount = 0
    for t in result_new:
        rid = t[2]
        ScoringTime += t[10]
        ProcessTime += t[11]
        # skylineT = (t[12], t[13], t[14])
        skylineT_avg = (t[15], t[16], t[17], t[21])
        skylineInputDict_avg[(rid, skylineT_avg)] = idxCount
        skylineInputValue_avg.append((rid, skylineT_avg))
        idxCount += 1

    # The original called _pool.map(cal_dominate, itertools.izip(...)) here
    # while leaving both _pool and cal_dominate commented out; the equivalent
    # sequential skyline pass (also present in the original as a comment) is
    # used instead so the function actually runs.
    r = []
    for i in skylineInputValue_avg:
        all_dominate = True
        for j in skylineInputValue_avg:
            if if_dominate(i[1], j[1]) == False:
                all_dominate = False
                break
        if all_dominate:
            r.append(i)
    resultData_avg = [result_new[skylineInputDict_avg[i]] for i in r]
    timelist.append('Skyline:' + str(timer.time() - startTime))
    startTime = timer.time()

    # sorting by PATS
    sorted_by_PATS = sorted(result_new, reverse=True,
                            key=lambda tup: tup[12])[:len(resultData_avg)]
    # sorting by timescore
    sorted_by_timescore = sorted(result_new, reverse=True,
                                 key=lambda tup: tup[13])[:len(resultData_avg)]
    # sorting by socialINF
    sorted_by_socialINF = sorted(result_new, reverse=True,
                                 key=lambda tup: tup[14])[:len(resultData_avg)]
    # sorting by KM
    sorted_by_KM = sorted(result_new, reverse=True,
                          key=lambda tup: tup[21])[:len(resultData_avg)]

    resultPOI_skyline = [i[22] for i in resultData_avg]
    resultPOI_skyline_ori = [i[22] for i in resultData_avg
                             if 'Reconstruct_' not in str(i[2])]
    resultPOI_skyline_re = [i[22] for i in resultData_avg
                            if 'Reconstruct_' in str(i[2])]
    resultPOI_PATS = [i[22] for i in sorted_by_PATS]
    resultPOI_timescore = [i[22] for i in sorted_by_timescore]
    resultPOI_socialINF = [i[22] for i in sorted_by_socialINF]
    resultPOI_KM = [i[22] for i in sorted_by_KM]
    timelist.append('Sort:' + str(timer.time() - startTime))
    startTime = timer.time()

    return (resultPOI_skyline, resultPOI_PATS, resultPOI_timescore,
            resultPOI_socialINF, resultPOI_skyline_ori, resultPOI_skyline_re,
            resultPOI_KM)
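# Hedged invocation sketch (coordinates, uid, and keywords are hypothetical;
# the module-level dictionaries must already be loaded):
# skyline, pats, times, social, sky_ori, sky_re, km = core3(
#     121.50, 25.02, 121.57, 25.06, '100001', 'Food,Museum',
#     'Anytime', 'Anytime')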
import numpy
from sets import Set

# NOTE: findEnds, findInEdges, and simplifyRule are helpers defined elsewhere
# in this module.


def compareIndividualsNodeWise(truthList, testList, model1s, model2s, covs,
                               equivs):
    modeler = model1s[0]
    SDs = [0. for q in truthList]
    nodeSDs = []
    edgeSens, inDegrees, edgePPVs = [], [], []
    inCoV = []
    TPsum, TNsum, FPsum, FNsum = 0, 0, 0, 0
    for node in range(0, len(modeler.nodeList)):
        tempSD = 0.
        FP, TP, TN, FN = 0, 0, 0, 0
        # simplify rules at the node and find the edge-wise PPV, sens, and SDs
        inCovTemper = []
        for k in range(len(truthList)):
            inCovtemp = []
            # find start and end of this node in each model
            start1, end1 = findEnds(model1s[k], node, truthList[k])
            start2, end2 = findEnds(model2s[k], node, testList[k])
            # find the shadow and nodes for each model
            truthInEdges = findInEdges(model1s[k], node)
            testInEdges = findInEdges(model2s[k], node)
            # find the bitstring for just this node
            truth = truthList[k][start1:end1]
            test = testList[k][start2:end2]
            # simplify ground truth and recovered rules
            truth = simplifyRule(truth, truthInEdges)
            test = simplifyRule(test, testInEdges)
            # edit overall rule list with simplified rules
            testList[k][start2:end2] = test
            truthList[k][start1:end1] = truth
            # find SD, PPV, etc.
            truthSet = Set([])  # edges in correct rule
            testSet = Set([])   # edges in rule found
            baseSet = Set([])   # edges possible across all rules
            # find edges in the true rule (and edges possible), and the
            # average incoming coefficient of variation
            for i in range(0, len(truth)):
                if truth[i] == 1:
                    for nodeToAdd in model1s[k].andNodeList[node][i]:
                        truthSet.add(nodeToAdd)
                    inCovtemp.append(covs[k][node])
                for nodeToAdd in model1s[k].andNodeList[node][i]:
                    baseSet.add(nodeToAdd)
            # find edges in the test (recovered) rule
            for i in range(0, len(test)):
                if test[i] == 1:
                    for nodeToAdd in model2s[k].andNodeList[node][i]:
                        testSet.add(nodeToAdd)
            # find structural distance at this node
            SDs[k] = SDs[k] + len(truthSet.difference(testSet)) + \
                len(testSet.difference(truthSet))
            tempSD = tempSD + len(truthSet.difference(testSet)) + \
                len(testSet.difference(truthSet))
            # save edge-wise statistics for this node
            FP += 1. * len(testSet.difference(truthSet))
            TP += 1. * len(testSet.intersection(truthSet))
            FN += 1. * len(truthSet.difference(testSet))
            inCovTemper.append(numpy.mean(inCovtemp))
        # calculate and save overall edge-wise statistics
        if (TP + FN) > 0:
            sensitivity = 1. * TP / (TP + FN)
        else:
            sensitivity = 100
        if TP + FP > 0:
            PPV = 1. * TP / (TP + FP)
        else:
            PPV = 100
        nodeSDs.append(tempSD / len(truthList))
        edgeSens.append(sensitivity)
        edgePPVs.append(PPV)
        TPsum += TP
        FNsum += FN
        FPsum += FP
        inDegrees.append(len(baseSet))
        inCoV.append(numpy.mean(inCovTemper))
    if (TPsum + FNsum) > 0:
        edgeSens = 1. * TPsum / (TPsum + FNsum)
    else:
        edgeSens = 100
    if (FPsum + TPsum) > 0:
        edgePPV = 1. * TPsum / (FPsum + TPsum)
    else:
        edgePPV = 100
    nodeSens = []   # sensitivity by node
    nodePPV = []    # PPV by node
    nodeRTs = []    # rules true by node
    nodePsens = []
    nodepPPV = []
    # gives a node list (should be the same across all trials in a network)
    nodelister = model1s[0].nodeList
    sampleRTs = [[] for item in truthList]    # rules true for each trial
    samplePPVs = [[] for item in truthList]   # PPV for each trial
    sampleSenss = [[] for item in truthList]  # sens for each trial
    # RT sensitivity of equivalents for each trial
    equivRTsens = [[] for item in truthList]
    # sensitivity for equivalents for each trial
    equivSens = [[] for item in truthList]
    equivNodeRTsens = []
    equivNodeSens = []
    # iterate over all nodes in the network
    for node in range(len(nodelister)):
        rtTemp = []        # stores rules true for this node across all networks
        ones = []          # stores the number of false negative and-rules
        zeros = []         # stores the number of correct and-rules
        negones = []       # stores the number of false positive and-rules
        equivOnes = []     # stores the min number of false negatives across equivs
        equivZeros = []    # stores the max correct across equivs
        equivNegOnes = []  # stores the min false positives across equivs
        sumindividual = [] # total number of true positive and-rules
        equivRTsensNode = []
        equivSensNode = []
        # loop over the individuals provided and calculate sens, PPV, rules true
        for i in range(len(truthList)):
            # find start and end of this node in each model
            start1, end1 = findEnds(model1s[i], node, truthList[i])
            start2, end2 = findEnds(model2s[i], node, testList[i])
            # find the values for just this node
            truth = truthList[i][start1:end1]
            test = testList[i][start2:end2]
            # set up empty lists for ands, edges, and the shadow and nodes
            # associated with this node in each model
            truthAnds = []
            testAnds = []
            # get the set of all shadow and nodes actually used in each rule
            for j in range(len(model1s[i].andNodeList[node])):
                if truth[j] > 0:
                    truthAnds.append(tuple(model1s[i].andNodeList[node][j]))
            for j in range(len(model2s[i].andNodeList[node])):
                if test[j] > 0:
                    testAnds.append(tuple(model2s[i].andNodeList[node][j]))
            truthAnd = set(tuple(truthAnds))
            testAnd = set(tuple(testAnds))
            # get the set of all shadow and nodes used in each equivalent rule
            equivAnds = []
            for test1 in equivs[i][node]:
                tempEquivAnd = []
                for j in range(len(model2s[i].andNodeList[node])):
                    if test1[j] > 0:
                        tempEquivAnd.append(
                            tuple(model2s[i].andNodeList[node][j]))
                testAnd1 = set(tuple(tempEquivAnd))
                equivAnds.append(testAnd1)
            RTequiv = 0.
            possibilityOnes = []
            possibilityZeros = []
            possibilityNegOnes = []
            # (assumes equivs[i][node] is non-empty, as in the original;
            # the max/min calls below fail otherwise)
            for testAnder1 in equivAnds:
                if (truthAnd == testAnder1):
                    RTequiv = 1.
                possibilityOnes.append(len(truthAnd.difference(testAnd)))
                possibilityZeros.append(len(truthAnd.intersection(testAnd)))
                possibilityNegOnes.append(len(testAnd.difference(truthAnd)))
            # append results for this trial to all results
            maxPossibilityZeros = max(possibilityZeros)
            minPossibilityOnes = min(possibilityOnes)
            minPossibilityNegOnes = min(possibilityNegOnes)
            equivOnes.append(minPossibilityOnes)
            equivZeros.append(maxPossibilityZeros)
            equivNegOnes.append(minPossibilityNegOnes)
            equivRTsensNode.append(RTequiv)
            equivRTsens[i].append(RTequiv)
            # calculate true positives, false positives, false negatives, and
            # total slots for this node and trial, then save
            onetemp = len(truthAnd.difference(testAnd))
            zerotemp = len(truthAnd.intersection(testAnd))
            negonetemp = len(testAnd.difference(truthAnd))
            sumindtemp = len(truthAnd)
            ones.append(onetemp)
            zeros.append(zerotemp)
            negones.append(negonetemp)
            sumindividual.append(sumindtemp)
            # add rules true, first sample-wise then node-wise
            if len(model1s[i].andNodeList[node]) > 1:
                if (truthAnd == testAnd):
                    sampleRTs[i].append(1.)
                else:
                    sampleRTs[i].append(0.)
                if (sumindtemp - onetemp + negonetemp) > 0:
                    samplePPVs[i].append(1. * (sumindtemp - onetemp) /
                                         (sumindtemp - onetemp + negonetemp))
                else:
                    samplePPVs[i].append(100)
                if (sumindividual[i]) > 0:
                    sampleSenss[i].append(1. * (sumindtemp - onetemp) /
                                          (sumindtemp))
                else:
                    sampleSenss[i].append(100)
                if (truthAnd == testAnd):
                    rtTemp.append(1.)
                else:
                    rtTemp.append(0.)
        nodeRTs.append(numpy.mean(rtTemp))  # node-wise rules true added
        equivNodeRTsens.append(numpy.mean(equivRTsensNode))
        # calculate sensitivity for the node
        temp = [100 if sumindividual[i] == 0 else
                1. * (sumindividual[i] - ones[i]) / (sumindividual[i])
                for i in range(0, len(ones))]
        temp = filter(lambda a: a != 100, temp)
        if len(temp) == 0:
            sensitivity = 100
        else:
            sensitivity = (1. * numpy.sum(temp) / len(temp))
        # calculate max sensitivity for the node
        temp = [100 if sumindividual[i] == 0 else
                1. * (sumindividual[i] - equivOnes[i]) / (sumindividual[i])
                for i in range(0, len(equivOnes))]
        temp = filter(lambda a: a != 100, temp)
        if len(temp) == 0:
            psensitivity = 100
        else:
            psensitivity = (1. * numpy.sum(temp) / len(temp))
        nodePsens.append(psensitivity)
        # calculate PPV for the node
        temp = [100 if (sumindividual[i] - ones[i] + negones[i]) == 0 else
                1. * (sumindividual[i] - ones[i]) /
                (sumindividual[i] - ones[i] + negones[i])
                for i in range(0, len(ones))]
        temp = filter(lambda a: a != 100, temp)
        if len(temp) == 0:
            PPV = 100
        else:
            PPV = (1. * numpy.sum(temp) / len(temp))
        # calculate equivalent-rule PPV for the node
        temp = [100 if (sumindividual[i] - equivOnes[i] + equivNegOnes[i]) == 0
                else 1. * (sumindividual[i] - equivOnes[i]) /
                (sumindividual[i] - equivOnes[i] + equivNegOnes[i])
                for i in range(0, len(equivOnes))]
        temp = filter(lambda a: a != 100, temp)
        if len(temp) == 0:
            pPPV = 100
        else:
            pPPV = (1. * numpy.sum(temp) / len(temp))
        nodepPPV.append(pPPV)
        # add to the lists of sensitivity and PPV by node
        nodeSens.append(sensitivity)
        nodePPV.append(PPV)
    sampleEquivRT = [1. * numpy.mean(filter(lambda a: a != 100, sampler))
                     for sampler in equivRTsens]  # rules true for each trial
    sampleRT = [1. * numpy.mean(filter(lambda a: a != 100, sampler))
                for sampler in sampleRTs]         # rules true for each trial
    samplePPV = [1. * numpy.mean(filter(lambda a: a != 100, sampler))
                 for sampler in samplePPVs]       # PPV for each trial
    sampleSens = [1. * numpy.mean(filter(lambda a: a != 100, sampler))
                  for sampler in sampleSenss]     # sens for each trial
    return (sampleEquivRT, equivNodeRTsens, nodePsens, nodepPPV, sampleSens,
            samplePPV, nodeSens, nodePPV, sampleRT, nodeRTs, edgeSens,
            edgePPV, SDs, nodeSDs, len(modeler.nodeList), inDegrees, inCoV)
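# A note on the 100 sentinel used throughout compareIndividualsNodeWise (a
# hedged reading of the code): metrics that are undefined for a node or trial
# (zero denominator) are stored as 100 and stripped with
# filter(lambda a: a != 100, ...) before averaging, so undefined cases do not
# bias the per-node and per-trial means.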
    def write_data_matrix(self, data_matrix, output_fname, strain_id_list,
                          snp_id_list, snp_id2acc, with_header_line,
                          nt_alphabet, strain_id2acc=None,
                          strain_id2category=None,
                          rows_to_be_tossed_out=Set(),
                          strain_id2other_info=None, discard_all_NA_strain=0,
                          predefined_header_row=[
                              'strain', 'duplicate', 'latitude', 'longitude',
                              'nativename', 'stockparent', 'site', 'country'
                          ]):
        """
        2008-05-08
            defunct, use write_data_matrix from pymodule
        2007-02-19
            if strain_id2acc is available, translate strain_id into
            strain_acc; if strain_id2category is available, add 'category'
        2007-02-25
            if one strain's SNP row is all NA, it'll be skipped
        2007-02-25
            add argument rows_to_be_tossed_out
        2007-09-23
            add discard_all_NA_strain
        2007-10-22
            add no_of_all_NA_rows
        2007-12-13
            add predefined_header_row
        2007-12-16
            add 'duplicate' into predefined_header_row
        """
        sys.stderr.write("Writing data_matrix ...")
        no_of_all_NA_rows = 0
        writer = csv.writer(open(output_fname, 'w'), delimiter='\t')
        if with_header_line:
            header_row = [predefined_header_row[0]]
            if strain_id2category:
                header_row.append(predefined_header_row[1])
            if strain_id2other_info:
                no_of_fields = len(strain_id2other_info.values()[0])  # 2007-12-13
                for i in range(no_of_fields):
                    header_row.append(predefined_header_row[2 + i])
            for snp_id in snp_id_list:
                header_row.append(snp_id2acc[snp_id])
            writer.writerow(header_row)
        for i in range(len(data_matrix)):
            if strain_id2acc:
                new_row = [strain_id2acc[strain_id_list[i]]]
            else:
                new_row = [strain_id_list[i]]
            if strain_id2category:
                new_row.append(strain_id2category[strain_id_list[i]])
            if strain_id2other_info:
                new_row += strain_id2other_info[strain_id_list[i]]
            if discard_all_NA_strain and \
                    sum(data_matrix[i] == 0) == data_matrix.shape[1]:
                no_of_all_NA_rows += 1
                continue
            elif i not in rows_to_be_tossed_out:  # 2007-02-25
                for j in data_matrix[i]:
                    if nt_alphabet:
                        j = number2nt[j]
                    new_row.append(j)
                writer.writerow(new_row)
        del writer
        sys.stderr.write("%s all NA rows ." % no_of_all_NA_rows)
        sys.stderr.write("Done.\n")
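        # Hedged usage sketch (argument values are hypothetical, and the
        # docstring above marks this method defunct in favor of pymodule's
        # write_data_matrix):
        # self.write_data_matrix(data_matrix, 'output.tsv', strain_id_list,
        #                        snp_id_list, snp_id2acc, with_header_line=1,
        #                        nt_alphabet=1)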
        self.href = Href(str(item.uri))
        # The original called str() before the None check, so the check could
        # never fire; test the raw target first, then stringify.
        subject = model.get_target(item, dc.subject)
        self.tags = Set([])
        if subject is not None:
            self.tags = Set([Tag(x) for x in str(subject).split(" ")])
        if tag is not None:
            self.tags.add(tag)


if __name__ == '__main__':
    import sys
    from sets import Set
    username = sys.argv[1]
    user = User(username)
    tags = Set()
    users = {}
    print("Reading " + username + " posts...")
    for post in user:
        for tag in post.tags:
            tags.add(tag)
        other_tags = Set()
        count = 0
        for other_post in post.href:
            u = other_post.user
            if not u == user:
                count += 1
                if u not in users:
                    users[u] = []
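                # Hedged continuation (the original snippet ends right after
                # initializing users[u]): record the overlapping post.
                users[u].append(other_post)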
import re
import operator
import os
import pprint
from itertools import islice

from json_utils import load_json
from sets import Set


def take(n, iterable):
    return list(islice(iterable, n))


allowed_chars = Set(
    '0123456789abcdefghijklmnopqrstuvwxyz'
    'ABCDEFGHIJKLMNOPQRSTUVWXYZ_')
master_count = {}


def add_hit(channel, hit_type, hit):
    if not master_count.get(channel):
        master_count[channel] = {
            'emojis': {},
            'emojis_reactions': {},
        }
    if not master_count.get(channel).get(hit_type).get(hit):
        master_count[channel][hit_type][hit] = 0
    master_count[channel][hit_type][hit] += 1


def filter_emojis(text):
    return Set(text.replace(':', '')).issubset(allowed_chars) and len(text) > 2
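# Hedged usage sketch (channel and emoji names invented for illustration):
# tally one custom emoji if its name passes the character filter.
if filter_emojis(':party_parrot:'):
    add_hit('#general', 'emojis', 'party_parrot')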
import datetime
import random
from sets import Set

# NOTE: this function also relies on module-level state defined elsewhere:
# nr_words_to_follow, log_file_n, nr_trials_check, nr_trials_re_init,
# neighbor_range_swap, to_file_trials, global_index, grid_f,
# stop_condition(), init_closest(), and stats_to_file().


def puzzle(grid_size, old_grid_size, nr_words):
    print "NR WORDS", nr_words
    iter = 0
    png_nr = 0
    # log_file = open(output_directory + r"\log.txt", "w")

    # sample words to follow
    follow_inds = Set()
    while len(follow_inds) < nr_words_to_follow:
        follow_inds.add(random.randrange(nr_words))
    follow_inds = list(follow_inds)
    # log_file.write("Indexes:")
    # for key, value in global_index.iteritems():
    #     log_file.write(str(key) + " " + value.name + " " + str(value.id) + "\n")

    print "\n========\nSTART PUZZLING\n===========\n"
    log_file = open(log_file_n, 'a')
    log_file.write("\n\n========\nSTART PUZZLING\n===========\n\n")
    log_file.close()

    trial_nr = 0
    nr_inits = 0
    elem_indexes = range(nr_words)
    grid_size = int(grid_size)
    while not stop_condition(trial_nr):
        for i in range(nr_trials_check):
            if trial_nr % 5 == 0:
                print "\nTRIAL", trial_nr, datetime.datetime.now()
                log_file = open(log_file_n, 'a')
                log_file.write("TRIAL " + str(trial_nr) + " " +
                               str(datetime.datetime.now()) + "\n")
                log_file.close()
            # check if you need to reinitialize
            if iter % (nr_words * nr_trials_re_init) == 0 or iter == 0:
                nr_inits += 1
                print "\ninit closest trial", trial_nr, " start:", datetime.datetime.now()
                log_file = open(log_file_n, 'a')
                log_file.write("init closest at " +
                               str(datetime.datetime.now()) + "\n")
                log_file.close()
                init_closest(grid_size, old_grid_size, iter == 0)
                print "init closest stop:", datetime.datetime.now()
                log_file = open(log_file_n, 'a')
                log_file.write("stop init closest at " +
                               str(datetime.datetime.now()) + "\n")
                log_file.close()
                # log_file.write("INITIALIZED\n\n")
                # print_all_lists(str(trial_nr))
            if iter == 0:
                stats_to_file("FIRST", trial_nr, follow_inds, nr_inits,
                              grid_size, png_nr)
                png_nr += 1
            # pick elements in random order
            random.shuffle(elem_indexes)
            for elem_i in elem_indexes:
                [x, y] = list(global_index[elem_i].pos)
                if iter % 5000 == 0:
                    print "iter", iter,
                swap_value = float("-inf")
                # check with which neighbor it wants to swap
                for dx in range(neighbor_range_swap[0], neighbor_range_swap[1]):
                    for dy in range(neighbor_range_swap[0], neighbor_range_swap[1]):
                        if x + dx >= 0 and x + dx < grid_size and \
                                y + dy >= 0 and y + dy < grid_size:
                            # the swap gain counts both elements when the
                            # target cell is occupied, else just the mover's
                            if grid_f[x + dx][y + dy] != None:
                                v = grid_f[x][y].get_improvement(x + dx, y + dy) + \
                                    grid_f[x + dx][y + dy].get_improvement(x, y)
                            else:
                                v = grid_f[x][y].get_improvement(x + dx, y + dy)
                            if v > swap_value:
                                swap_value = v
                                swap_x = x + dx
                                swap_y = y + dy
                if swap_value > 0:
                    xy = grid_f[x][y]
                    xy_swap = grid_f[swap_x][swap_y]
                    grid_f[x][y] = xy_swap
                    grid_f[swap_x][swap_y] = xy
                    xy.change_pos(swap_x, swap_y)
                    if xy_swap != None:
                        xy_swap.change_pos(x, y)
                elif swap_value == float("-inf"):
                    print "-inf"
                iter += 1
            # figures and stats to file
            if trial_nr % to_file_trials == 0 and trial_nr != 0:
                stats_to_file(iter, trial_nr, follow_inds, nr_inits,
                              grid_size, png_nr)
                png_nr += 1
            trial_nr += 1
    stats_to_file("LAST", trial_nr, follow_inds, nr_inits, grid_size, png_nr)
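# A hedged reading of the swap criterion above: for each word at (x, y) the
# loop scores every in-range neighbor cell as
#     v = improvement(word -> cell) + improvement(occupant -> (x, y))
# and performs the best swap only when v > 0, i.e. when relocating strictly
# improves the combined placement score.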
# (The first statement of this excerpt is cut off upstream; the trailing
# fragment is kept as found.)
#     ... hg.interval_list([i[0]]).intersection(amplist) +
#         hg.interval_list([i[0]]).intersection(rdList)) > 0])
rdList = hg.interval_list([
    hg.interval(i.chrom, max(0, i.start - 10000),
                min(i.end + 10000, hg.chrLen[hg.chrNum(i.chrom)]))
    for i in rdList
])
iout = open(outName + '.integration_search.out', 'w')
iout.write(mystdout.getvalue())
iout.close()
sys.stdout = old_stdout
irdhops = []
irddict = {}
irdSets = Set([Set([ird]) for ird in rdList])
irdgroupdict = {ird: Set([ird]) for ird in rdList}
if args.extendmode == 'EXPLORE' or args.extendmode == 'VIRAL':
    for ird in rdList:
        logging.info("#TIME " + '%.3f\t' % (time() - TSTART) +
                     "Exploring interval: " + str(ird))
        old_stdout = sys.stdout
        sys.stdout = mystdout = StringIO()
        ilist = bamFileb2b.interval_hops(ird)
        irdhops.append((ird, ilist))
        for i in ilist:
            irddict[i] = ird
        iout = open(outName + '.' + ird.chrom + ":" + str(ird.start) + '-' +
                    str(ird.end) + '.out', 'w')
        iout.write(mystdout.getvalue())
def check_one_file(a, d1, dx1, FS, threshold, file_input, view_strings=False,
                   new=True, library=True):
    d2 = None
    ret_type = androconf.is_android(file_input)
    if ret_type == "APK":
        a = apk.APK(file_input)
        d2 = dvm.DalvikVMFormat(a.get_dex())
    elif ret_type == "DEX":
        d2 = dvm.DalvikVMFormat(read(file_input))

    if d2 == None:
        return

    dx2 = analysis.VMAnalysis(d2)
    el = elsim.Elsim(ProxyDalvik(d1, dx1), ProxyDalvik(d2, dx2), FS,
                     threshold, options.compressor, libnative=library)
    el.show()

    print "\t--> methods: %f%% of similarities" % el.get_similarity_value(new)

    if options.dump:
        print '\nDumping smali code...'
        tmp1 = options.input[1].split('/')
        jarname = tmp1[len(tmp1) - 1]
        if not os.path.exists('smali'):
            os.makedirs('smali')
        os.system('apktool d ' + options.input[1])
        if jarname[len(jarname) - 4:len(jarname)] == '.apk':
            os.system('mv -f ' + jarname[0:len(jarname) - 4] + ' smali')
        else:
            os.system('mv -f ' + jarname + '.out ' + 'smali')

        # collect the classes touched by similar and new methods
        classes = Set([])
        diff_methods = el.get_similar_elements()
        for i in diff_methods:
            x = el.show_similar_class_name(i)
            for j in range(0, len(x)):
                classes.add(x.pop())
        new_methods = el.get_new_elements()
        for i in new_methods:
            y = el.show_new_class_name(i)
            classes.add(y)

        if not os.path.exists('codedump'):
            os.makedirs('codedump')
        os.chdir('codedump')
        if os.path.exists(jarname):
            os.system('rm -rf ' + jarname)
        os.makedirs(jarname)
        os.chdir('..')
        for i in range(0, len(classes)):
            # os.makedirs('codedump/' + jarname)
            filepath = classes.pop()
            filename = filepath.replace('/', '.')
            shutil.copy2('smali/' + jarname + '.out/smali/' + filepath,
                         'codedump/' + jarname + '/' + filename)
        os.system('rmdir codedump/' + jarname)

        # collect the individual similar and new method names
        classes1 = Set([])
        for i in diff_methods:
            x = el.show_similar_method_name(i)
            for j in range(0, len(x)):
                classes1.add(x.pop())
        for i in new_methods:
            y = el.show_new_method_name(i)
            classes1.add(y)

        start = ''
        end = '.end method'
        if not os.path.exists('methoddump'):
            os.makedirs('methoddump')
        for i in range(0, len(classes1)):
            x1 = classes1.pop()
            xx = x1.split(' ', 1)
            if not os.path.exists('methoddump/' + jarname):
                os.makedirs('methoddump/' + jarname)
            with open('codedump/' + jarname + '/' + xx[0]) as infile:
                for line in infile:
                    if xx[1] in line:
                        start = line.replace('\n', '')
                        break
            med = xx[1].split('(', 1)[0]
            with open('codedump/' + jarname + '/' + xx[0]) as infile, \
                    open('methoddump/' + jarname + '/' + xx[0] + '.' + med +
                         '.method', 'w+') as outfile:
                copy = False
                outfile.write(start + '\n')
                for line1 in infile:
                    if line1.strip() == start:
                        copy = True
                    elif line1.strip() == end:
                        copy = False
                    elif copy:
                        outfile.write(line1)
                outfile.write(end)
        print 'DUMP SMALI CODE SUCCESSFULLY.'

    if options.display:
        print "SIMILAR methods:"
        diff_methods = el.get_similar_elements()
        for i in diff_methods:
            el.show_element(i)
        print "IDENTICAL methods:"
        new_methods = el.get_identical_elements()
        for i in new_methods:
            el.show_element(i)
        print "NEW methods:"
        new_methods = el.get_new_elements()
        for i in new_methods:
            el.show_element(i, False)
        print "DELETED methods:"
        del_methods = el.get_deleted_elements()
        for i in del_methods:
            el.show_element(i)
        print "SKIPPED methods:"
        skipped_methods = el.get_skipped_elements()
        for i in skipped_methods:
            el.show_element(i)

    if view_strings:
        els = elsim.Elsim(ProxyDalvikStringMultiple(d1, dx1),
                          ProxyDalvikStringMultiple(d2, dx2),
                          FILTERS_DALVIK_SIM_STRING, threshold,
                          options.compressor, libnative=library)
        # els = elsim.Elsim(ProxyDalvikStringOne(d1, dx1),
        #                   ProxyDalvikStringOne(d2, dx2),
        #                   FILTERS_DALVIK_SIM_STRING, threshold,
        #                   options.compressor, libnative=library)
        els.show()
        print "\t--> strings: %f%% of similarities" % els.get_similarity_value(new)

        if options.display:
            print "SIMILAR strings:"
            diff_strings = els.get_similar_elements()
            for i in diff_strings:
                els.show_element(i)
            print "IDENTICAL strings:"
            new_strings = els.get_identical_elements()
            for i in new_strings:
                els.show_element(i)
            print "NEW strings:"
            new_strings = els.get_new_elements()
            for i in new_strings:
                els.show_element(i, False)
            print "DELETED strings:"
            del_strings = els.get_deleted_elements()
            for i in del_strings:
                els.show_element(i)
            print "SKIPPED strings:"
            skipped_strings = els.get_skipped_elements()
            for i in skipped_strings:
                els.show_element(i)
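# Hedged invocation sketch (argument values are hypothetical; d1/dx1 would
# come from loading the reference APK or DEX earlier in the script, and the
# filter constant is an assumption mirroring FILTERS_DALVIK_SIM_STRING above):
# check_one_file(None, d1, dx1, FILTERS_DALVIK_SIM, 60, 'other.apk',
#                view_strings=True)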
from math import sqrt
from sets import Set


def sod(n):
    # sum of proper divisors of n; the original looped over
    # range(2, int(sqrt(n))), which skips the divisor pair at
    # i == int(sqrt(n)) for non-squares (e.g. it missed 4 for n = 12)
    total = 1
    i = 2
    while i * i < n:
        if n % i == 0:
            total += i + n / i
        i += 1
    if i * i == n:
        total += i
    return total


l = [0 for i in range(10000)]
ans = []
for i in range(1, 10000):
    l[i] = sod(i)
    # an amicable pair (a, b) with b = sod(a) < a satisfies sod(b) == a
    if l[i] < i and l[l[i]] == i:
        ans.append(i)
        ans.append(l[i])

ans = Set(ans)
print ans
x = sum(ans)
print x
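# Quick hedged sanity check (not part of the original): 220/284 is the
# classic amicable pair, so both should appear in the result set.
assert 220 in ans and 284 in ans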
    def get_years(self):
        years = Set()
        incidents = Incident.objects.all()
        for inc in incidents:
            years.add(inc.year)
        return years
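    # A hedged alternative (assuming Incident.objects is a Django queryset,
    # as the .objects.all() above suggests): let the database deduplicate
    # instead of pulling every Incident into Python.
    # def get_years(self):
    #     return Set(Incident.objects.values_list('year', flat=True).distinct())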
import os
from sets import Set


def deaccent(s):
    # fold Greek tonos accents and final sigma to bare letters
    return s \
        .replace(u'ά', u'α') \
        .replace(u'έ', u'ε') \
        .replace(u'ή', u'η') \
        .replace(u'ί', u'ι') \
        .replace(u'ό', u'ο') \
        .replace(u'ύ', u'υ') \
        .replace(u'ώ', u'ω') \
        .replace(u'ς', u'σ')


crawlerdir = os.environ['CRAWLERDIR']


def load_word_set(filename):
    # The original repeated this read-normalize-add loop verbatim for each
    # word list; it is factored into a helper with identical behavior.
    words = Set()
    with open(crawlerdir + "greekdata/" + filename, "r") as f:
        for line in f:
            words.add(deaccent(unicode(line, 'utf-8').strip().lower()))
    return words


expletives = load_word_set("expletives")
articles = load_word_set("articles")
pronouns = load_word_set("pronouns")
locations = load_word_set("locations")
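# Usage example for deaccent, following directly from the replacements above:
assert deaccent(u'ής') == u'ησ'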
def feat_extr_ngram(row_id_str, hdfs_dir_list, hdfs_feat_dir, model_data_folder , sp_master, spark_rdd_compress, spark_driver_maxResultSize, sp_exe_memory, sp_core_max , zipout_dir, zipcode_dir, zip_file_name , mongo_tuples, fromweb, label_arr, metadata_count,label_idx,data_idx, pattern_str, ln_delimitor, data_field_list, jkey_dict , jobname, num_gram, feature_count_threshold, token_dict=None, HDFS_RETR_DIR=None, remove_duplicated="N" , cust_featuring=None, cust_featuring_params=None, local_out_dir=None, filter_ratio=None ): # zip func in other files for Spark workers ================= ================ zip_file_path=ml_util.ml_build_zip_file(zipout_dir, zipcode_dir, zip_file_name, user_custom=cust_featuring) # get_spark_context sc=ml_util.ml_get_spark_context(sp_master , spark_rdd_compress , spark_driver_maxResultSize , sp_exe_memory , sp_core_max , jobname , [zip_file_path]) # log time ================================================================ ================ t0 = time() # input filename input_filename="*" ext_type='.gz' gz_list=None convert2dirty="N" if not ',' in hdfs_dir_list: # single dir having *.gz ==== ========= # read raw data from HDFS as .gz format ========== rdd_files=os.path.join(hdfs_dir_list, input_filename+ext_type) # check if gz files in hdfs ============ try: gz_list=hdfs.ls(hdfs_dir_list) print "INFO: check hdfs folder=",hdfs_dir_list except IOError as e: print "WARNING: I/O error({0}): {1}".format(e.errno, e.strerror) except: print "WARNING: Error at checking HDFS file:", sys.exc_info()[0] # use whole folder if gz_list is None or len(gz_list)==0: rdd_files=hdfs_dir_list print "ERROR: No file found by ",input_filename+ext_type #,", use",hdfs_dir_list,"instead" return -2 else: # multiple dirs ==== ========= rdd_files="" cnt=0 temp_lbl_list=[] comma="" print "INFO: before label_arr=",label_arr # check each folder for dr in hdfs_dir_list.split(','): print "****=",dr if not len(dr)>0: continue try: # remove space etc. 
dr=dr.strip() fdr=os.path.join(HDFS_RETR_DIR, dr) print "fdr=",fdr # ls didn't like "*" if '*' in fdr: #gz_list=hdfs.ls(fdr.replace("*","")) dn=os.path.dirname(fdr).strip() bn=os.path.basename(fdr).strip() #print "dn=",dn,",bn=",bn # get all names under folder and do filtering gz_list=fnmatch.filter(hdfs.ls(dn), '*'+bn) #print "gz_list=",gz_list else: gz_list=hdfs.ls(fdr) cnt=cnt+len(gz_list) if len(gz_list)>0: rdd_files=rdd_files+comma+fdr comma="," except IOError as e: print "WARNING: I/O error({0}): {1}".format(e.errno, e.strerror) except: print "WARNING: Error at checking HDFS file:", sys.exc_info()[0] # use whole folder if cnt is None or cnt==0: print "ERROR: No file found at",rdd_files return -2 else: print "INFO: total file count=",cnt # set convert flag only when multiple dir and label_arr has dirty label #if label_arr is None: # create label arr if None # label_arr=temp_lbl_list if not label_arr is None and len(label_arr)==2 and label_arr[1]=="dirty": convert2dirty="Y" print "INFO: rdd_files=",rdd_files txt_rdd=sc.textFile(rdd_files)#, use_unicode=False total_input_count=txt_rdd.count() print "INFO: Total input sample count=",total_input_count # debug only #for x in txt_rdd.collect(): # print "t=",x print "INFO: hdfs_dir_list=",hdfs_dir_list print "INFO: label_arr=",label_arr print "INFO: feature_count_threshold=",feature_count_threshold #jkey_dict={"meta_list":["label","md5","mdate"], "data_key":"logs"} # this dict depends on the format of input data if not data_field_list is None: jkey_dict=json.loads(jkey_dict) data_key=jkey_dict["data_key"] meta_list=jkey_dict["meta_list"] metadata_count=len(meta_list) data_idx=metadata_count print "INFO: jkey_dict=",jkey_dict print "INFO: meta_list=",meta_list print "INFO: data_key=",data_key print "INFO: data_field_list=",data_field_list print "INFO: metadata_count=",metadata_count featured_rdd = txt_rdd \ .map(lambda x: preprocess_json(x,meta_list,data_key,data_field_list)) \ .filter(lambda x: len(x) > metadata_count) \ .filter(lambda x: type(x[metadata_count]) is list) \ .map(lambda x: feature_extraction_ngram(x, data_idx, MAX_FEATURES, num_gram)) \ .filter(lambda x: len(x) > metadata_count) \ .filter(lambda x: type(x[metadata_count]) is dict) \ .filter(lambda x: type(x[metadata_count+1]) is dict) \ .filter(lambda x: len(x[metadata_count])> int(feature_count_threshold) ) \ .cache() #print "INFO: featured_rdd=" #for x in featured_rdd.collect(): # print "INFO: **** f=",x # user custom code for featuring ============================================= ========== # input txt_rdd format (string): each text row for each sample # output featured_rdd format (list):[meta-data1,meta-data2,..., hash_cnt_dic, hash_str_dic] elif not cust_featuring is None and len(cust_featuring)>0: user_module=None user_func=None user_func_dnn=None # load user module ======= try: modules = map(__import__, [CUSTOM_PREFIX+cust_featuring]) user_module=modules[0] user_func=getattr(user_module,CUSTOM_FUNC) except Exception as e: print "ERROR: user module error.", e.__doc__, e.message return -101 # prepare for dnn, output as feat in an array tmp_rdd = txt_rdd.map(lambda x: user_func(x, cust_featuring_params)) \ .filter(lambda x: len(x) > metadata_count) \ .filter(lambda x: type(x[metadata_count]) is list).cache() # for traditional ML, feat in a dict featured_rdd = tmp_rdd \ .map(lambda x: feature_extraction_ngram(x, data_idx, MAX_FEATURES, num_gram)) \ .filter(lambda x: len(x) > metadata_count) \ .filter(lambda x: type(x[metadata_count]) is dict) \ .filter(lambda x: 
type(x[metadata_count+1]) is dict) \ .filter(lambda x: len(x[metadata_count])> int(feature_count_threshold) ) \ .cache() all_hashes_cnt_dic=None all_hash_str_dic=None all_hashes_seq_dic = None else: print "INFO: pattern_str=",pattern_str+"<--" print "INFO: ln_delimitor=",ln_delimitor+"<--" print "INFO: label_idx=",label_idx print "INFO: data_idx=",data_idx print "INFO: metadata_count=",metadata_count print "INFO: filter_ratio=",filter_ratio # filter top and least percentage of feature if not filter_ratio is None and filter_ratio > 0 and filter_ratio <1: # check total count here before continue upper_cnt=total_input_count*(1-filter_ratio) lower_cnt=total_input_count*filter_ratio # set limit for lower bound. if total count is large, lower_cnt may exclude all features... # max lower count = min( MAX_FILTER_LOWER_CNT, total_input_count/100 ) if not MAX_FILTER_LOWER_CNT is None and lower_cnt > MAX_FILTER_LOWER_CNT: if MAX_FILTER_LOWER_CNT > total_input_count/100: lower_cnt=total_input_count/100 else: lower_cnt=MAX_FILTER_LOWER_CNT print "INFO: filtering by count, upper bound=",upper_cnt,",lower bound=",lower_cnt # find unique feature, count them, remove them if in highest and lowest % and then create a dict f_feat_set = Set (txt_rdd.map(lambda x:x.split(ln_delimitor)).flatMap(lambda x:Set(x[metadata_count:])) \ .map(lambda x:(x,1)).reduceByKey(lambda a, b: a + b) \ .filter(lambda x:x[1]<= upper_cnt and x[1]>= lower_cnt) \ .map(lambda x:x[0]).collect() ) print "INFO: f_feat_set len=",len(f_feat_set) broadcast_f_set = sc.broadcast(f_feat_set) #txt_rdd=txt_rdd.map(lambda x: filter_by_list(x, metadata_count,ln_delimitor, broadcast_f_list.value )) txt_rdd=txt_rdd.map(lambda x: x.split(ln_delimitor)) \ .map(lambda x: x[:metadata_count]+ [w for w in x[metadata_count:] if w and w in broadcast_f_set.value]) \ .map(lambda x: ln_delimitor.join(x)) # preprocess by pattern matching and then extract n-gram features #.encode('UTF8') # input txt_rdd format (string): meta-data1\tmeta-data2\t...\tdataline1\tdataline2\t...datalineN\n # output featured_rdd format (list):[meta-data1,meta-data2,..., hash_cnt_dic, hash_str_dic] # hash_cnt_dic: {hash,hash:count,...} hash_str_dic: {hash: 'str1',... } tmp_rdd = txt_rdd \ .map(lambda x: preprocess_pattern(x, metadata_count, pattern_str, ln_delimitor \ , label_idx, label_arr, convert2dirty )) \ .filter(lambda x: len(x) > metadata_count) \ .filter(lambda x: type(x[metadata_count]) is list) #.cache() memory issue... #tmp_rdd_count=tmp_rdd.count() #print "INFO: After preprocessing count=",tmp_rdd_count featured_rdd = tmp_rdd \ .map(lambda x: feature_extraction_ngram(x, data_idx, MAX_FEATURES, num_gram)) \ .filter(lambda x: len(x) > metadata_count) \ .filter(lambda x: type(x[metadata_count]) is dict) \ .filter(lambda x: type(x[metadata_count+1]) is dict) \ .filter(lambda x: len(x[metadata_count])> int(feature_count_threshold) ) \ .cache() #feat_rdd_count=featured_rdd.count() #print "INFO: After featuring count=",feat_rdd_count all_hashes_cnt_dic=None all_hash_str_dic=None all_hashes_seq_dic = None #get all hashes and total occurring count =============== # all_hashes_cnt_dic: {'hash,hash': total count,... } if all_hashes_cnt_dic is None: #all_hashes_cnt_dic = featured_rdd.map(lambda x: x[metadata_count]).reduce(lambda a, b: combine_dic_cnt(a, b)) all_hashes_cnt_dic = dict(featured_rdd.flatMap(lambda x: x[metadata_count].items()).reduceByKey(lambda a, b: a + b).collect()) #get all hashes and their extracted string =============== # all_hash_str_dic: {hash:'str1', ... 
if all_hash_str_dic is None: #all_hash_str_dic = featured_rdd.map(lambda x: x[metadata_count+1]).reduce(lambda a, b: combine_dic(a, b)) all_hash_str_dic = dict(featured_rdd.flatMap(lambda x: x[metadata_count+1].items()).distinct().collect()) # get all labels into an array =============== provided by parameter? if label_arr is None: # will force "clean" be 0 here label_arr=sorted(featured_rdd.map(lambda x: x[label_idx].lower()).distinct().collect()) # debug only print "INFO: label_arr=",json.dumps(sorted(label_arr)) # save labels to hdfs as text file==================================== ============ hdfs_folder = hdfs_feat_dir #+ "/" # "/" is needed to create the folder correctly print "INFO: hdfs_folder=", hdfs_folder try: hdfs.mkdir(hdfs_folder) except IOError as e: print "WARNING: I/O error({0}): {1}".format(e.errno, e.strerror) except: print "WARNING: Unexpected error at mkdir:", sys.exc_info()[0] # clean up metadata_file metadata_file = os.path.join(hdfs_folder , metadata) #"metadata" print "INFO: metadata_file=", metadata_file try: hdfs.rmr(metadata_file) except IOError as e: print "WARNING: I/O error({0}): {1}".format(e.errno, e.strerror) except: print "WARNING: Unexpected error at rmr():", sys.exc_info()[0] sc.parallelize(label_arr,1).saveAsTextFile(metadata_file) #remap all hash values to continuous key/feature number ============== # all_hashes_seq_dic: { hash : sequential_numb } if all_hashes_seq_dic is None: all_hashes_seq_dic={} remap2seq(all_hashes_cnt_dic, all_hashes_seq_dic) #all_hashes_seq_dic has continuous key number #print "all_hashes_seq_dic=",all_hashes_seq_dic total_feature_numb=len(all_hashes_seq_dic) print "INFO: Total feature count=", len(all_hashes_seq_dic) # featured_rdd (list): [meta-data1,meta-data2,..., hash_cnt_dic, hash_str_dic] # seq_featured_rdd(list): [meta-data1,meta-data2,..., hash_cnthsh_dict, hash_str_dic] (feat id in sorted sequence) # hash_cnt_dic: {hash: count} hash_str_dic: {hash: 'str1,str2...' 
} # set binary_flag to True, all feature:value will be 1
broadcast_dic = sc.broadcast(all_hashes_seq_dic)
seq_featured_rdd = featured_rdd.map(lambda x: convert2seq(x, label_idx, data_idx, broadcast_dic.value, binary_flag=True)).cache()

# get hash_cnthsh_dict then flatMap and reduce to (feat id, count)
ct_rdd = seq_featured_rdd.flatMap(lambda x: [(i[0], i[1]) for i in x[data_idx].iteritems()]).reduceByKey(lambda a, b: a + b)
# sorted by feature id as int
feat_sample_count_arr = ct_rdd.sortBy(lambda x: int(x[0])).map(lambda x: x[1]).collect()
# sort after collect may fail when rdd is huge
#feat_sample_count_arr=[]
#for i in sorted(ct_rdd.collect(), key=lambda t: int(t[0])):
#    feat_sample_count_arr.append(i[1])
print "INFO: feat_sample_count_arr len=", len(feat_sample_count_arr)

# save feat_sample_count_arr data ==================================== ============
filter = '{"rid":' + row_id_str + ',"key":"feat_sample_count_arr"}'
upsert_flag = True
jo_insert = {}
jo_insert["rid"] = eval(row_id_str)
jo_insert["key"] = "feat_sample_count_arr"
jo_insert["value"] = feat_sample_count_arr
jstr_insert = json.dumps(jo_insert)
ret = query_mongo.upsert_doc_t(mongo_tuples, filter, jstr_insert, upsert_flag)
print "INFO: Upsert count for feat_sample_count_arr=", ret

# insert failed, save to local
if ret == 0:
    # drop old record in mongo
    ret = query_mongo.delete_many(mongo_tuples, None, filter)
    # make sure the per-row output directory exists (the pickle path below
    # includes the row_id_str subfolder)
    out_dir = os.path.join(local_out_dir, row_id_str)
    if not os.path.exists(out_dir):
        os.makedirs(out_dir)
    fsca_hs = os.path.join(out_dir, row_id_str + "_feat_sample_count_arr.pkl")
    print "WARNING: save feat_sample_count_arr to local"
    ml_util.ml_pickle_save(feat_sample_count_arr, fsca_hs)

# save feature data; TBD. not used. ==================================== ============
#libsvm_rdd=seq_featured_rdd.map(lambda x: convert2libsvm(x,label_idx,data_idx,label_arr))
# put hash to the front of each row, assume hash is after label
libsvm_rdd = seq_featured_rdd.map(lambda x: x[label_idx + 1] + " " + convert2libsvm(x, label_idx, data_idx, label_arr))

# debug only
#print "libsvm_rdd="
#for i in libsvm_rdd.collect():
#    print i

# get rdd statistics info
stats = featured_rdd.map(lambda p: len(p[metadata_count])).stats()
feat_count_max = stats.max()
feat_count_stdev = stats.stdev()
feat_count_mean = stats.mean()
sample_count = stats.count()
print "INFO: libsvm data: sample count=", sample_count, ",Feat count mean=", feat_count_mean, ",Stdev=", feat_count_stdev
print "INFO: ,max feature count=", feat_count_max

# find sample count by label
lbl_arr = featured_rdd.map(lambda x: (x[label_idx], 1)).reduceByKey(add).collect()
print "INFO: Sample count by label=", lbl_arr

# remove duplicated libsvm string; only keep the first duplicated item, assume space following key_idx
if remove_duplicated == "Y":
    libsvm_rdd = libsvm_rdd \
        .map(lambda x: (','.join(x.split(' ')[metadata_count:]), x)) \
        .groupByKey().map(lambda x: list(x[1])[0]) \
        .cache()
    cnt_list = libsvm_rdd.map(lambda x: (x.split(' ')[1], 1)).reduceByKey(add).collect()
    stats = libsvm_rdd.map(lambda x: len(x.split(' ')[metadata_count:])).stats()
    feat_count_max = stats.max()
    feat_count_stdev = stats.stdev()
    feat_count_mean = stats.mean()
    sample_count = stats.count()
    print "INFO: Non-Duplicated libsvm data: sample count=", sample_count, ",Feat count mean=", feat_count_mean, ",Stdev=", feat_count_stdev
    print "INFO: ,max feature count=", feat_count_max
    print "INFO: Non-Duplicated Label count list=", cnt_list

# save libsvm data ==================================== ============
libsvm_data_file = os.path.join(hdfs_folder, libsvm_alldata_filename)  #"libsvm_data"
print "INFO: libsvm_data_file=", libsvm_data_file
# define dnn_data_file before the clean-up below; it was previously referenced
# here while still undefined, raising a NameError that the bare except swallowed
dnn_data_file = os.path.join(hdfs_folder, dnn_alldata_filename)  #"dnn_data"
try:
    #hdfs.ls(save_dir)
    #print "find hdfs folder"
    hdfs.rmr(libsvm_data_file)
    if num_gram == 1:
        hdfs.rmr(dnn_data_file)
    #print "all files removed"
except IOError as e:
    print "WARNING: I/O error({0}): {1} at libsvm_data_file clean up".format(e.errno, e.strerror)
except:
    print "WARNING: Unexpected error at libsvm file clean up:", sys.exc_info()[0]

#codec = "org.apache.hadoop.io.compress.GzipCodec"
#libsvm_rdd.saveAsTextFile(libsvm_data_file, codec)
libsvm_rdd.saveAsTextFile(libsvm_data_file)  # TBD encrypted

feat_count_file = libsvm_data_file + "_feat_count"
print "INFO: feat_count_file=", feat_count_file
try:
    hdfs.rmr(feat_count_file)
except IOError as e:
    print "WARNING: I/O error({0}): {1} at feat_count clean up".format(e.errno, e.strerror)
except:
    print "WARNING: Unexpected error at libsvm feature count clean up:", sys.exc_info()[0]
sc.parallelize([total_feature_numb], 1).saveAsTextFile(feat_count_file)

label_dic = {}
# assign each label a number
for idx, label in enumerate(sorted(label_arr)):
    if not label in label_dic:
        label_dic[label] = idx  # starting from 0, value = idx, e.g., clean:0, dirty:1

# output text for DNN:[meta-data1,meta-data2,..., [feature tokens]] ================= DNN ===========
if num_gram == 1:  # special flag to tokenize and keep input orders
    print "INFO: processing data for DNN..."
    # create token dict
    # str_hash_dict: string to hash
    # all_hashes_seq_dic: hash to seq id
    if token_dict is None or len(token_dict) == 0:
        token_dict = {}
        str_hash_dict = {v: k for k, v in all_hash_str_dic.iteritems()}
        for k, v in str_hash_dict.iteritems():
            token_dict[k] = int(all_hashes_seq_dic[str(v)])
    #print "token_dict=",len(token_dict),token_dict
    dnn_rdd = tmp_rdd \
        .map(lambda x: tokenize_by_dict(x, data_idx, token_dict, label_idx, label_dic)) \
        .filter(lambda x: len(x) > metadata_count) \
        .filter(lambda x: type(x[metadata_count]) is list)  #.cache()
    # filter duplication here
    #print dnn_rdd.take(3)
    print "INFO: dnn_data_file=", dnn_data_file
    try:
        hdfs.rmr(dnn_data_file)
    except IOError as e:
        print "WARNING: I/O error({0}): {1} at dnn_data_file clean up".format(e.errno, e.strerror)
    except:
        print "WARNING: Unexpected error at dnn file clean up:", sys.exc_info()[0]
    try:
        dnn_rdd.saveAsTextFile(dnn_data_file)
    except:
        print "WARNING: Unexpected error at saving dnn data:", sys.exc_info()[0]
    try:
        stats = dnn_rdd.map(lambda p: len(p[metadata_count])).stats()
        feat_count_max = stats.max()
        feat_count_stdev = stats.stdev()
        feat_count_mean = stats.mean()
        sample_count = stats.count()
        print "INFO: DNN data: sample count=", sample_count, ",Feat count mean=", feat_count_mean, ",Stdev=", feat_count_stdev
        print "INFO: ,max feature count=", feat_count_max
    except:
        print "WARNING: Unexpected error at getting stats of dnn_rdd:", sys.exc_info()[0]

# clean up pca data in hdfs ============ ========================
pca_files = '*' + libsvm_alldata_filename + "_pca_*"
#print "INFO: pca_files=", pca_files
try:
    f_list = hdfs.ls(hdfs_folder)
    if len(f_list) > 0:
        df_list = fnmatch.filter(f_list, pca_files)
        for f in df_list:
            print "INFO: rm ", f
            hdfs.rmr(f)
except IOError as e:
    print "WARNING: I/O error({0}): {1}".format(e.errno, e.strerror)
except:
    print "WARNING: Unexpected error at libsvm pca file clean up:", sys.exc_info()[0]

# clean up pca data in web local ============ ========================
pca_fname = os.path.join(model_data_folder, row_id_str + '_pca_*.pkl*')
print "INFO: pca_fname=", pca_fname
try:
    for fl in glob.glob(pca_fname):
        print "INFO: remove ", fl
        os.remove(fl)
except OSError as e:
    # OSError exposes filename/strerror; e.pca_fname was not a real attribute
    print "Error: %s - %s." % (e.filename, e.strerror)
'''
Prior Audience Sample, to compare levels
0 = Weak
1 = Medium
2 = Strongest
'''

def _getSample():
    return random.randrange(MIN_VAL, MAX_VAL + 1)

prior = 1  # initialize the prior
jokesAndResponses = {}  # set of told jokes, and their response
jokesTold = 0
heuristics = Set()

'''
Main Function Here
'''

class heuristic(object):
    def __init__(self, t):
        self.type = t
        self.fails = 0
        self.prob = 100  # out of a hundred percent

    def getInfo(self):
        return [self.type, self.fails, self.prob]

    def getType(self):
def readGraph(file, n, p, mean, std_dev, PView, PShare, content_count):
    G1 = LoadEdgeList(PUNGraph, file, 0, 1)
    n = G1.GetNodes()
    CmtyVt = TCnComV()
    # Getting the Community
    CommunityCNM(G1, CmtyVt)
    nodes = TIntV()
    for N in G1.GetNI(0).GetOutEdges():
        nodes.Add(N)
    G1 = GetSubGraph(G1, nodes)
    # Drawing the original Community Graph
    #DrawGViz(G1, gvlDot, "graph1.png", "graph 1")
    Graph = {}
    for u in G1.Nodes():
        for v in u.GetOutEdges():
            if Graph.has_key(u.GetId()):
                Graph[u.GetId()].add(v)
            else:
                Graph.update({u.GetId(): Set([v])})
    # Initialize the probability vectors.
    for i in range(0, n):
        PView += [[0 for j in range(0, n)]]
        PShare += [[0 for j in range(0, n)]]
    # Populating the probability vectors.
    for v in G1.Nodes():
        for u in v.GetOutEdges():
            id_src = v.GetId()
            id_dst = u
            view_prob = np.random.binomial(n, p, 1)[0] / (n * 1.0)
            share_prob = np.random.binomial(n, p, 1)[0] / (n * 1.0)
            PView[id_dst][id_src] = view_prob
            PShare[id_dst][id_src] = share_prob
    # Content forest: each entry in this array is a forest for some content.
    content_forest = []
    # 4039 * 0.148
    # Number of content introduction points.
    content_intro_count = 60
    # Generating a forest for each content.
    for i in range(0, content_count):
        # Generating the random introduction points i.e. the users who introduce the content.
        random_sample = random.sample(range(0, n), content_intro_count)
        # Generating the forest for a content
        content_forest.append(BFS(Graph, random_sample, PView, PShare))
    new_Graph = {}
    weight = {}
    G2 = TNGraph.New()
    for i in range(0, n):
        G2.AddNode(i)
    # Generating the inferred graph
    for cf in content_forest:
        for e in cf.Edges():
            if new_Graph.has_key(e.GetSrcNId()) and e.GetDstNId() in new_Graph[e.GetSrcNId()]:
                weight[str(e.GetSrcNId()) + ',' + str(e.GetDstNId())] = weight[str(e.GetSrcNId()) + ',' + str(e.GetDstNId())] + 1
            elif new_Graph.has_key(e.GetSrcNId()):
                new_Graph[e.GetSrcNId()].add(e.GetDstNId())
                weight.update({str(e.GetSrcNId()) + ',' + str(e.GetDstNId()): 1})
                G2.AddEdge(e.GetSrcNId(), e.GetDstNId())
            else:
                new_Graph.update({e.GetSrcNId(): Set([e.GetDstNId()])})
                weight.update({str(e.GetSrcNId()) + ',' + str(e.GetDstNId()): 1})
                G2.AddEdge(e.GetSrcNId(), e.GetDstNId())
    # Sum of weights of all the neighbours of a vertex
    TWeight = {}
    for u in new_Graph.keys():
        total = 0
        for v in new_Graph[u]:
            total += weight[str(u) + ',' + str(v)]
        TWeight.update({u: total})
    # Calculating edge confidence
    for u in new_Graph.keys():
        for v in new_Graph[u]:
            weight[str(u) + ',' + str(v)] = weight[str(u) + ',' + str(v)] / (TWeight[u] * 1.0)
    # Calculating conf_threshold
    conf_threshold = mean + 0 * std_dev
    # Generating the graph whose edges have conf_value greater than conf_threshold
    G3 = TUNGraph.New()
    nodes = []
    for u in new_Graph.keys():
        for v in new_Graph[u]:
            if weight[str(u) + ',' + str(v)] >= conf_threshold:
                if u not in nodes:
                    G3.AddNode(u)
                    nodes += [u]
                if v not in nodes:
                    G3.AddNode(v)
                    nodes += [v]
                G3.AddEdge(u, v)
    # Drawing the inferred Graph
    DrawGViz(G3, gvlDot, "graph3.png", "graph 2")
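# Hedged sketch of the edge-confidence step above in plain Python, without SNAP:
# each edge weight (co-occurrence count) is divided by the total weight out of
# its source vertex, so the confidences leaving a vertex sum to 1. Toy data only.
def edge_confidence(weight):
    """weight: dict mapping 'u,v' -> count; returns dict of normalized confidences."""
    totals = {}
    for key, w in weight.items():
        u = key.split(',')[0]
        totals[u] = totals.get(u, 0) + w
    return {key: w / (totals[key.split(',')[0]] * 1.0)
            for key, w in weight.items()}

# e.g. edge_confidence({'0,1': 3, '0,2': 1}) -> {'0,1': 0.75, '0,2': 0.25}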
def __init__(self, h):
    self.allH = Set()
    for elm in h:
        self.allH.add(heuristic(elm))
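# Hypothetical usage of the constructor above (the enclosing class is not shown;
# it appears to be a container seeding one heuristic object per strategy name):
#   container = HeuristicContainer(['pun', 'sarcasm', 'observational'])
#   for h in container.allH:
#       print(h.getInfo())   # e.g. ['pun', 0, 100]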
class ExternalProgramTestSuite: """ A Class for creating Test Suites with test cases which call external programs """ # internal static variables _test_suites = {} _num_formatting_chars = 100 _all_log_files = Set() _has_run = False _framework_output_file = None # public static variables color_output_text = True suite_header_color = Fore.MAGENTA case_header_color = Fore.CYAN suite_result_header_color = Fore.YELLOW def __init__(self, **kwargs): # reset the suite variables self._set_suite_defaults() # test suite name is the name of the suite class # or the suite_name arg if passed if 'suite_name' in kwargs: if assert_variable_type(kwargs['suite_name'], str): self.suite_name = kwargs['suite_name'] else: # suite name defaults to class name self.suite_name = self.__class__.__name__ # add test suite to class static total list try: # make sure a suite with the same # name does not already exist if self.suite_name not in ExternalProgramTestSuite._test_suites: ExternalProgramTestSuite._test_suites[self.suite_name] = { 'self': self, 'name': self.suite_name, 'description': self.suite_description, 'args': kwargs, 'num_passed': 0, 'num_tests': 0, 'num_checks': 0, 'num_checks_passed': 0, 'execution_time': 0, 'has_run': False, 'pass_threshold': 100, 'passed': False } else: raise ValueError( 'A suite with the name "%s" already exists. ' 'Please rename one of suite classes or pass a unique "suite_name" argument to one or both of the constructors.' ) except ValueError as e: raise Exception('[%s] %s' % (type(e).__name__, e)) def log(self, print_string, error=False, color=Fore.RESET): """Wrapper over print function to allow writing test framework output to file if desired. """ # write the print output to the log files if self.log_framework_output: if error and self.stderr_file is not None: with open(self.stderr_file, 'a') as f: f.write(print_string + "\r\n") elif self.stdout_file is not None: with open(self.stdout_file, 'a') as f: f.write(print_string + "\r\n") # print the output and color appropriately if ExternalProgramTestSuite.color_output_text: print(color + print_string + Fore.RESET + Back.RESET + Style.RESET_ALL) # Aptana's interactive console doesn't accept ANSI escape # characters but at least it colors the stderr red so # separate normal output from error output appropriately else: if error: sys.stderr.write(print_string + "\r\n") sys.stderr.flush() else: sys.stdout.write(print_string + "\r\n") sys.stdout.flush() def _set_suite_defaults(self): """Set the suite variables to their defaults """ # set the default suite variables # default suite name and description self.suite_name = None self.suite_description = None # number of passed test cases self._num_tests_passed = 0 # num checks and failures self._total_checks_passed = 0 self._total_checks = 0 # threshold in percentage of tests # passed to decide status of suite self.suite_pass_threshold = 100 # whether to truncate the log file # before writing to it self.overwrite_log_file = True # whether to print process output # or just write it to the log file self.print_process_output = True self.log_framework_output = False # default log path self._default_log_file = "run.log" self.stdout_file = self.stderr_file = self._default_log_file # setup and teardown function self._suite_setup = None self._suite_teardown = None # timelimit values self._suite_timelimit_met = True self.suite_timelimit = None self.suite_case_timelimit = None # invalid args list self._invalid_args = [] def _set_case_defaults(self): """ Set the case variables to their defaults """ # 
default test case variables self._name = None # whether to print process output # or just write it to the log file self.print_case_output = self.print_process_output # default log path self.stdout_file = self.stderr_file = self._default_log_file # default case description self._description = None # num checks and failures self._num_checks_passed = 0 self._num_checks = 0 # threshold in percentage of checks # passed to decide status of case self.case_pass_threshold = 100 # test case time limit self._timelimit = self.suite_case_timelimit # wait to print case header self._wait_sem = 0 # fixture, setup, teardown self._fixture = None self._case_setup = None self._case_teardown = None def _setup_suite(self, **kwargs): """ Set the suite variables """ # if a test suite requires common variables across all test cases, # they can be passed through kwargs and are set here for key, value in kwargs.items(): if type(value) is str: exec('self.' + str(key) + '="' + value + '"') in globals(), locals() else: exec('self.' + str(key) + '=' + str(value)) in globals(), locals() # each function in a test suite class is a test case # so get the cases and add them to the testSuites list test_names = { key: value for key, value in self.__class__.__dict__.items() if isinstance(value, FunctionType) } self.test_cases = [] for name in test_names: if name == "setup": self._suite_setup = getattr(self, name) elif name == "teardown": self._suite_teardown = getattr(self, name) elif 'fixture' not in name.lower(): self.test_cases.append(name) def _setup_case(self): # if a suite has startted running and the overwrite log file # flag was set to True, truncate the log files if self.overwrite_log_file and ( not ExternalProgramTestSuite._has_run or len([ (x) for x in [self.stdout_file, self.stderr_file] if x not in ExternalProgramTestSuite._all_log_files ]) > 0): for log_file in [self.stdout_file, self.stderr_file]: ExternalProgramTestSuite._all_log_files.add(log_file) with open(log_file, 'w') as f: f.truncate(0) def _end_case(self): # call fixture teardown if set if self._case_teardown is not None: if isinstance(self._case_teardown, MethodType): self._case_teardown(self) else: self._case_teardown() def _validate_argument(self, argument, types): key = argument.keys()[0] valid, message = assert_variable_type(argument[key], types, False) if not valid: self._invalid_args.append("%s: %s" % (key, message)) def _validate_suite_arguments(self): """ Validate test suite argument types """ # reset invalid args list self._invalid_args = [] #string string_vars = [{ "suite_description": self.suite_description }, { "stdout_file": self.stdout_file }, { "stderr_file": self.stderr_file }] [self._validate_argument(x, [str, NoneType]) for x in string_vars] # bool bool_vars = [{ "overwrite_log_file": self.overwrite_log_file }, { "print_process_output": self.print_process_output }, { "log_framework_output": self.log_framework_output }] [self._validate_argument(x, bool) for x in bool_vars] # float float_vars = [{ "suite_timelimit": self.suite_timelimit }, { "suite_case_timelimit": self.suite_case_timelimit }] [ self._validate_argument(x, [int, float, NoneType]) for x in float_vars ] # functions function_vars = [{ "suite setup": self._suite_setup }, { "suite teardown": self._suite_teardown }] [ self._validate_argument(x, [MethodType, NoneType]) for x in function_vars ] # raise exception if any invalid args if len(self._invalid_args) > 0: raise InvalidArgument(('\r\n').join(self._invalid_args)) def _validate_test_arguments(self): """ Validate test case 
argument types """ # reset invalid args list self._invalid_args = [] #string string_vars = [{ 'description': self._description }, { 'name': self._name }] [self._validate_argument(x, [str, NoneType]) for x in string_vars] # float float_vars = [{'timelimit': self._timelimit}] [ self._validate_argument(x, [int, float, NoneType]) for x in float_vars ] # fixture try: if self._fixture is not None: self._case_setup, self._case_teardown = self._fixture() except Exception: self._invalid_args.append( 'a proper fixture returning a setup and teardown function was not provided' ) # functions (fixture override) function_vars = [{ 'case setup': self._case_setup }, { 'case teardown': self._case_teardown }] [ self._validate_argument(x, [FunctionType, MethodType, NoneType]) for x in function_vars ] # raise exception if any invalid args if len(self._invalid_args) > 0: raise InvalidArgument(('\r\n').join(self._invalid_args)) def run(self, suite_name=None): """ Run the test suite """ # capture start time suite_start_time = timeit.default_timer() # setup suite if suite_name is None: suite_name = self.suite_name self._setup_suite( **ExternalProgramTestSuite._test_suites[suite_name]['args']) # validate suite args try: self._validate_suite_arguments() except Exception as e: ExternalProgramTestSuite._test_suites[ self.suite_name]['has_run'] = True raise SuiteError('Error in test suite "%s" [%s] %s' % (suite_name, type(e).__name__, e)) # run all the test cases for index, case in enumerate(sorted(self.test_cases)): self.test_case = getattr(self, case) if not self.test_case: raise Exception("Test Case %s does not exist" % str(self.test_case)) # reset the default suite/case variables self._set_case_defaults() # set test case name to case self._name = case # suite setup routine self._setup_case() # print test suite name and descripion if any # and if first loop through cases if index == 0: self.log("=" * ExternalProgramTestSuite._num_formatting_chars) self.log("TEST SUITE: %s" % suite_name, False, ExternalProgramTestSuite.suite_header_color) if self.suite_description: self.log("Description: %s" % (self.suite_description)) ExternalProgramTestSuite._test_suites[suite_name][ 'description'] = self.suite_description # call suite setup function if set if self._suite_setup is not None: self._suite_setup() # run the test case try: self._run_test_case() except Exception as e: self.log('[%s] %s' % (type(e).__name__, e), True, Fore.RED) # set has_run flags ExternalProgramTestSuite._has_run = True # set suite attributes for static _test_suites list ExternalProgramTestSuite._test_suites[ self.suite_name]['has_run'] = True ExternalProgramTestSuite._test_suites[ self.suite_name]['pass_threshold'] = self.suite_pass_threshold # end case routine self._end_case() # capture suite end time suite_end_time = timeit.default_timer() suite_time_taken = suite_end_time - suite_start_time ExternalProgramTestSuite._test_suites[ self.suite_name]['execution_time'] = suite_time_taken # if a timelimit was set # check if it was met if self.suite_timelimit is not None: self.log("_" * ExternalProgramTestSuite._num_formatting_chars) if suite_time_taken <= self.suite_timelimit: self.log( 'CHECK PASS: suite completed before time limit of %.4f' % self.suite_timelimit, False, Back.GREEN) self._total_checks_passed += 1 else: self.log( 'CHECK FAIL: suite did not complete before time limit of %.4f' % self.suite_timelimit, True, Back.RED) self._suite_timelimit_met = False self._total_checks += 1 # call suite teardown function if set if self._suite_teardown is not 
None: self._suite_teardown() # print test result self._print_suite_results() def case_header(self): """ Test case header output """ # print case name self.log("-" * ExternalProgramTestSuite._num_formatting_chars) self.log("CASE: %s" % self._name, False, ExternalProgramTestSuite.case_header_color) # print description if any if self._description is not None: self.log("Description: %s" % (str(self._description))) self.log("-" * ExternalProgramTestSuite._num_formatting_chars) # validate args try: self._validate_test_arguments() except Exception as e: self.log('[%s] %s' % (type(e).__name__, e), True, Fore.RED) # call fixture setup if set if self._case_setup is not None: if isinstance(self._case_setup, MethodType): self._case_setup(self) else: self._case_setup() def _run_test_case(self): """ Run an individual test case """ # read source file to see decorators and # call case_header at the right time test_function = self._name suite_class = str(self.__class__).rpartition('.')[2] lines = [] save_lines = False with open(inspect.getmodule(self.__class__).__file__) as f: for line in f: if suite_class in line: save_lines = True if save_lines and test_function in line: break if save_lines and 'def ' in line: lines = [] if (save_lines and len(line.strip()) > 0 and line.strip()[0] == "@"): lines.append(line.strip().rpartition('(')[0]) # semaphore to wait for calling # case_header after all decorators self._wait_sem = len(lines) # if semaphor is 0 print case header immediately if self._wait_sem == 0: self.case_header() # run test case execution_time = timeit.timeit(self.test_case, number=1) # if a timelimit was set # check if it was met if self._timelimit is not None: if execution_time <= self._timelimit: self.log( 'CHECK PASS: test completed before time limit of %.4f' % self._timelimit, False, Back.GREEN) self._num_checks_passed += 1 else: self.log( 'CHECK FAIL: test did not complete before time limit of %.4f' % self._timelimit, True, Back.RED) self._num_checks += 1 # print pass/fail, execution time if self._num_checks > 0: percentage_passed = (self._num_checks_passed * 1.0 / self._num_checks) * 100 else: percentage_passed = 0 output_string = ("%d/%d (%.2f%%) CHECKS in %.4f seconds" % (self._num_checks_passed, self._num_checks, percentage_passed, execution_time)) if percentage_passed >= self.case_pass_threshold or self._num_checks == 0: output_string += " TEST PASS" if self.case_pass_threshold != 100: output_string += " with %.2f%% threshold" % self.case_pass_threshold self.log(output_string, False, Back.GREEN) self._num_tests_passed += 1 else: output_string += " TEST FAIL" self.log(output_string, False, Back.RED) self._total_checks += self._num_checks self._total_checks_passed += self._num_checks_passed def _print_suite_results(self): self.log("*" * ExternalProgramTestSuite._num_formatting_chars) self.log("SUITE RESULT", False, ExternalProgramTestSuite.suite_result_header_color) self.log("*" * ExternalProgramTestSuite._num_formatting_chars) passed = self._print_info_and_status() self.log("=" * ExternalProgramTestSuite._num_formatting_chars) # add test result to class static suite list ExternalProgramTestSuite._test_suites[ self.suite_name]['num_tests'] = len(self.test_cases) ExternalProgramTestSuite._test_suites[ self.suite_name]['num_passed'] = self._num_tests_passed ExternalProgramTestSuite._test_suites[ self.suite_name]['passed'] = passed ExternalProgramTestSuite._test_suites[ self.suite_name]['num_checks'] = self._total_checks ExternalProgramTestSuite._test_suites[ 
self.suite_name]['num_checks_passed'] = self._total_checks_passed def _print_info_and_status(self, suite_name=""): num_tests = len(self.test_cases) passed = False try: if num_tests > 0: percentage_tests_passed = (self._num_tests_passed * 1.0 / num_tests) * 100 else: percentage_tests_passed = 0 if self._total_checks > 0: percentage_checks_passed = (self._total_checks_passed * 1.0 / self._total_checks) * 100 else: percentage_checks_passed = 0 output_string = ( "%s%d/%d (%.2f%%) TESTS with %d/%d (%.2f%%) CHECKS in %.4f seconds" % (suite_name, self._num_tests_passed, num_tests, percentage_tests_passed, self._total_checks_passed, self._total_checks, percentage_checks_passed, ExternalProgramTestSuite._test_suites[ self.suite_name]['execution_time'])) if percentage_tests_passed >= self.suite_pass_threshold and self._suite_timelimit_met: output_string += " OK" if self.suite_pass_threshold != 100: output_string += " with %.2f%% threshold" % self.suite_pass_threshold self.log(output_string, False, Back.GREEN) passed = True else: output_string += " NOT OK" self.log(output_string, False, Back.RED) except Exception as e: self.log('[%s] %s' % (type(e).__name__, e), True, Fore.RED) return passed def check_subprocess(self, executable_command, command_arguments, expected_returncode, timeout=None, print_process_output=True, stdout_file=None, stderr_file=None, poll_seconds=.100): process = None try: process, execution_time = run_subprocess( executable_command, command_arguments, timeout, print_process_output, stdout_file, stderr_file, poll_seconds) except OSError as e: self.log('[%s] %s' % (type(e).__name__, e), True, Fore.RED) except ValueError as e: self.log('[%s] %s' % (type(e).__name__, e), True, Fore.RED) except TimeoutError as e: self.log('[%s] %s' % (type(e).__name__, e), True, Fore.RED) # print pass/fail, execution time if process is not None: if process.returncode == expected_returncode: self.log('CHECK PASS', False, Back.GREEN) self._num_checks_passed += 1 else: self.log('CHECK FAIL', True, Back.RED) self.log("%.4f seconds" % (execution_time)) else: self.log('CHECK FAIL', True, Back.RED) self._num_checks += 1 @staticmethod def run_all(): """ Run all registered test suites that have run """ ExternalProgramTestSuite._has_run = False for suite, properties in ExternalProgramTestSuite._test_suites.items(): try: ExternalProgramTestSuite.run(properties['self'], properties['name']) except Exception as e: properties['self'].log('[%s] %s' % (type(e).__name__, e), True, Fore.RED) # print test result properties['self']._print_suite_results() ExternalProgramTestSuite.print_total_results() @staticmethod def print_total_results(): """ Print the cumulative results from all suites registered and run """ # print results for each suite on one line # keep track of test results info for totals total_num_tests = 0 total_num_passed = 0 total_checks = 0 total_checks_passed = 0 total_suites_passed = 0 total_num_suites = 0 total_execution_time = 0 try: for index, (suite, results) in enumerate( ExternalProgramTestSuite._test_suites.items()): self = results['self'] if index == 0: self.log("*" * ExternalProgramTestSuite._num_formatting_chars) self.log( "ALL SUITE RESULTS", False, ExternalProgramTestSuite.suite_result_header_color) self.log("*" * ExternalProgramTestSuite._num_formatting_chars) if results['has_run']: self._print_info_and_status(suite + ": ") total_num_tests += results['num_tests'] total_num_passed += results['num_passed'] if total_num_tests > 0 and results['passed']: total_suites_passed += 1 total_checks += 
results['num_checks'] total_checks_passed += results['num_checks_passed'] total_execution_time += results['execution_time'] self.log("_" * ExternalProgramTestSuite._num_formatting_chars) total_num_suites += 1 # print cumulative total pass/fail if total_num_tests > 0: if total_checks > 0: percentage_checks_passed = (total_checks_passed * 1.0 / total_checks) * 100 else: percentage_checks_passed = 0 self.log("TOTALS") self.log("." * ExternalProgramTestSuite._num_formatting_chars) percentage_passed = (total_suites_passed * 1.0 / total_num_suites) * 100 self.log( "%d/%d (%.2f%%) SUITES\n%d/%d (%.2f%%) TESTS\n%d/%d (%.2f%%) CHECKS\nin %.4f seconds" % (total_suites_passed, total_num_suites, percentage_passed, total_num_passed, total_num_tests, (total_num_passed * 1.0 / total_num_tests) * 100, total_checks_passed, total_checks, percentage_checks_passed, total_execution_time)) if percentage_passed == 100: self.log("OK", False, Back.GREEN) else: self.log("NOT OK", False, Back.RED) self.log("." * ExternalProgramTestSuite._num_formatting_chars) except Exception as e: print(Fore.RED + '[%s] %s' % (type(e).__name__, e) + Fore.RESET + Back.RESET + Style.RESET_ALL)
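# Hedged usage sketch for the framework above: every method of a subclass becomes
# a test case, and check_subprocess() asserts on an external program's exit code.
# The command and suite name here are invented for illustration.
class EchoSuite(ExternalProgramTestSuite):
    def test_echo_exits_cleanly(self):
        self.check_subprocess('echo', ['hello'], expected_returncode=0)

# EchoSuite(suite_name='echo suite')        # registers the suite
# ExternalProgramTestSuite.run_all()        # runs every registered suite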
def diffList(left, right, path, result):
    for x in range(len(left)):
        path2 = path + '[' + str(x) + ']'
        if x >= len(right):
            result['missingOnRight'].append(path2)
        else:
            diffValue(left[x], right[x], path2, result)
    for x in range(len(left), len(right)):
        path2 = path + '[' + str(x) + ']'
        result['missingOnLeft'].append(path2)

# ---------------------------------------------------------------------------------

ALLOWED_MISSING_ON_RIGHT = Set([".version", ".policyType", ".guid"])

def isPolicyIdentical(old, new):
    result = digdiff(old, new)
    #misc.ppprint(old)
    #misc.ppprint(new)
    debug("missingOnLeft:{}".format(result['missingOnLeft']))
    debug("missingOnRight:{}".format(result['missingOnRight']))
    debug("differsByType:{}".format(result['differsByType']))
    debug("differsByValue:{}".format(result['differsByValue']))
    if len(result['missingOnLeft']) > 0 or len(result['differsByType']) > 0 or len(result['differsByValue']) > 0:
        return False
    else:
        for missing in result["missingOnRight"]:
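# The loop above is cut off; given ALLOWED_MISSING_ON_RIGHT, it presumably ends by
# rejecting any right-side omission whose path suffix is not whitelisted. A
# plausible completion (an assumption, not the original code):
#           if not any(missing.endswith(allowed) for allowed in ALLOWED_MISSING_ON_RIGHT):
#               return False
#       return True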
from collections import defaultdict
from sets import Set

pos_seedlist = ["good", "nice", "love", "excellent", "fortunate", "correct", "superior"]
neg_seedlist = ["bad", "nasty", "poor", "hate", "unfortunate", "wrong", "inferior"]

sentences = open('/home/twinkle/NLP/hw3/tweets.txt').read().strip().split("\n")  # one row

# condition to ignore
wordcounts = defaultdict(int)
pair_counts = defaultdict(int)
seedsum = defaultdict(float)
pmi = defaultdict(float)
polarity = defaultdict(float)
total = 0
bow = Set()
words = []
allwords = []

# make a set, wordsum, break loop for both, i=0
i = 0
#bla=[]
for sent in sentences:
    print i
    i += 1
    temp = sent.split(' ')
    words.append(list(Set(temp)))

for row in words:
def filter_emojis(text):
    return Set(text.replace(':', '')).issubset(allowed_chars) and len(text) > 2
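# Example of the check above, with an assumed allowed_chars set (the real one is
# defined elsewhere): an emoji shortcode like ':smile:' passes, free text fails.
#   allowed_chars = Set('abcdefghijklmnopqrstuvwxyz_0123456789')
#   filter_emojis(':smile:')      # True
#   filter_emojis(':two words:')  # False (space not in allowed_chars)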
import sys
from sets import Set
import numpy

fdata = open(sys.argv[1])
fcluster = open(sys.argv[2])
fout = open(sys.argv[3], "w")
output_type = int(sys.argv[4])

# map user id -> cluster id
uid_cid = {}
for line in fcluster:
    uid = int(line.split(" ")[0])
    cid = int(line.split(" ")[1])
    uid_cid[uid] = cid

doc_set = Set()
cluster_ctr = {}
for line in fdata:
    line = line[:-1]
    line_arr = line.split("|")
    user_id = int(line_arr[1])
    if user_id not in uid_cid:
        continue
    cluster_id = uid_cid[user_id]
    shown_doc = line_arr[0].split(" ")[1]
    clicked = int(line_arr[0].split(" ")[2])
    doc_set.add(shown_doc)
    if cluster_id not in cluster_ctr:
        cluster_ctr[cluster_id] = {}
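# Hedged sketch of where the truncated loop above is likely headed: accumulate
# per-cluster, per-document (shows, clicks) counts so a CTR can be derived.
# The nested layout is an assumption based on the parsing code.
#   if shown_doc not in cluster_ctr[cluster_id]:
#       cluster_ctr[cluster_id][shown_doc] = [0, 0]   # [shows, clicks]
#   cluster_ctr[cluster_id][shown_doc][0] += 1
#   cluster_ctr[cluster_id][shown_doc][1] += clicked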
class AggDC:
    PassThrough = Set('''
        BeginDrawing EndDrawing
        GetBackground GetSize GetSizeTuple
        SetBrush SetPen
        '''.split())

    def __init__(self, dc):
        self.dc = dc
        self.dc.BeginDrawing()
        w, h = self.dc.GetSizeTuple()
        self.draw = aggdraw.Draw('RGB', (w, h))
        self.draw.rectangle((0, 0, w, h), None, aggBrush(dc.GetBackground()))

    def __del__(self):
        w, h = self.dc.GetSizeTuple()
        if w and h:
            image = wx.EmptyImage(w, h)
            image.SetData(self.draw.tostring())
            self.dc.DrawBitmap(image.ConvertToBitmap(), 0, 0)
        self.dc.EndDrawing()

    def __getattr__(self, attr):
        if attr in self.PassThrough:
            return getattr(self.dc, attr)
        else:
            raise AttributeError("%s instance has no attribute '%s'" % (self.__class__.__name__, attr))

    def CrossHair(self, x, y):
        #self.dc.CrossHair(x, y)
        w, h = self.dc.GetSizeTuple()
        p = aggPen(self.dc.GetPen())
        self.draw.line((0, y, w, y), p)
        self.draw.line((x, 0, x, h), p)

    def DrawArc(self, x1, y1, x2, y2, xc, yc):
        #self.dc.DrawArc(x1, y1, x2, y2, xc, yc)
        b = aggBrush(self.dc.GetBrush())
        p = aggPen(self.dc.GetPen())
        radius = ((xc-x1)**2 + (yc-y1)**2)**0.5
        self.draw.pieslice(
            (xc-radius, yc-radius, xc+radius, yc+radius),
            math.degrees(math.atan2(yc-y1, x1-xc)),
            math.degrees(math.atan2(yc-y2, x2-xc)),
            p, b
        )

    def DrawCircle(self, x, y, radius):
        #self.dc.DrawCircle(x, y, radius)
        b = aggBrush(self.dc.GetBrush())
        p = aggPen(self.dc.GetPen())
        self.draw.ellipse((x-radius, y-radius, x+radius, y+radius), p, b)

    def DrawEllipse(self, x, y, width, height):
        #self.dc.DrawEllipse(x, y, width, height)
        b = aggBrush(self.dc.GetBrush())
        p = aggPen(self.dc.GetPen())
        self.draw.ellipse((x, y, x+width, y+height), p, b)

    def DrawLine(self, x1, y1, x2, y2):
        #self.dc.DrawLine(x1, y1, x2, y2)
        p = aggPen(self.dc.GetPen())
        self.draw.line((x1, y1, x2, y2), p)

    def DrawRectangle(self, x, y, width, height):
        #self.dc.DrawRectangle(x, y, width, height)
        b = aggBrush(self.dc.GetBrush())
        p = aggPen(self.dc.GetPen())
        self.draw.rectangle((x, y, x+width, y+height), p, b)

    def DrawPolygon(self, points):
        #self.dc.DrawPolygon(points)
        n = []
        for p in points:
            n += p
        b = aggBrush(self.dc.GetBrush())
        p = aggPen(self.dc.GetPen())
        self.draw.polygon(n, p, b)
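# Hypothetical usage of the adapter above inside a wxPython paint handler: it
# proxies whitelisted calls to the wrapped DC and renders everything else through
# aggdraw, blitting the buffer back when the wrapper is garbage-collected.
#   def OnPaint(self, event):
#       dc = AggDC(wx.PaintDC(self))
#       dc.DrawCircle(50, 50, 20)
#       dc.DrawLine(0, 0, 100, 100)
#       del dc  # triggers __del__, which copies the aggdraw buffer onto the DC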
def find_certain_child(certain, uncertain, possibles):
    li = []
    for name in Set(certain):
        min_ = minCount(possibles, name) - uncertain.count(name)
        li.extend(min_ * [name])
    return li
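# Worked example for the function above, assuming the helper minCount(possibles,
# name) returns the minimum number of times `name` appears across the candidate
# lists (minCount is not shown in this excerpt):
#   certain   = ['a', 'a', 'b']
#   uncertain = ['a']
#   possibles = [['a', 'a', 'b'], ['a', 'a', 'a', 'b']]
#   # minCount -> 'a': 2, 'b': 1; subtracting uncertain counts leaves one 'a', one 'b'
#   find_certain_child(certain, uncertain, possibles)  # ['a', 'b'] (set order may vary)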
def run(self): """ 2008-05-08 transpose everything if output_matrix_type=1 (bjarni's SNP matrix format) 2007-02-19 --db_connect --get_snp_id2index() --get_strain_id2index() --get_strain_id_info() --get_snp_id_info() --get_data_matrix() if self.toss_out_rows: --toss_rows_to_make_distance_matrix_NA_free() --find_smallest_vertex_set_to_remove_all_edges() --write_data_matrix() #--sort_file() 2007-09-22 for mysql_connection add get_nativename_snpid2call_m() add fill_in_resolved_duplicated_calls() """ if self.debug: import pdb pdb.set_trace() if self.db_connection_type == 1: import MySQLdb #conn = MySQLdb.connect(db="stock",host='natural.uchicago.edu', user='******', passwd='iamhereatusc') conn = MySQLdb.connect(db=self.dbname, host=self.hostname, user=self.user, passwd=self.passwd) curs = conn.cursor() snp_id2index, snp_id_list, snp_id2info = self.get_snp_id2index_m( curs, self.input_table, self.snp_locus_table) strain_id2index, strain_id_list, nativename2strain_id, strain_id2acc, strain_id2category = self.get_strain_id2index_m(curs, \ self.input_table, self.strain_info_table, self.only_include_strains_with_GPS, \ self.resolve_duplicated_calls, toss_contaminants=self.toss_contaminants) #strain_id2acc, strain_id2category = self.get_strain_id_info_m(curs, strain_id_list, self.strain_info_table) #snp_id2info = self.get_snp_id_info_m(curs, snp_id_list, self.snp_locus_table) if self.input_table == 'dbsnp.calls': from variation.src.FigureOut384IlluminaABMapping import get_snps_id2mapping snps_id2mapping = get_snps_id2mapping(self.hostname, dbname='dbsnp', user=self.user, passwd=self.passwd) else: snps_id2mapping = None data_matrix = self.get_data_matrix_m(curs, strain_id2index, snp_id2index, nt2number, self.input_table, self.need_heterozygous_call, snps_id2mapping) """ if self.resolve_duplicated_calls: nativename_snpid2call = self.get_nativename_snpid2call_m(curs, self.strain_info_table, self.input_table) data_matrix = self.fill_in_resolved_duplicated_calls(data_matrix, strain_id2index, snp_id2index, nativename2strain_id, nativename_snpid2call) """ if self.include_other_strain_info: strain_id2other_info = self.get_strain_id2other_info( curs, strain_id_list, self.strain_info_table, self.input_table) else: strain_id2other_info = {} elif self.db_connection_type == 2: (conn, curs) = db_connect(self.hostname, self.dbname, self.schema) snp_id2index, snp_id_list = self.get_snp_id2index( curs, self.input_table, self.snp_locus_table) strain_id2index, strain_id_list = self.get_strain_id2index( curs, self.input_table) strain_id2acc, strain_id2category = self.get_strain_id_info( curs, strain_id_list, self.strain_info_table) snp_id2info = self.get_snp_id_info(curs, snp_id_list, self.snp_locus_table) data_matrix = self.get_data_matrix(curs, strain_id2index, snp_id2index, nt2number, self.input_table, self.need_heterozygous_call) strain_id2other_info = {} if self.toss_out_rows: rows_to_be_tossed_out = self.toss_rows_to_make_distance_matrix_NA_free( data_matrix) rows_to_be_tossed_out = Set(rows_to_be_tossed_out) else: rows_to_be_tossed_out = Set() #05/08/08 if self.discard_all_NA_strain: from variation.src.FilterStrainSNPMatrix import FilterStrainSNPMatrix remove_rows_data = FilterStrainSNPMatrix.remove_rows_with_too_many_NAs( data_matrix, row_cutoff=1) rows_with_too_many_NAs_set = remove_rows_data.rows_with_too_many_NAs_set #row_index2no_of_NAs = remove_rows_data.row_index2no_of_NAs rows_to_be_tossed_out.update(rows_with_too_many_NAs_set) strain_acc_list = [ strain_id2acc[strain_id] for strain_id in strain_id_list ] 
category_list = [ strain_id2category[strain_id] for strain_id in strain_id_list ] strain_acc2other_info = {} for strain_id in strain_id2other_info: strain_acc2other_info[ strain_id2acc[strain_id]] = strain_id2other_info[strain_id] if self.output_matrix_type == 1: #transpose everything data_matrix = num.array(data_matrix) data_matrix = num.transpose(data_matrix) header = ['Chromosomes', 'Positions'] + strain_acc_list chromosome_ls = [] position_ls = [] for snp_id in snp_id_list: snp_name, chromosome, position = snp_id2info[snp_id] chromosome_ls.append(chromosome) position_ls.append(position) strain_acc_list = chromosome_ls category_list = position_ls cols_to_be_tossed_out = rows_to_be_tossed_out rows_to_be_tossed_out = None strain_id2other_info = None #make up one else: header = ['strain', 'category'] for snp_id in snp_id_list: snp_name, chromosome, position = snp_id2info[snp_id] header.append(snp_name) cols_to_be_tossed_out = None write_data_matrix(data_matrix, self.output_fname, header, strain_acc_list, category_list, rows_to_be_tossed_out=rows_to_be_tossed_out, \ cols_to_be_tossed_out=cols_to_be_tossed_out, nt_alphabet=self.nt_alphabet,\ strain_acc2other_info=strain_acc2other_info, delimiter=self.delimiter)
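# Small numpy illustration of the output_matrix_type == 1 transpose above: a
# strains x SNPs matrix becomes SNPs x strains, so each output row is one SNP
# with its chromosome/position as the leading columns.
import numpy as num

m = num.array([[1, 2, 3],    # strain 1 calls at SNPs A, B, C
               [4, 5, 6]])   # strain 2
num.transpose(m)             # rows are now SNPs: [[1, 4], [2, 5], [3, 6]]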
def process_line(task_str, task_data_str): # Ignore any responses that make it into the task string task_str = mass_str_replace(task_str, response_strs, '') # Process task_data_str into component bits # For all tasks except learning task, extract spaun's answer if task_str in ['A0', 'A1', 'A3', 'A4', 'A5', 'A6', 'A7']: # Split the task data string into before and after the question mark task_data_split = task_data_str.split('?', 1) # The task information is before the question mark task_info = task_data_split[0].replace("'", '') # Filter out the MNIST digits task_info = remove_MNIST_strs(task_info) # Record special characters has_F = 'F' in task_info has_R = 'R' in task_info has_P = 'P' in task_info has_K = 'K' in task_info # Split up the different components of the task info task_info_split = task_info.split(']') if task_info_split[-1] == '': task_info_split = task_info_split[:-1] # Remove [ ]'s and special characters from each part of task_info_split for i in range(len(task_info_split)): task_info_split[i] = \ mass_str_replace(task_info_split[i], ['[', ']', 'F', 'R', 'P', 'K', '-'], '') # Spaun's answer is after the question mark task_answer_spaun = \ np.array(list(mass_str_replace(task_data_split[1], response_strs, num_list_strs))) if len(task_answer_spaun) == 0: return (None, None) # ------ Reference answer generation ------ if task_str in ['A0', 'A1', 'A3']: # For copy-draw, classification, memory task task_info = np.array(list(task_info_split[0])) if has_R: task_answer_ref = task_info[-1::-1] else: task_answer_ref = task_info elif task_str == 'A4': # For counting tasks start_num = int(task_info_split[0]) count_num = int(task_info_split[1]) ans_num = start_num + count_num # Ignore invalid task options if ans_num > 9: task_str = 'INVALID' warn('A4: Computed answer > 9') task_answer_ref = np.array([str(ans_num)]) elif task_str == 'A5': # QA task num_list = map(int, list(task_info_split[0])) probe_num = int(task_info_split[1]) if has_P: task_answer_ref = np.array([str(num_list[probe_num - 1])]) elif has_K: task_answer_ref = np.array([str(num_list.index(probe_num) + 1)]) else: task_str = 'INVALID' warn('A5: No valid P/K for QA task') elif task_str == 'A6': from sets import Set # RVC task if len(task_info_split) % 2: match_list = None for i in range(len(task_info_split) / 2): list1 = np.array(list(task_info_split[i * 2])) list2 = np.array(list(task_info_split[i * 2 + 1])) if match_list is None: match_list = [ Set(np.where(list1 == item)[0]) for item in list2 ] else: # TODO: Check for inconsistencies across pairs if len(list2) != len(match_list): warn('A6: Inconsistent RVC ref answer lengths.') task_str = 'INVALID' else: match_list = [ match_list[j] & Set(np.where(list1 == list2[j])[0]) for j in range(len(match_list)) ] list1 = np.array(list(task_info_split[-1])) task_answer_ref = np.array( [list1[list(set_list)[0]] for set_list in match_list]) else: task_str = 'INVALID' warn('A6: Invalid RVC task. No question list given.') elif task_str == 'A7': # Raven's induction task # Induction task comes in 3 forms: changing list len, and changing # number relations, identical lists col_count = 1 induction_diff = None induction_len_change = None induction_identity = None for i in range(1, len(task_info_split)): if col_count % 3 == 0: col_count += 1 continue list1 = map(int, np.array(list(task_info_split[i - 1]))) list2 = map(int, np.array(list(task_info_split[i]))) # Handle the following cases: # 1. 
Unchanging list lengths of len 1 if len(list1) == len(list2) == 1: diff = list2[0] - list1[0] if induction_diff is None: induction_diff = diff if induction_diff != diff: warn('A7: Inconsistent change between induction items') task_str = 'INVALID' # 2. Changing list lengths, but containing identical items elif (list1[0] == list2[0]) and (len(list1) != len(list2)): len_change = len(list2) - len(list1) if induction_len_change is None: induction_len_change = len_change if induction_len_change != len_change: warn('A7: Inconsistent change between list lenghts') task_str = 'INVALID' elif (len(list1) == len(list2)) and (list1 == list2): induction_identity = True else: warn('A7: Unhandled induction task type') task_str = 'INVALID' # Handle transition to next row col_count += 1 def spaun_response_to_int(c): return int(c) if c.isdigit() else -1 list1 = map(spaun_response_to_int, list(task_info_split[-1])) if induction_diff is not None and induction_len_change is None and \ induction_identity is None: task_answer_ref = np.array(map(str, [list1[0] + induction_diff])) elif (induction_len_change is not None and induction_diff is None and induction_identity is None): task_answer_ref = np.array( map(str, [list1[0]] * (len(list1) + len_change))) elif (induction_len_change is None and induction_diff is None and induction_identity is not None): task_answer_ref = np.array(map(str, list1)) else: warn('A7: Multiple induction types encountered?') task_str = 'INVALID' # Format the task answer list (make the same length as the reference # answer list). Applies to all but learning task if task_str == 'INVALID': return task_str, np.array([0]) if task_str in ['A0', 'A1', 'A3', 'A4', 'A5', 'A6', 'A7']: task_answer = np.chararray(task_answer_ref.shape) task_answer[:] = '' task_answer_len = min(len(task_answer_ref), len(task_answer_spaun)) task_answer[:task_answer_len] = task_answer_spaun[:task_answer_len] # DEBUG # print task_data_str, task_answer, task_answer_ref else: print task_data_str if task_str in ['A0', 'A1', 'A3']: # For memory, recognition, copy drawing tasks, check recall accuracy # per item return ('_'.join([task_str, str(len(task_answer_ref))]), map(int, task_answer == task_answer_ref)) if task_str in ['A4', 'A5', 'A6', 'A7']: # For other non-learning tasks, check accuracy as wholesale correct / # incorrect if task_answer[0] == '-': return (None, None) return ('_'.join([task_str, str(len(task_answer_ref))]), [int(np.all(task_answer == task_answer_ref))])
def read_hyperion_config(file_path): """ Parses hyperion config file. """ with open(file_path) as hyperion_config_json: config = commentjson.load(hyperion_config_json) leds = [] x_coords = [] y_coords = [] for led in config.get('leds', []): hscan = led['hscan'] vscan = led['vscan'] hmin = hscan['minimum'] hmax = hscan['maximum'] vmin = vscan['minimum'] vmax = vscan['maximum'] h_center = round(((hmin + hmax) / 2) * 100, 2) v_center = round(((vmin + vmax) / 2) * 100, 2) x_coords.append(h_center) y_coords.append(v_center) leds.append({'x': h_center, 'y': v_center}) xcounts = [] left = None right = None for x in Set(x_coords): xcounts.append({'x': x, 'count': x_coords.count(x)}) if len(dict( (xcount['count'], xcount) for xcount in xcounts).values()) > 1: # Position might not be minimum for TV setups xcounts.sort(key=operator.itemgetter('count')) right = xcounts[len(xcounts) - 2] left = xcounts[len(xcounts) - 1] else: # Position should be minimum for matrix setups xcounts.sort(key=operator.itemgetter('x')) right = xcounts[len(xcounts) - 1] left = xcounts[0] if right['x'] < left['x']: left, right = right, left ycounts = [] top = None bottom = None for y in Set(y_coords): ycounts.append({'y': y, 'count': y_coords.count(y)}) if len(dict( (ycount['count'], ycount) for ycount in ycounts).values()) > 1: # Position might not be minimum for TV setups ycounts.sort(key=operator.itemgetter('count')) bottom = ycounts[len(ycounts) - 2] top = ycounts[len(ycounts) - 1] else: # Position should be minimum for matrix setups ycounts.sort(key=operator.itemgetter('y')) bottom = ycounts[len(ycounts) - 1] top = ycounts[0] if bottom['y'] < top['y']: top, bottom = bottom, top leds_left = [] leds_right = [] leds_top = [] leds_bottom = [] for i, led in enumerate(leds): x = led['x'] y = led['y'] if x == left['x']: leds_left.append(i) elif x == right['x']: leds_right.append(i) elif y == top['y']: leds_top.append(i) elif y == bottom['y']: leds_bottom.append(i) # Sort the lists leds_top.sort(key=lambda i: leds[i]['x'], reverse=False) leds_right.sort(key=lambda i: leds[i]['y'], reverse=False) leds_bottom.sort(key=lambda i: leds[i]['x'], reverse=True) leds_left.sort(key=lambda i: leds[i]['y'], reverse=True) # Not the lists run like this: # >>>>>>> TOP >>>>>>> # ^ v # ^ v # LEFT RIGHT # ^ v # ^ v # <<<<< BOTTOM <<<<<< # print 'leds_top: {}'.format(leds_top) # print 'leds_right: {}'.format(leds_right) # print 'leds_bottom: {}'.format(leds_bottom) # print 'leds_left: {}'.format(leds_left) return (leds, leds_top, leds_right, leds_bottom, leds_left)
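# Minimal example of the JSON shape read_hyperion_config() expects (per-LED
# hscan/vscan scan fractions); the values below are invented for a 2-LED strip:
#   {
#     "leds": [
#       {"hscan": {"minimum": 0.0, "maximum": 0.1},
#        "vscan": {"minimum": 0.0, "maximum": 0.1}},
#       {"hscan": {"minimum": 0.9, "maximum": 1.0},
#        "vscan": {"minimum": 0.0, "maximum": 0.1}}
#     ]
#   }
#   leds, leds_top, leds_right, leds_bottom, leds_left = \
#       read_hyperion_config('hyperion.config.json')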
effectivenessMap = {
    -2: 0.51,
    -1: 0.714,
     0: 1,
     1: 1.4
}

attributeNum = Set([
    "height", "weight", "number", "maxcp", "attack", "defense", "stamina",
    "damage", "energy", "energy gain", "dps", "eps", "cooldown",
    "activation", "bars"
])

def parsePokemon(attributes, poke):
    global pokemon
    current = {}
    delta = 0
    for i in range(0, len(attributes)):
        attr = attributes[i]
def get_routes(): routes = None valid_trips = None n = 1 try: n = int(request.args.get('next', 1)) except ValueError: return jsonify({ '404' : 'Cannot parse \'next\' parameter'}), 404 if len(request.args.keys()) > 0: # filter routes by the provided URL parameters lat1 = request.args.get('lat1', 999) lon1 = request.args.get('lon1', 999) lat2 = request.args.get('lat2', 999) lon2 = request.args.get('lon2', 999) if lat1 == 999 or lon1 == 999 or lat2 == 999 or lon2 == 999: # the parameters provided cannot be used to filter, so return error return jsonify({ '404' : 'Bad URL Parameters'}), 404 else: stop_times = [] start = decode(request.args.get('start', '')) stop = decode(request.args.get('stop', '')) if len(stop) > 0 and len(start) == 0: # the parameters provided cannot be used to filter, so return error return jsonify({ '404' : 'Cannot have end time without start time'}), 404 elif len(start) == 0 and len(stop) == 0: # filter by latitude and longitude only stop_times = models.StopTime.query.filter(models.StopTime.stop_lon >= lon1, models.StopTime.stop_lon <= lon2, models.StopTime.stop_lat >= lat1, models.StopTime.stop_lat <= lat2) else: start_time = None stop_time = None try: start_time = gtfs_parser.datetime_from_string(start) if len(stop) > 0: stop_time = gtfs_parser.datetime_from_string(stop) except: return jsonify({ '404' : 'Cannot parse time'}), 404 if not stop_time is None: # filter within a range of time stop_times = models.StopTime.query.filter(models.StopTime.stop_lon >= lon1, models.StopTime.stop_lon <= lon2, models.StopTime.stop_lat >= lat1, models.StopTime.stop_lat <= lat2, models.StopTime.arrival_time >= start_time, models.StopTime.departure_time <= stop_time) else: # filter from initial time only stop_times = models.StopTime.query.filter(models.StopTime.stop_lon >= lon1, models.StopTime.stop_lon <= lon2, models.StopTime.stop_lat >= lat1, models.StopTime.stop_lat <= lat2, models.StopTime.arrival_time >= start_time) stop_times = array_from_query(stop_times) stop_times.sort(key = lambda st: st.arrival_time, reverse = False) trips = [] for stop_time in stop_times: trips.append(stop_time.trip) trips = unique_array(trips) filtered_routes = Set() for trip in trips: filtered_routes.add(trip.route) routes = filtered_routes valid_trips = trips else: # otherwise, no URL parameters are provided, so return all routes routes = models.Route.query.all() return jsonify({ 'routes' : [r.serialize(valid_trips, n) for r in routes] })
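# Hypothetical requests against the handler above (the parameter names are the
# ones it reads; the '/routes' path and host are assumptions, since the route
# decorator is not shown in this excerpt):
#   GET /routes                       -> all routes, serialized
#   GET /routes?lat1=47.5&lon1=-122.4&lat2=47.7&lon2=-122.2&next=3
#       -> routes with stops inside the bounding box, next 3 trips each
#   'start' and 'stop' are URL-encoded time strings parsed by gtfs_parser.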
def initialize(self, opt): BaseModel.initialize(self, opt) if opt.resize_or_crop != 'none' or not opt.isTrain: # when training at full res this causes OOM torch.backends.cudnn.benchmark = True self.isTrain = opt.isTrain self.use_features = opt.instance_feat or opt.label_feat self.gen_features = self.use_features and not self.opt.load_features input_nc = opt.label_nc if opt.label_nc != 0 else opt.input_nc ##### define networks # Generator network netG_input_nc = input_nc + opt.otherInfo_nc if not opt.no_instance: netG_input_nc += 1 if self.use_features: netG_input_nc += opt.feat_num self.netG = networks.define_G(netG_input_nc, opt.output_nc, opt.ngf, opt.netG, opt.n_downsample_global, opt.n_blocks_global, opt.n_local_enhancers, opt.n_blocks_local, opt.norm, gpu_ids=self.gpu_ids) # Discriminator network if self.isTrain: use_sigmoid = opt.no_lsgan netD_input_nc = input_nc + opt.output_nc if not opt.no_instance: netD_input_nc += 1 self.netD = networks.define_D(netD_input_nc, opt.ndf, opt.n_layers_D, opt.norm, use_sigmoid, opt.num_D, not opt.no_ganFeat_loss, gpu_ids=self.gpu_ids) ### Encoder network if self.gen_features: self.netE = networks.define_G(opt.output_nc, opt.feat_num, opt.nef, 'encoder', opt.n_downsample_E, norm=opt.norm, gpu_ids=self.gpu_ids) if self.opt.verbose: print('---------- Networks initialized -------------') # Preprocessor network self.netP = networks.define_P(opt.otherInfoTotalSize, opt.otherInfo_nc) device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") self.netP.to(device) # load networks if not self.isTrain or opt.continue_train or opt.load_pretrain: pretrained_path = '' if not self.isTrain else opt.load_pretrain self.load_network(self.netG, 'G', opt.which_epoch, pretrained_path) self.load_network(self.netP, 'P', opt.which_epoch, pretrained_path) if self.isTrain: self.load_network(self.netD, 'D', opt.which_epoch, pretrained_path) if self.gen_features: self.load_network(self.netE, 'E', opt.which_epoch, pretrained_path) # set loss functions and optimizers if self.isTrain: if opt.pool_size > 0 and (len(self.gpu_ids)) > 1: raise NotImplementedError( "Fake Pool Not Implemented for MultiGPU") self.fake_pool = ImagePool(opt.pool_size) self.old_lr = opt.lr # define loss functions self.loss_filter = self.init_loss_filter(not opt.no_ganFeat_loss, not opt.no_vgg_loss, not opt.no_smooth_loss, not opt.no_nonzero_loss) self.criterionGAN = networks.GANLoss(use_lsgan=not opt.no_lsgan, tensor=self.Tensor) self.criterionFeat = torch.nn.L1Loss() if not opt.no_vgg_loss: self.criterionVGG = networks.VGGLoss(self.gpu_ids) if not opt.no_smooth_loss: self.criterionSmooth = networks.SmoothLoss(self.gpu_ids) if not opt.no_nonzero_loss: self.criterionNonzero = networks.NonzeroLoss(self.gpu_ids) # Names so we can breakout loss self.loss_names = self.loss_filter('G_GAN', 'G_GAN_Feat', 'G_VGG', 'G_smooth', 'G_nonzero', 'D_real', 'D_fake') # initialize optimizers # optimizer G if opt.niter_fix_global > 0: import sys if sys.version_info >= (3, 0): finetune_list = set() else: from sets import Set finetune_list = Set() params_dict = dict(self.netG.named_parameters()) params = [] for key, value in params_dict.items(): if key.startswith('model' + str(opt.n_local_enhancers)): params += [value] finetune_list.add(key.split('.')[0]) print( '------------- Only training the local enhancer network (for %d epochs) ------------' % opt.niter_fix_global) print('The layers that are finetuned are ', sorted(finetune_list)) else: params = list(self.netG.parameters()) if self.gen_features: params += 
list(self.netE.parameters()) self.optimizer_G = torch.optim.Adam(params, lr=opt.lr, betas=(opt.beta1, 0.999)) # optimizer D params = list(self.netD.parameters()) self.optimizer_D = torch.optim.Adam(params, lr=opt.lr, betas=(opt.beta1, 0.999))
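# The version guard in initialize() above is the standard bridge between Python
# 2's deprecated sets.Set and the builtin set; a reusable sketch of that idiom:
import sys

if sys.version_info >= (3, 0):
    Set = set                # the sets module was removed in Python 3
else:
    from sets import Set     # Python 2 fallback (deprecated since 2.6)

finetune_list = Set()
finetune_list.add('model1')  # behaves identically under either binding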