class Newton: ''' area_ids - np.array con ides de las areas. serialized_forests - str path a carpeta con forests serializados. serialized_tree - str path a carpeta con tree serializado data_dir - str path a carpeta con datos. n_forest_results - int cantidad de resultados obtenidos por el RF k - int cantidad de vecinos cercanos calculados por el BallTree ''' def __init__(self, area_ids, serialized_forests, serialized_tree, data_dir, cache=4, n_forest_results=3, k=5): self.area_ids = area_ids self.balltree = Tree(serialized_tree, data_dir) self.n_forest_results = n_forest_results self.k = k self.serialized_forests = serialized_forests self.cache = cache self.locks = {i: Lock() for i in area_ids} self.counters = {i: 0 for i in area_ids} self.active_forests = LRU(cache, callback=lambda key, value: clear(key, value, self.locks,self.counters)) ''' area_id - int id de area para recomendar scores - np.array (n,5) arreglo de puntajes para recomendar retorna np.array (n,n_forest_results,k) carreras recomendadas ''' def get_recs(self, area_id, scores): prediction = self.predict(area_id, scores, self.n_forest_results) recommendations = [] for carreer_set in prediction: recommendations.append(self.balltree.query(carreer_set, self.k)) return np.array(recommendations) def predict(self, area_id, scores, n_results): with self.locks[area_id]: self.counters[area_id] += 1 if not self.active_forests.has_key(area_id): if get_mem_percentage() < 0.3: clear(self.active_forests.peek_last_item()[0], self.active_forests[self.active_forests.peek_last_item()[0]], self.locks,self.counters) self.active_forests[area_id] = Forest(area_id, self.serialized_forests) # print(get_mem_percentage()) forest = self.active_forests[area_id] prediction = forest.get_class(forest.query(scores, n_results)) with self.locks[area_id]: self.counters[area_id] -= 1 # print(self.active_forests.items()) return prediction def filter_recs(self, user, carreers): pass
def lru(size):
    """Replay the module-level ``trace`` through a byte-bounded LRU cache.

    ``size`` is the cache capacity in GB. Each key of ``trace`` is looked up
    in the module-level mapping named ``dict`` to get the object size. Hit
    and miss counters accumulate object sizes, not request counts. The
    totals and the miss ratio are printed and logged; nothing is returned.
    """
    hitCounter = missCounter = 0
    # Size in GB
    size = spaceLeft = int(size) * 1024 * 1024 * 1024
    hashmap = {}  # key -> size of every object currently cached
    cache = LRU(size)
    for key in trace:
        objSize = int(dict[key])
        if key in hashmap:
            hitCounter += objSize
            cache[key] = dict[key]  # re-assigning refreshes recency
            continue

        missCounter += objSize
        # Miss - evict least recently used objects until the new one fits.
        # Compare the integer size (the raw value may be a string) and stop
        # once the cache is empty.
        while objSize > spaceLeft and hashmap:
            victim = cache.peek_last_item()[0]
            spaceLeft += int(hashmap[victim])
            del cache[victim]
            del hashmap[victim]
        if objSize > spaceLeft:
            # Larger than the whole cache: counted as a miss, never stored.
            continue
        cache[key] = dict[key]
        hashmap[key] = dict[key]
        spaceLeft -= objSize

    total = hitCounter + missCounter
    # An empty trace has no requests; report a ratio of 0 instead of
    # dividing by zero.
    missRatio = float(missCounter) / float(total) if total else 0.0
    now = datetime.datetime.now()
    print("Hit_Counter:" + str(hitCounter) + ",Miss_Counter:" + str(missCounter))
    print("Miss_Ratio:" + str(missRatio))
    logging.info(str(now)[:19] + " Hit_Counter:" + str(hitCounter)
                 + ",Miss_Counter:" + str(missCounter))
    logging.info(str(now)[:19] + " Miss_Ratio:" + str(missRatio))
def test_dict_behavior_matches_LRU_implementation():
    """LRUSQLDict must keep the same head and tail as a reference LRU."""
    reference = LRU(100)
    candidate = LRUSQLDict(
        sqlite3.connect(":memory:"),
        key_encoder,
        key_decoder,
        value_encoder,
        value_decoder,
        100,
    )

    def assert_same_ends():
        # head/tail of the SQL-backed dict mirror the first/last LRU items
        head, tail = candidate.head, candidate.tail
        assert reference.peek_first_item() == (head.key, head.value)
        assert reference.peek_last_item() == (tail.key, tail.value)

    # fill both containers with the same 20 pairs
    for number in range(20):
        encoded = to_bytes(number)
        reference[encoded] = number
        candidate[encoded] = number
    assert_same_ends()

    # reading an existing key
    reference[to_bytes(10)]
    candidate[to_bytes(10)]
    assert_same_ends()

    # overwriting an existing key
    reference[to_bytes(15)] = 100
    candidate[to_bytes(15)] = 100
    assert_same_ends()

    # deleting a key
    del reference[to_bytes(0)]
    del candidate[to_bytes(0)]
    assert_same_ends()

    # inserting a brand-new key
    reference[to_bytes(100)] = 100
    candidate[to_bytes(100)] = 100
    assert_same_ends()

    # reading another existing key
    reference[to_bytes(5)]
    candidate[to_bytes(5)]
    assert_same_ends()
def lru(ratio, output_file, data):
    """Replay the module-level ``key``/``osize`` trace through an LRU cache.

    ``ratio`` is the cache capacity in TB (it is multiplied by 1024**4).
    ``key`` and ``osize`` are parallel module-level sequences of object keys
    and object sizes. Hit and miss totals are accumulated in bytes, appended
    to ``lru.res`` as ``hit,miss``, logged, and the miss fraction is printed.

    NOTE(review): ``output_file`` and ``data`` are accepted but not used;
    results always go to ``lru.res`` - confirm whether that is intended.
    """
    hit = miss = 0
    avail = int(ratio * 1024 * 1024 * 1024 * 1024)
    hashmap = {}  # key -> size of every object currently cached
    cache = LRU(82170872)
    for objKey, rawSize in zip(key, osize):
        objSize = int(rawSize)
        if objKey in hashmap:
            hit += objSize
            cache[objKey] = rawSize  # re-assigning refreshes recency
            continue

        miss += objSize
        # Evict least recently used objects until the new one fits; stop
        # once the cache is empty instead of indexing a None peek result.
        while objSize > avail and hashmap:
            victim = cache.peek_last_item()[0]
            avail += int(hashmap[victim])
            del cache[victim]
            del hashmap[victim]
        if objSize > avail:
            # Larger than the whole cache: counted as a miss, never stored.
            continue
        cache[objKey] = "1"
        hashmap[objKey] = objSize
        avail -= objSize

    with open("lru.res", "a") as fd:
        fd.write(str(hit) + "," + str(miss) + "\n")
    logging.info("Hit Ratio:" + str(hit))
    logging.info("Miss Ratio:" + str(miss))
    total = hit + miss
    # An empty trace has no requests; print 0.0 instead of dividing by zero.
    print(float(miss) / float(total) if total else 0.0)
class complex_cache:
    """Hit/request bookkeeping around an LRU keyed by a request's last field.

    Each cached key maps to the number of times it has been requested.
    """

    def __init__(self, size, type):
        # `type` is accepted but not used by this class.
        self.size = size  # actual size of the cache (the number of items)
        self.lru = LRU(size)
        self.hits = 0.0
        self.reqs = 0.0
        self.cache_stack_size = 0  # how much of the cache is occupied

    def place(self, request):
        """Record one request; `request` is a tuple (timestamp, username)."""
        user = request[-1]
        self.reqs += 1
        if self.lru.has_key(user):
            self.lru[user] = self.lru[user] + 1
            self.hits += 1
            return

        if self.cache_stack_size + 1 > self.size:
            # Only the occupancy counter is adjusted here; the item itself
            # is not removed by this method.
            print("evict an item: " + str(self.lru.peek_last_item()))
            self.cache_stack_size -= 1
        self.lru[user] = 1
        self.cache_stack_size += 1
def test_peek_last_item(self):
    """An empty LRU peeks as None; afterwards the first insert is the last item."""
    cache = LRU(2)
    self.assertEqual(None, cache.peek_last_item())
    for number in (1, 2):
        cache[number] = str(number)
    self.assertEqual((1, '1'), cache.peek_last_item())
class DND:
    """Key/value memory bounded by an LRU, with kernel-weighted k-NN lookup.

    Keys are stored as tuples in the LRU. A KDTree over the stored keys is
    rebuilt on every upsert and used to find the neighbours of a lookup key.
    """

    def __init__(self, kernel, num_neighbors, max_memory, embedding_size):
        self.kd_tree = None
        # Keys in exactly the order the current kd_tree was built from.
        # Reading or writing the LRU reorders lru.keys(), so tree indices
        # must be resolved against this snapshot, not the live key list.
        self._tree_keys = []
        self.lru = LRU(size=max_memory)
        self.num_neighbors = num_neighbors
        self.kernel = kernel
        self.max_memory = max_memory
        self.embedding_size = embedding_size

    def is_present(self, key):
        """Return True when `key` is stored."""
        return tuple(key) in self.lru

    def get_value(self, key):
        """Return the value stored for `key` (raises KeyError when absent)."""
        return self.lru[tuple(key)]

    def lookup(self, lookup_key):
        """Return the kernel-weighted average of the neighbours' values.

        `lookup_key` is read through `.data[0].numpy()` and `lookup_key[0]`,
        so it is presumably a (1, embedding_size) tensor - TODO confirm.
        Stored keys equal to the lookup key are skipped. When no neighbour
        contributes, a zero taken from `lookup_key * 0` is returned.
        """
        # TODO: Speed up search knn
        lookup_key_numpy = lookup_key.data[0].numpy()

        keys = []
        if self.kd_tree is not None:
            neighbor_count = min(self.num_neighbors, len(self.kd_tree.data))
            things_index = self.kd_tree.query([lookup_key_numpy],
                                              k=neighbor_count,
                                              return_distance=False,
                                              sort_results=False)
            # One query point gives one row of neighbor_count indices; use
            # every index of that row, not just its first entry.
            keys = [self._tree_keys[ii] for ii in things_index[0]]

        output, kernel_sum = 0, 0
        # TODO: Speed this up since the kernel takes a significant amount of time
        for key in keys:
            if not np.all(key == lookup_key_numpy):
                gg = Variable(torch.from_numpy(np.array(key)))
                kernel_val = self.kernel(gg, lookup_key[0])
                output += kernel_val * self.lru.get(tuple(key))
                kernel_sum += kernel_val

        # kernel_sum is still the int 0 when no neighbour contributed.
        if isinstance(kernel_sum, int) or kernel_sum.data[0] == 0:
            return (lookup_key * 0)[0][0]

        output = output / kernel_sum
        return output

    def upsert(self, key, value):
        """Store `value` under `key` and rebuild the KDTree over all keys.

        No explicit eviction is done here; the LRU was created with
        `size=max_memory`.
        """
        self.lru[tuple(key)] = value
        self._tree_keys = self.lru.keys()
        self.kd_tree = KDTree(self._tree_keys)
class FileServer(fileService_pb2_grpc.FileserviceServicer):
    """gRPC file service for one node of the storage cluster.

    The node whose local ``primaryStatus`` flag is 1 acts as the leader: it
    splits uploads into shards, sends them to the least loaded nodes, records
    where each shard lives and assembles downloads. Every other node only
    stores, serves and deletes the individual shards it is handed.
    """

    # 4 MB is the max data packet size in gRPC while sending, so payloads
    # are streamed in slices below that limit.
    STREAM_CHUNK_SIZE = 4000000

    def __init__(self, hostname, server_port, activeNodesChecker,
                 shardingHandler, superNodeAddress):
        self.serverPort = server_port
        self.serverAddress = hostname + ":" + server_port
        self.activeNodesChecker = activeNodesChecker
        self.shardingHandler = shardingHandler
        self.hostname = hostname
        # Maps "<username>_<filename>" to the directory of the cached copy.
        self.lru = LRU(5)
        self.superNodeAddress = superNodeAddress

    def _sliceBytes(self, dataBytes):
        """Yield consecutive slices of dataBytes small enough for one message."""
        step = self.STREAM_CHUNK_SIZE
        for start in range(0, len(dataBytes), step):
            yield dataBytes[start:start + step]

    #
    # This service gets invoked when user uploads a new file.
    #
    def UploadFile(self, request_iterator, context):
        """Store an uploaded stream; returns an ack message.

        On the leader the stream is sharded across the cluster, otherwise the
        whole stream is saved as a single chunk in the local db.
        """
        print("Inside Server method ---------- UploadFile")
        active_ip_channel_dict = self.activeNodesChecker.getActiveChannels()
        if int(db.get("primaryStatus")) == 1:
            return self._shardAcrossCluster(request_iterator,
                                            active_ip_channel_dict)
        return self._storeChunkLocally(request_iterator,
                                       active_ip_channel_dict)

    def _shardAcrossCluster(self, request_iterator, active_ip_channel_dict):
        """Leader side of UploadFile: shard the stream and record metadata."""
        print("Inside primary upload")

        # Step 1:
        # Get 2 least loaded nodes based on the CPU stats.
        # 'node' is where the actual data goes and 'node_replica' is where
        # the replica will go.
        node, node_replica = self.getLeastLoadedNode()
        if node == -1:
            return fileService_pb2.ack(
                success=False, message="Error Saving File. No active nodes.")

        # Step 2:
        # The first message names the file; refuse empty streams and files
        # that already exist.
        requests = iter(request_iterator)
        first = next(requests, None)
        if first is None:
            return fileService_pb2.ack(
                success=False, message="Error Saving File. No data received.")
        username, filename = first.username, first.filename
        print("Key is-----------------", username + "_" + filename)
        if self.fileExists(username, filename) == 1:
            print("sending neg ack")
            return fileService_pb2.ack(
                success=False,
                message="File already exists for this user. Please rename or delete file first.")

        # Step 3:
        # Make chunks of size 'UPLOAD_SHARD_SIZE' and send each one to the
        # least utilized node through gRPC streaming.
        metaData = []  # one [node, seqNo, node_replica] entry per shard
        seqNo = 1
        currDataBytes = first.data
        currDataSize = sys.getsizeof(first.data)
        for request in requests:
            pieceSize = sys.getsizeof(request.data)
            if currDataSize + pieceSize > UPLOAD_SHARD_SIZE:
                response = self.sendDataToDestination(
                    currDataBytes, node, node_replica, username, filename,
                    seqNo, active_ip_channel_dict[node])
                if not response.success:
                    return fileService_pb2.ack(success=False,
                                               message="Error Saving File.")
                metaData.append([node, seqNo, node_replica])
                currDataBytes = request.data
                currDataSize = pieceSize
                seqNo += 1
                node, node_replica = self.getLeastLoadedNode()
            else:
                currDataSize += pieceSize
                currDataBytes += request.data

        # Whatever is still buffered forms the last shard.
        response = self.sendDataToDestination(
            currDataBytes, node, node_replica, username, filename, seqNo,
            active_ip_channel_dict[node])
        if not response.success:
            # Never record or replicate metadata for a shard that was not
            # stored.
            return fileService_pb2.ack(success=False,
                                       message="Error Saving File.")
        metaData.append([node, seqNo, node_replica])

        # Step 4:
        # Save the metadata on the primary node after the completion of
        # sharding.
        db.saveMetaData(username, filename, metaData)
        db.saveUserFile(username, filename)

        # Step 5:
        # Make a gRPC call to replicate the metadata on all the other nodes.
        self.saveMetadataOnAllNodes(username, filename, metaData)
        return fileService_pb2.ack(success=True, message="Saved")

    def _storeChunkLocally(self, request_iterator, active_ip_channel_dict):
        """Non-leader side of UploadFile: save one chunk, then replicate it."""
        print("Saving the data on my local db")
        username, filename = "", ""
        sequenceNumberOfChunk = 0
        replicaNode = ""
        pieces = []

        # Gather all the data from gRPC stream
        for request in request_iterator:
            username, filename = request.username, request.filename
            sequenceNumberOfChunk = request.seqNo
            replicaNode = request.replicaNode
            pieces.append(request.data)
        if not pieces:
            return fileService_pb2.ack(
                success=False, message="Error Saving File. No data received.")
        dataToBeSaved = b"".join(pieces)

        # Save the data in local DB.
        key = username + "_" + filename + "_" + str(sequenceNumberOfChunk)
        db.setData(key, dataToBeSaved)

        # After saving the chunk in the local DB, make a gRPC call to save
        # the replica of the chunk on a different node, only if a replica
        # node was named. This runs on a separate thread.
        if replicaNode != "":
            print("Sending replication to ", replicaNode)
            replica_channel = active_ip_channel_dict[replicaNode]
            t1 = Thread(target=self.replicateChunkData,
                        args=(
                            replica_channel,
                            dataToBeSaved,
                            username,
                            filename,
                            sequenceNumberOfChunk,
                        ))
            t1.start()

        return fileService_pb2.ack(success=True, message="Saved")

    def replicateChunkData(self, replica_channel, dataToBeSaved, username,
                           filename, sequenceNumberOfChunk):
        """Upload one chunk to the node behind replica_channel."""
        stub = fileService_pb2_grpc.FileserviceStub(replica_channel)
        response = stub.UploadFile(
            self.sendDataInStream(dataToBeSaved, username, filename,
                                  sequenceNumberOfChunk, ""))

    # This helper method is responsible for sending the data to destination
    # node through gRPC stream.
    def sendDataToDestination(self, currDataBytes, node, nodeReplica,
                              username, filename, seqNo, channel):
        """Store one shard on `node` and return the resulting ack.

        When `node` is this server the shard is written to the local db and,
        if a replica node is named, uploaded to it as well. Otherwise the
        shard is uploaded to `node` through `channel`.
        """
        if node == self.serverAddress:
            key = username + "_" + filename + "_" + str(seqNo)
            db.setData(key, currDataBytes)
            if nodeReplica != "":
                print("Sending replication to ", nodeReplica)
                active_ip_channel_dict = self.activeNodesChecker.getActiveChannels()
                replica_channel = active_ip_channel_dict[nodeReplica]
                stub = fileService_pb2_grpc.FileserviceStub(replica_channel)
                return stub.UploadFile(
                    self.sendDataInStream(currDataBytes, username, filename,
                                          seqNo, ""))
            # Saved locally with no replica requested: callers still read
            # `.success` on the result, so report the local save.
            return fileService_pb2.ack(success=True, message="Saved")

        print("Sending the UPLOAD_SHARD_SIZE to node :", node)
        stub = fileService_pb2_grpc.FileserviceStub(channel)
        response = stub.UploadFile(
            self.sendDataInStream(currDataBytes, username, filename, seqNo,
                                  nodeReplica))
        print("Response from uploadFile: ", response.message)
        return response

    # This helper method actually makes chunks of less than 4MB and streams
    # them through gRPC.
    def sendDataInStream(self, dataBytes, username, filename, seqNo,
                         replicaNode):
        """Yield FileData messages that together carry dataBytes."""
        for chunk in self._sliceBytes(dataBytes):
            yield fileService_pb2.FileData(username=username,
                                           filename=filename,
                                           data=chunk,
                                           seqNo=seqNo,
                                           replicaNode=replicaNode)

    #
    # This service gets invoked when user requests an uploaded file.
    #
    def DownloadFile(self, request, context):
        """Stream the requested file (leader) or chunk (other nodes).

        A single empty FileData with seqNo=0 is streamed when the leader
        does not know the file.
        """
        print("Inside Download")

        # If the node is not the leader, just fetch the file chunk from the
        # local db and stream it back to the leader.
        if int(db.get("primaryStatus")) != 1:
            key = request.username + "_" + request.filename + "_" + str(
                request.seqNo)
            print(key)
            data = db.getFileData(key)
            for chunk in self._sliceBytes(data):
                yield fileService_pb2.FileData(username=request.username,
                                               filename=request.filename,
                                               data=chunk,
                                               seqNo=request.seqNo)
            return

        print("Inside primary download")

        # Check if file exists
        if self.fileExists(request.username, request.filename) == 0:
            print("File does not exist")
            yield fileService_pb2.FileData(username=request.username,
                                           filename=request.filename,
                                           data=bytes("", 'utf-8'),
                                           seqNo=0)
            return

        cacheKey = request.username + "_" + request.filename

        # If the file is present in cache then just fetch it and return.
        # No need to go to individual nodes.
        if self.lru.has_key(cacheKey):
            print("Fetching data from Cache")
            outfile = os.path.join(self.lru[cacheKey], cacheKey)
            with open(outfile, 'rb') as infile:
                while True:
                    chunk = infile.read(self.STREAM_CHUNK_SIZE)
                    if not chunk:
                        break
                    yield fileService_pb2.FileData(
                        username=request.username,
                        filename=request.filename,
                        data=chunk,
                        seqNo=1)
            return

        # The file is not present in the cache, fetch it from the nodes.
        print("Fetching the metadata")

        # Step 1: get metadata i.e. the location of chunks.
        metaData = db.parseMetaData(request.username, request.filename)
        print(metaData)

        # Step 2: make gRPC calls and get the fileData from all the nodes.
        downloadHelper = DownloadHelper(self.hostname, self.serverPort,
                                        self.activeNodesChecker)
        data = downloadHelper.getDataFromNodes(request.username,
                                               request.filename, metaData)
        print("Sending the data to client")

        # Step 3: send the file to supernode using gRPC streaming.
        for chunk in self._sliceBytes(data):
            yield fileService_pb2.FileData(username=request.username,
                                           filename=request.filename,
                                           data=chunk,
                                           seqNo=request.seqNo)

        # Step 4: update the cache based on LRU (least recently used)
        # algorithm.
        self.saveInCache(request.username, request.filename, data)

    # This service is responsible fetching all the files.
    def FileList(self, request, context):
        """Return the user's file names as one stringified value."""
        print("File List Called")
        userFiles = db.getUserFiles(request.username)
        return fileService_pb2.FileListResponse(Filenames=str(userFiles))

    # This helper method checks whether the file is present in db or not.
    def fileExists(self, username, filename):
        """Return db.keyExists() for the "<username>_<filename>" key."""
        exists = db.keyExists(username + "_" + filename)
        print("isFile Present", exists)
        return exists

    # This helper method returns 2 least loaded nodes from the cluster.
    def getLeastLoadedNode(self):
        """Return (node, node_replica) as chosen by the sharding handler."""
        print("Ready to enter sharding handler")
        node, node_replica = self.shardingHandler.leastUtilizedNode()
        print("Least loaded node is :", node)
        print("Replica node - ", node_replica)
        return node, node_replica

    # This helper method replicates the metadata on all nodes.
    def saveMetadataOnAllNodes(self, username, filename, metadata):
        """Send the shard metadata to every node whose channel is alive."""
        print("saveMetadataOnAllNodes")
        active_ip_channel_dict = self.activeNodesChecker.getActiveChannels()
        uniqueFileName = username + "_" + filename
        for ip, channel in active_ip_channel_dict.items():
            if self.isChannelAlive(channel):
                stub = fileService_pb2_grpc.FileserviceStub(channel)
                response = stub.MetaDataInfo(
                    fileService_pb2.MetaData(
                        filename=uniqueFileName,
                        seqValues=str(metadata).encode('utf-8')))
                print(response.message)

    # This service is responsible for saving the metadata on local db.
    def MetaDataInfo(self, request, context):
        """Persist metadata pushed by the leader and acknowledge it."""
        print("Inside Metadatainfo")
        fileName = request.filename
        seqValues = request.seqValues
        db.saveMetaDataOnOtherNodes(fileName, seqValues)
        ack_message = "Successfully saved the metadata on " + self.serverAddress
        return fileService_pb2.ack(success=True, message=ack_message)

    # This helper method checks whether the created channel is alive or not.
    def isChannelAlive(self, channel):
        """Return True when the channel becomes ready within one second."""
        try:
            grpc.channel_ready_future(channel).result(timeout=1)
        except grpc.FutureTimeoutError:
            return False
        return True

    # This helper method is responsible for updating the cache for faster
    # lookup.
    def saveInCache(self, username, filename, data):
        """Write `data` to the on-disk cache and register it in the LRU."""
        cacheKey = username + "_" + filename
        # Only a brand-new key can push the oldest entry out, so only then
        # delete the file that belongs to it.
        if (not self.lru.has_key(cacheKey)
                and len(self.lru) >= self.lru.get_size()):
            fileToDel, path = self.lru.peek_last_item()
            try:
                os.remove(path + "/" + fileToDel)
            except FileNotFoundError:
                # Already gone from disk; nothing left to clean up.
                pass
        os.makedirs('cache', exist_ok=True)
        filePath = os.path.join('cache', cacheKey)
        with open(filePath, 'wb') as saveFile:
            saveFile.write(data)
        # Register the entry only after the file was written successfully.
        self.lru[cacheKey] = "cache"

    def _dropFromCache(self, username, filename):
        """Forget the cached copy of a file, if there is one."""
        cacheKey = username + "_" + filename
        if not self.lru.has_key(cacheKey):
            return
        cachedFile = os.path.join(self.lru[cacheKey], cacheKey)
        del self.lru[cacheKey]
        try:
            os.remove(cachedFile)
        except FileNotFoundError:
            # Already gone from disk; the LRU entry is what mattered.
            pass

    # This service is responsible for sending the whole cluster stats to
    # superNode.
    def getClusterStats(self, request, context):
        """Return cpu/disk/memory usage averaged over all reachable nodes.

        All three values are reported as 100.00 when no node is reachable.
        """
        print("Inside getClusterStats")
        active_ip_channel_dict = self.activeNodesChecker.getActiveChannels()
        total_cpu_usage, total_disk_space, total_used_mem = 0.0, 0.0, 0.0
        total_nodes = 0
        for ip, channel in active_ip_channel_dict.items():
            if self.isChannelAlive(channel):
                stub = heartbeat_pb2_grpc.HearBeatStub(channel)
                stats = stub.isAlive(heartbeat_pb2.NodeInfo(ip="", port=""))
                # Accumulate: the sums are divided by the node count below.
                total_cpu_usage += float(stats.cpu_usage)
                total_disk_space += float(stats.disk_space)
                total_used_mem += float(stats.used_mem)
                total_nodes += 1

        if total_nodes == 0:
            return fileService_pb2.ClusterStats(cpu_usage=str(100.00),
                                                disk_space=str(100.00),
                                                used_mem=str(100.00))

        return fileService_pb2.ClusterStats(
            cpu_usage=str(total_cpu_usage / total_nodes),
            disk_space=str(total_disk_space / total_nodes),
            used_mem=str(total_used_mem / total_nodes))

    # This service is responsible for sending the leader info to superNode
    # as soon as leader changes.
    def getLeaderInfo(self, request, context):
        """Send this node's address to the super node as cluster "team1"."""
        channel = grpc.insecure_channel('{}'.format(self.superNodeAddress))
        stub = fileService_pb2_grpc.FileserviceStub(channel)
        response = stub.getLeaderInfo(
            fileService_pb2.ClusterInfo(ip=self.hostname,
                                        port=self.serverPort,
                                        clusterName="team1"))
        print(response.message)

    #
    # This service gets invoked when user deletes a file.
    #
    def FileDelete(self, request, data):
        """Delete a file cluster-wide (leader) or one chunk locally."""
        username = request.username
        filename = request.filename

        if int(db.get("primaryStatus")) == 1:
            if self.fileExists(username, filename) == 0:
                print("File does not exist")
                return fileService_pb2.ack(success=False,
                                           message="File does not exist")

            print("Fetching metadata from leader")
            metadata = db.parseMetaData(request.username, request.filename)
            print("Successfully retrieved metadata from leader")

            deleteHelper = DeleteHelper(self.hostname, self.serverPort,
                                        self.activeNodesChecker)
            deleteHelper.deleteFileChunksAndMetaFromNodes(
                username, filename, metadata)
            # Otherwise a later upload under the same name would still be
            # served from the old cached copy.
            self._dropFromCache(username, filename)

            return fileService_pb2.ack(
                success=True,
                message="Successfully deleted file from the cluster")

        try:
            seqNo = request.seqNo
        except AttributeError:
            return fileService_pb2.ack(success=False,
                                       message="Internal Error")

        metaDataKey = username + "_" + filename
        dataChunkKey = username + "_" + filename + "_" + str(seqNo)

        if db.keyExists(metaDataKey) == 1:
            print("FileDelete: Deleting the metadataEntry from local db :")
            db.deleteEntry(metaDataKey)
        if db.keyExists(dataChunkKey):
            print("FileDelete: Deleting the data chunk from local db: ")
            db.deleteEntry(dataChunkKey)

        return fileService_pb2.ack(
            success=True,
            message="Successfully deleted file from the cluster")

    #
    # This service gets invoked when user wants to check if the file is
    # present.
    #
    def FileSearch(self, request, data):
        """Acknowledge whether the file is known to this node's db."""
        username, filename = request.username, request.filename

        if self.fileExists(username, filename) == 1:
            return fileService_pb2.ack(success=True,
                                       message="File exists in the cluster.")
        return fileService_pb2.ack(
            success=False, message="File does not exist in the cluster.")

    #
    # This service gets invoked when user wants to update a file.
    #
    def UpdateFile(self, request_iterator, context):
        """Replace a file: delete the stored version, then upload the new one.

        Both steps go through a stub connected to this server's own address.
        Only the leader performs the update; any other node answers with a
        failure ack.
        """
        username, filename = "", ""
        pieces = []
        for request in request_iterator:
            pieces.append(request.data)
            username, filename = request.username, request.filename
        fileData = b"".join(pieces)

        if int(db.get("primaryStatus")) != 1:
            return fileService_pb2.ack(success=False,
                                       message="Internal error.")

        channel = grpc.insecure_channel('{}'.format(self.serverAddress))
        stub = fileService_pb2_grpc.FileserviceStub(channel)

        response1 = stub.FileDelete(
            fileService_pb2.FileInfo(username=username, filename=filename))
        if not response1.success:
            return fileService_pb2.ack(success=False,
                                       message="Internal error.")

        sTime = time.time()
        response2 = stub.UploadFile(
            self.sendDataInStream(fileData, username, filename, 1, ""))
        print("Time for upload= ", time.time() - sTime)
        if response2.success:
            return fileService_pb2.ack(
                success=True, message="File suceessfully updated.")
        return fileService_pb2.ack(success=False, message="Internal error.")
class Cache:
    """Space-bounded object cache with LRU or FIFO eviction.

    `hashmap` maps every cached object name to its size and `free_space`
    tracks the capacity that is still unused. `directory` arguments are
    notified through `removeBlock(oid, name)` whenever an object is evicted.
    """

    # Replacement policies
    LRU = "LRU"
    FIFO = 'FIFO'

    def __init__(self, name, size, policy):
        self.name = name
        self.size = size
        self.free_space = size
        self.policy = policy  # Eviction policy
        self.hashmap = {}  # Mapping objname -> objsize
        if self.policy == Cache.LRU:
            # Inside a method the bare name LRU is the module-level
            # container class, not the Cache.LRU policy string.
            self.cache = LRU(self.size)
        elif self.policy == Cache.FIFO:
            self.cache = queue.Queue(maxsize=self.size)

        # Statistics
        self.hit_count = 0
        self.miss_count = 0

    def has_key(self, key):
        """Return True when `key` is currently cached."""
        return key in self.hashmap

    def update(self, key, size):
        """Record a hit on `key` and refresh its recency under LRU."""
        self.hashmap[key] = size
        self.hit_count += 1
        if self.policy == Cache.LRU:
            # Item assignment refreshes recency for this key.
            self.cache[key] = size
        # FIFO: a hit must not enqueue the key a second time, otherwise it
        # would later be evicted twice.

    def insert(self, key, size, directory):
        """Insert a new object using the configured policy."""
        if self.policy == Cache.LRU:
            self.insertLRU(key, size, directory)
        elif self.policy == Cache.FIFO:
            self.insertFIFO(key, size, directory)

    def evictLRU(self, directory):
        """Evict the least recently used object and reclaim its space."""
        oid = self.cache.peek_last_item()[0]
        directory.removeBlock(oid, self.name)
        del self.cache[oid]
        self.free_space += int(self.hashmap.pop(oid))

    def evictFIFO(self, directory):
        """Evict the oldest inserted object and reclaim its space."""
        oid = self.cache.get()
        directory.removeBlock(oid, self.name)
        self.free_space += int(self.hashmap.pop(oid))

    def _reserve(self, size, evict, directory):
        """Count a miss and make room for `size`; False if it can never fit."""
        self.miss_count += 1
        needed = int(size)
        if needed > self.size:
            # Bigger than the whole cache: evicting everything would not
            # help, so the object is simply not cached.
            return False
        while needed > self.free_space and self.hashmap:
            evict(directory)
        self.free_space -= needed
        return True

    def insertLRU(self, key, size, directory):
        """Insert under LRU, evicting least recently used objects as needed."""
        if self._reserve(size, self.evictLRU, directory):
            self.cache[key] = size
            self.hashmap[key] = size

    def insertFIFO(self, key, size, directory):
        """Insert under FIFO, evicting the oldest objects as needed."""
        if self._reserve(size, self.evictFIFO, directory):
            self.cache.put(key)
            self.hashmap[key] = size

    def put(self, key, size, directory):
        """Record an access: a hit when `key` is cached, otherwise an insert."""
        if self.has_key(key):
            self.update(key, size)
        else:
            self.insert(key, size, directory)

    def print(self):
        """Print the cache name, policy, size map and eviction order."""
        if self.policy == Cache.LRU:
            print(self.name, "LRU", self.hashmap, self.cache.items())
        elif self.policy == Cache.FIFO:
            print(self.name, "FIFO", self.hashmap, list(self.cache.queue))

    def remove(self, key):
        """Drop `key` from the cache and give its space back.

        Raises KeyError when `key` is not cached.
        """
        self.free_space += int(self.hashmap.pop(key))
        if self.policy == Cache.LRU:
            del self.cache[key]
        elif self.policy == Cache.FIFO:
            # queue.Queue has no removal API; edit its underlying deque
            # while holding the queue's own mutex.
            with self.cache.mutex:
                self.cache.queue.remove(key)
# Walk-through of the LRU container API. The "Would print" comments give the
# expected output of the statement directly above them.
from lru import LRU

l = LRU(5)         # Create an LRU container that can hold 5 items

print(l.peek_first_item(), l.peek_last_item())  # peek at the MRU and LRU (key, value) items
# Would print None None

for i in range(5):
    l[i] = str(i)
print(l.items())    # Prints items in MRU order
# Would print [(4, '4'), (3, '3'), (2, '2'), (1, '1'), (0, '0')]

print(l.peek_first_item(), l.peek_last_item())  # peek at the MRU and LRU (key, value) items
# Would print (4, '4') (0, '0')

l[5] = '5'         # Inserting one more item should evict the old item
print(l.items())
# Would print [(5, '5'), (4, '4'), (3, '3'), (2, '2'), (1, '1')]

l[3]               # Accessing an item would make it MRU
print(l.items())
# Would print [(3, '3'), (5, '5'), (4, '4'), (2, '2'), (1, '1')]
# Now 3 is in front

l.keys()           # Can get keys alone in MRU order (result is discarded here, nothing is printed)
# Would return [3, 5, 4, 2, 1]

del l[4]           # Delete an item
print(l.items())
# Would print [(3, '3'), (5, '5'), (2, '2'), (1, '1')]

print(l.get_size())  # Prints the size the container was created with
class Cache:
    """Class representing D3N.

    One cache layer of the simulation.  Depending on the replacement policy
    the cached keys live in an ``LRU`` container (``LRU`` / ``LRU_S``) or a
    ``deque`` (``FIFO``); ``hashmap`` maps keys to sizes for the
    space-accounted paths and ``spaceLeft`` is the remaining budget.  The
    ``LRU_S`` policy additionally keeps a ``shadow`` LRU of ``full_size``
    entries and a ``hist`` list counting, per shadow position, how often a
    key was found there.
    """

    # Replacement policies
    LRU = "LRU"
    LFU = "LFU"
    LRU_S = "LRU_S"
    FIFO = "FIFO"
    RAND = "RAND"

    # Write policies
    WRITE_BACK = "WB"
    WRITE_THROUGH = "WT"

    # Layer
    L1 = "L1"
    L2 = "L2"

    # Hashing schemes understood by get_l2_address
    consistent = "consistent"
    rendezvous = "rendezvous"
    rr = "rr"

    def __init__(self, layer, size, replace_pol, write_pol, hash_ring,
                 hash_type, obj_size, full_size, logger):
        """Set up an empty cache layer.

        A ``size`` of 0 marks the layer as ``zerosize``: inserts become
        no-ops and reads always miss.  The container is then built with
        capacity 1.
        """
        self._replace_pol = replace_pol  # Replacement policy
        self._write_pol = write_pol  # Write policy
        self._layer = layer  # Layer info
        self._size = size  # Cache size
        self.spaceLeft = size  # Remaining space
        self._logger = logger
        self.hashmap = {}  # Mapping <key, size>
        self.hash_ring = hash_ring
        self._hash_type = hash_type
        self._obj_size = obj_size

        if self._size == 0:
            self.zerosize = True
            self._size = 1
        else:
            self.zerosize = False

        if self._replace_pol == Cache.LRU:
            self.cache = LRU(self._size)
        elif self._replace_pol == Cache.FIFO:
            self.cache = deque()
        elif self._replace_pol == Cache.LRU_S:
            self.cache = LRU(self._size)
            self.shadow = LRU(full_size)
            self.hist = [0] * full_size

        # Statistics
        self._hit_count = 0
        self._miss_count = 0
        self._backend_bw = 0
        self._crossrack_bw = 0
        self._intrarack_bw = 0
        self.miss_lat = 0
        self.lat_count = 0

    def _store(self, key, size):
        """Place ``key`` in the policy container and charge ``size``."""
        if self._replace_pol in (Cache.LRU, Cache.LRU_S):
            self.cache[key] = size
        elif self._replace_pol == Cache.FIFO:
            self.cache.append(key)
        self.hashmap[key] = size
        self.spaceLeft -= size

    def _insert_with_accounting(self, key, size):
        """Evict until ``size`` fits in ``spaceLeft``, then store ``key``."""
        while size > self.spaceLeft:
            self._evict()
        self._store(key, size)

    def _touch_shadow(self, key):
        """Record the shadow position of ``key`` in ``hist`` and refresh it.

        Keys that are not in the shadow cache are ignored.
        """
        # NOTE(review): the refresh is assumed to apply only to keys already
        # in the shadow cache - confirm against the intended behaviour.
        if not self.shadow.has_key(key):
            return
        for depth, candidate in enumerate(self.shadow.keys()):
            if candidate == key:
                self.hist[depth] += 1
                break
        self.shadow[key] = 1

    def _insert1(self, key, size):
        """Insert ``key`` with space accounting for every policy."""
        if self.zerosize:
            return
        if self._replace_pol == Cache.LRU_S:
            self.shadow[key] = 1
        self._insert_with_accounting(key, int(size))

    def _insert(self, key, size):
        """Insert ``key``.

        ``LRU`` and ``LRU_S`` store straight into the LRU container (and the
        shadow cache for ``LRU_S``) without touching ``hashmap`` or
        ``spaceLeft``.  Every other policy goes through the space-accounted
        path.
        """
        # NOTE(review): the accounting branch is assumed to be the fallback
        # of the policy chain, not of the zerosize test - confirm.
        if self.zerosize:
            return
        size = int(size)
        if self._replace_pol == Cache.LRU_S:
            self.cache[key] = size
            self.shadow[key] = size
        elif self._replace_pol == Cache.LRU:
            self.cache[key] = size
        else:
            self._insert_with_accounting(key, size)

    def read1(self, key, size):
        """Read an object, deciding hit or miss from ``hashmap``.

        Returns 1 for the backend layer or a hit, None for a miss or a
        zero-size cache.  ``size`` is unused.
        """
        if self._layer == "BE":
            return 1
        if self.zerosize:
            return None
        if self._replace_pol == Cache.LRU_S:
            self._touch_shadow(key)
        if key in self.hashmap:
            self._update_use(key)
            self._hit_count += 1
            return 1
        self._miss_count += 1
        return None

    def read(self, key, size):
        """Read an object from the cache.

        ``LRU_S`` decides hit or miss from the LRU container itself and then
        updates the shadow statistics; the other policies consult
        ``hashmap``.  Returns 1 for the backend layer or a hit, None for a
        miss or a zero-size cache.  ``size`` is unused.
        """
        if self._layer == "BE":
            return 1
        if self.zerosize:
            return None
        r = None
        if self._replace_pol == Cache.LRU_S:
            if self.cache.has_key(key):
                self._hit_count += 1
                # Re-assigning the entry makes it the most recently used.
                self.cache[key] = self.cache[key]
                r = 1
            else:
                self._miss_count += 1
            # NOTE(review): the shadow update is assumed to run on hits and
            # misses alike - confirm.
            self._touch_shadow(key)
        elif key in self.hashmap:
            self._update_use(key)
            self._hit_count += 1
            r = 1
        else:
            self._miss_count += 1
        return r

    def checkKey(self, key):
        """Return 1 when ``key`` is cached (or the layer is the backend).

        Only the ``LRU`` and ``LRU_S`` containers are consulted; every other
        policy, and a zero-size cache, yields 0.  Statistics and recency are
        not touched.
        """
        if self._layer == "BE":
            return 1
        if self.zerosize:
            return 0
        if self._replace_pol in (Cache.LRU_S, Cache.LRU):
            return 1 if self.cache.has_key(key) else 0
        return 0

    def _evict(self):
        """Evict one object according to the policy and reclaim its space."""
        if self._replace_pol in (Cache.LRU, Cache.LRU_S):
            victim = self.cache.peek_last_item()[0]
            del self.cache[victim]
        elif self._replace_pol == Cache.FIFO:
            victim = self.cache.popleft()
        else:
            raise NotImplementedError(
                "no eviction implemented for policy %r" % (self._replace_pol,))
        self.spaceLeft += int(self.hashmap[victim])
        del self.hashmap[victim]

    def _update_use(self, key):
        """Update the use of a cache: mark ``key`` as most recently used."""
        if self._replace_pol in (Cache.LRU, Cache.LRU_S):
            self.cache[key] = self.hashmap[key]

    def set_cache_size(self, size):
        """Grow the LRU container's capacity by ``size`` entries."""
        new_size = self.cache.get_size() + int(size)
        self.cache.set_size(int(new_size))

    def set_backend_bw(self, value):
        """Add ``value`` to the backend bandwidth counter."""
        self._backend_bw += value

    def set_crossrack_bw(self, value):
        """Add ``value`` to the cross-rack bandwidth counter."""
        self._crossrack_bw += value

    def set_intrarack_bw(self, value):
        """Add ``value`` to the intra-rack bandwidth counter."""
        self._intrarack_bw += value

    def get_backend_bw(self):
        return self._backend_bw

    def get_crossrack_bw(self):
        return self._crossrack_bw

    def get_intrarack_bw(self):
        return self._intrarack_bw

    def get_replace_pol(self):
        return self._replace_pol

    def get_hit_count(self):
        return self._hit_count

    def get_miss_count(self):
        return self._miss_count

    def get_available_space(self):
        return self.spaceLeft

    def get_replace_poll(self):
        """Alias of get_replace_pol, kept for existing callers."""
        return self._replace_pol

    def reset_shadow_cache(self):
        """Empty the shadow cache."""
        self.shadow.clear()

    def print_cache(self):
        """Print the policy container."""
        print(self.cache)

    def get_l2_address(self, key):
        """Map ``key`` to an L2 node according to the hashing scheme.

        For ``rr`` the key is expected to look like ``<prefix>_<number>``
        and ``hash_ring`` holds the node count.  Returns None for an unknown
        scheme.
        """
        if self._hash_type == Cache.consistent:
            return self.hash_ring.get_node(key)
        elif self._hash_type == Cache.rendezvous:
            return self.hash_ring.find_node(key)
        elif self._hash_type == Cache.rr:
            val = key.split("_")[1]
            return int(val) % int(self.hash_ring)
class RewardNet():
    """Reward model: a network trained to predict rewards for state-action inputs."""

    def __init__(self, state_action_size, reward_size):
        """Build the reward network, its optimizer, replay memory and reward dict.

        Params
        ======
            state_action_size (int): first argument handed to Network
            reward_size (int): second argument handed to Network
        """
        self.state_action_size = state_action_size
        self.reward_size = reward_size
        set_seed()

        # Network, optimizer and loss used to fit the rewards
        network = Network(state_action_size, reward_size)
        self.reward_net = network.to(device)
        self.optimizer = optim.Adam(self.reward_net.parameters(), lr=LR)
        self.criterion = nn.MSELoss()

        # Replay memory of (state_action, reward) samples
        self.memory = ReplayBuffer(BUFFER_SIZE, BATCH_SIZE, 0)

        # Reward dict - LRFU implementation not found, therefore just LRU
        self.M = LRU(BUFFER_SIZE)
        self.S = []
        self.V = 0

        # Most recent training loss, kept for progress tracking
        self.loss = 0

    def add(self, state_action, reward):
        """Store one (state_action, reward) sample in the replay memory."""
        self.memory.add(state_action, reward)

    def add_to_M(self, sa, reward):
        """Record ``reward`` for ``sa`` in the reward dict.

        Once the dict holds BUFFER_SIZE entries or more, its least recently
        used entry is discarded.
        """
        self.M[sa] = reward
        if len(self.M) < BUFFER_SIZE:
            return
        lru_key = self.M.peek_last_item()[0]
        del self.M[lru_key]

    def get_from_M(self, sa):
        """Return the reward recorded for ``sa``, or 0 when there is none."""
        return self.M.get(sa, 0)

    def step(self):
        """Learn from a sampled batch once memory holds more than BATCH_SIZE samples."""
        if len(self.memory) <= BATCH_SIZE:
            return
        batch = self.memory.sample()
        self.learn(batch)

    def act(self, state_action):
        """Return the network's output for one state-action input.

        state_action (array_like): converted with torch.from_numpy, cast to
            float and given a leading batch dimension before the forward pass
        """
        as_tensor = torch.from_numpy(state_action)
        sa = as_tensor.float().unsqueeze(0).to(device)
        return self.reward_net(sa)

    def learn(self, experiences):
        """Run one optimisation step on a batch of experiences.

        experiences (Tuple[torch.Tensor]): (state_actions, rewards) pair
        """
        state_actions, rewards = experiences

        # Forward pass and loss against the recorded rewards
        predicted = self.reward_net(state_actions)
        loss = self.criterion(predicted, rewards)
        print("RewardNet loss = {}".format(loss))

        # Gradient descent step
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        # Remember the loss value for the history
        self.loss = loss.item()
class DatabaseEnv(gym.Env):
    """Gym environment where an agent decides, per candidate join, whether
    to create a view.

    An episode consists of ``max_steps`` sampled workload queries.  Each
    query contributes a queue of candidate vectors and every call to
    ``step`` consumes one candidate together with the agent's action.
    """

    metadata = {'render.modes': ['human']}

    # Locations used unless ``args`` overrides them.
    DEFAULT_DATA_DIR = '/home/richhiey/Desktop/workspace/dbse_project/Self-Driving-Materialized-Views/project/data/JOB'
    DEFAULT_PICKLE_FILE_PATH = '/home/richhiey/Desktop/workspace/dbse_project/Self-Driving-Materialized-Views/project/data/JOB/processed/job_processed.pickle'

    def __init__(self, args=None):
        """Connect to the database and load queries and candidates.

        Args:
            args: optional dict.  The keys ``'data_dir'`` (directory holding
                the ``.sql`` workload files) and ``'pickle_file_path'``
                (pre-processed candidates) override the built-in locations;
                anything else is ignored.
        """
        super(DatabaseEnv, self).__init__()
        options = args if isinstance(args, dict) else {}

        # Number of actions that the database can take
        # { Create View, Do nothing }
        N_DISCRETE_ACTIONS = 2
        # Number of tables in the database being considered
        N_TABLES = 21
        N_JOIN_COMBINATIONS = int((N_TABLES * (N_TABLES - 1)) / 2)
        # Kept on the instance because reset() needs it as well.
        self.n_join_combinations = N_JOIN_COMBINATIONS

        self.database = Database()
        self.table_names = self.database.get_table_names_from_hive()
        self.join_name_mappings = self.get_mapping_for_tables(self.table_names)

        # Maximum number of steps in an episode
        N_MAX_STEPS = 5

        # Define action and observation space
        # They must be gym.spaces objects
        self.action_space = spaces.Discrete(N_DISCRETE_ACTIONS)
        self.observation_space = spaces.Box(
            low=0, high=1, shape=(N_JOIN_COMBINATIONS, ), dtype=np.uint8)

        # Capture information about episode to replay the same
        # on the real database
        self.max_steps = N_MAX_STEPS
        self.history = self.reset_env_history()
        self.current_step = 0
        self.current_views = []
        self.candidate_cost = 100

        self.data_dir = options.get('data_dir', self.DEFAULT_DATA_DIR)
        exclusion_list = ['schema.sql', 'fkindexes.sql']
        self.queries = self.get_queries_from_dataset(
            self.data_dir, exclusion_list)
        pickle_file_path = options.get(
            'pickle_file_path', self.DEFAULT_PICKLE_FILE_PATH)
        self.candidates = self.get_candidates_for_dataset(pickle_file_path)
        self.workload_distribution = self.get_workload_distribution(
            self.queries)
        self.current_candidate_queue = deque()
        self._obs_space = np.zeros(N_JOIN_COMBINATIONS)
        self._current_action = np.zeros(N_JOIN_COMBINATIONS)
        self.lru_cache_size = 20
        self.lru_cache = LRU(self.lru_cache_size)

    def get_mapping_for_tables(self, table_names):
        """Number every unordered table pair and name it ``'<a>-<b>'``.

        ``table_names`` holds rows whose first element is the table name.
        As a side effect ``self.table_names`` is replaced by the plain list
        of names.  The returned dict is keyed from 1 upwards.
        """
        names = []
        for row in table_names:
            name = row[0]
            print(name)
            names.append(name)
        self.table_names = names

        mapping = {}
        num = 0
        for i, first in enumerate(names):
            for second in names[i + 1:]:
                num += 1
                mapping[num] = first + '-' + second
        print(mapping)
        return mapping

    def reset_env_history(self):
        """Return an empty history record for each step 1..max_steps."""
        # step() moves current_step up to and including max_steps, so the
        # last step needs a record too.
        return {
            i: {'actions': [], 'query': ''}
            for i in range(1, self.max_steps + 1)
        }

    def get_workload_distribution(self, queries):
        """Return sampling weights that grow exponentially with query index."""
        # An array of the index value for weighting
        i = np.arange(len(queries))
        # Higher weights for larger index values
        w = np.exp(i / 10.)
        # Weight must be normalized
        w /= w.sum()
        return w

    def get_candidates_for_query(self, query):
        """Return the candidates stored under ``'data/JOB/' + query``."""
        return self.candidates['data/JOB/' + query]

    def get_candidates_for_dataset(self, pickle_file_path):
        """Load the pickled list of dicts and merge it into a single dict."""
        # The pickle is a project artefact; never point this at untrusted
        # files, unpickling executes arbitrary code.
        with open(pickle_file_path, 'rb') as pickle_file:
            candidates = pickle.load(pickle_file)
        new_candidates = {}
        for candidate in candidates:
            new_candidates.update(candidate)
        return new_candidates

    def get_queries_from_dataset(self, dataset_path, exclusion_list):
        """Collect the ``.sql`` file names below ``dataset_path``, minus the
        excluded ones."""
        queries = []
        for root, dirs, files in os.walk(dataset_path):
            queries.extend(
                file for file in files
                if file not in exclusion_list and '.sql' in file)
        return queries

    def step(self, action):
        """Apply ``action`` to the next candidate and return the transition.

        Returns:
            ``(obs, reward, done, history)`` - the episode history is handed
            back in the info slot.
        """
        # Use the action predicted by agent to modify the
        # database environment and calculate reward of the action
        delay_modifier = (self.current_step / self.max_steps)
        print(self.current_step)

        if not self.current_candidate_queue:
            # Previous query exhausted: move on to a freshly sampled query.
            self.current_step = self.current_step + 1
            self.selected_query = np.random.choice(
                self.queries, size=1, p=self.workload_distribution)[0]
            self.history[self.current_step]['query'] = self.selected_query
            candidates = self.get_candidates_for_query(self.selected_query)
            print(self.selected_query)
            for candidate in candidates:
                self.current_candidate_queue.append(candidate.flatten())

        current_candidate = self.current_candidate_queue.popleft()
        print('Action - ' + str(action))
        cand_idx = np.where(current_candidate == 1)[0]
        # NOTE(review): join_name_mappings is keyed from 1 while this index
        # is 0-based - confirm how the candidate vectors were encoded.
        print('Candidate - ' + self.join_name_mappings[int(cand_idx[0])])
        self.lru_cache[self.selected_query] = current_candidate

        # Log some info about this training step
        self.history[self.current_step]['actions'].append({
            'action': action,
            'candidate': current_candidate,
            'obs_space': self._obs_space,
            'eviction': self.lru_cache.peek_last_item(),
        })

        reward, eviction = self._take_action(action, current_candidate,
                                             delay_modifier)
        print('Reward - ' + str(reward))

        # The episode ends once the last step's candidates are used up.  On
        # that step _take_action has already returned the episode reward,
        # so it is not computed a second time here.
        done = (self.current_step >= self.max_steps
                and not self.current_candidate_queue)
        obs = self._next_observation()
        return obs, reward, done, self.history

    # Reset the state of the environment to an initial state
    def reset(self):
        """Start a new episode and return the initial observation."""
        self.history = self.reset_env_history()
        self.current_step = 0
        self.current_views = []
        self.candidate_cost = 100
        # Drop candidates left over from an unfinished episode.
        self.current_candidate_queue.clear()
        self._obs_space = np.zeros(self.n_join_combinations)
        self._current_action = np.zeros(self.n_join_combinations)
        self.lru_cache = LRU(self.lru_cache_size)
        return self._next_observation()

    def render(self, mode='human', close=False):
        pass

    def _next_observation(self):
        return self._obs_space

    @staticmethod
    def _view_creation_query(tbl_1, tbl_2):
        """Build the statement text used to create the view for two tables."""
        # NOTE(review): the resulting text is
        # "<a> JOIN <b>;CREATE VIEW IF NOT EXISTS <a>_<b> AS <a> JOIN <b>;",
        # which does not look like valid SQL - confirm the intended query.
        view_name = str(tbl_1) + '_' + str(tbl_2)
        query_str = str(tbl_1) + ' JOIN ' + str(tbl_2) + ';'
        return (query_str + "CREATE VIEW IF NOT EXISTS " + view_name +
                " AS " + query_str)

    def env_cost_of_episode(self):
        """Replay the recorded episode on the database and time it.

        For every recorded step the query is executed, followed by a view
        creation for each action that chose to create one.  Returns the
        summed wall-clock time in seconds.
        """
        run_time = 0
        for step, step_history in self.history.items():
            query = step_history['query']
            if not query:
                # Step not reached yet - nothing to replay.
                continue
            print('------------ Step - ' + str(step) + ' -------------')
            # First run the query and check the base cost
            print(query)
            with open(os.path.join(self.data_dir, query), 'r') as f:
                query_str = f.read()
            start_time = time.time()
            print('Actually executing on database now ..')
            self.database.execute_query(query_str)
            total_time = time.time() - start_time
            print('Time taken - ' + str(total_time))
            run_time = run_time + total_time
            print('Execution done!')

            # Then run through the history and get costs for the actions
            # taken by the agent
            for record in step_history['actions']:
                if not record['action']:
                    continue
                idx = np.where(record['candidate'] == 1)[0]
                print(idx)
                tables = self.join_name_mappings[int(idx[0])].split('-')
                view_query = self._view_creation_query(tables[0], tables[1])
                start_time = time.time()
                self.database.execute_query(view_query)
                total_time = time.time() - start_time
                print('View Creation Time taken - ' + str(total_time))
                run_time = run_time + total_time
            print('Total runtime - ' + str(run_time))
            print('---------------------------------------------------')
        return run_time

    def hawc_cost_for_episode(self):
        """Return the reference cost; currently a random integer in [0, 100)."""
        return np.random.randint(0, 100)

    def calculate_reward_for_episode(self):
        """Return the episode reward from the measured and reference costs."""
        initial_reward = 20
        env_reward = self.env_cost_of_episode()
        print(env_reward)
        hawc_reward = self.hawc_cost_for_episode()
        # NOTE(review): the denominator is zero whenever the random
        # reference cost equals initial_reward.
        return ((env_reward - initial_reward) /
                (hawc_reward - initial_reward)) * 1000

    def _take_action(self, action, candidate, delay_modifier):
        """Return ``(reward, False)`` for ``action``.

        Before the final steps the reward is 1 for creating a view and 0
        otherwise; from step ``max_steps - 1`` on it is the episode reward.
        ``candidate`` and ``delay_modifier`` are currently unused.
        """
        if self.current_step < self.max_steps - 1:
            reward = 1 if action else 0
        else:
            # - Do some magic to get cost of the queries
            #   to calculate a useful cost for episode
            # - Calculate reward using that
            reward = self.calculate_reward_for_episode()
        return reward, False
class Streamer:
    """ streamer for flows management

    Packets are grouped into flows kept in an LRU container.  Flows are
    exported (handed to the classifiers and queued for the consumer) when
    their status says so, when they become inactive, or when the streamer
    terminates.
    """
    num_streamers = 0

    def __init__(self, source=None, capacity=128000, active_timeout=120,
                 inactive_timeout=60, user_metrics=None,
                 user_classifiers=None, enable_ndpi=True):
        """Create a streamer reading packets from ``source``.

        ``user_classifiers`` may be a single classifier or an iterable of
        them; when iterable, only NFStreamClassifier instances are kept.
        """
        Streamer.num_streamers += 1
        self.__exports = []
        self.source = source
        self.__flows = LRU(capacity, callback=emergency_callback)  # LRU cache
        self._capacity = self.__flows.get_size()  # Streamer capacity (default: 128000)
        self.active_timeout = active_timeout  # expiration active timeout
        self.inactive_timeout = inactive_timeout  # expiration inactive timeout
        self.current_flows = 0  # counter for stored flows
        self.flows_number = 0
        self.current_tick = 0  # current timestamp
        self.processed_packets = 0  # total processed packet counter
        # Per-flow dictionaries shared with the Flow objects
        self.flow_cache = OrderedDict()
        self.user_classifiers = {}
        if user_classifiers is not None:
            try:
                classifier_iterator = iter(user_classifiers)
            except TypeError:
                # Not iterable: a single classifier was given.
                self.user_classifiers[user_classifiers.name] = user_classifiers
            else:
                for classifier in classifier_iterator:
                    if isinstance(classifier, NFStreamClassifier):
                        self.user_classifiers[classifier.name] = classifier
        self.user_metrics = {}
        if enable_ndpi:
            ndpi_classifier = NDPIClassifier('ndpi')
            self.user_classifiers[ndpi_classifier.name] = ndpi_classifier
        if user_metrics is not None:
            self.user_metrics = user_metrics

    def _get_capacity(self):
        """ getter for capacity attribute """
        return self.__flows.get_size()

    def _set_capacity(self, new_size):
        """ setter for capacity size attribute """
        return self.__flows.set_size(new_size)

    capacity = property(_get_capacity, _set_capacity)

    def terminate(self):
        """ terminate all entries in Streamer """
        while True:
            # peek_last_item() yields None once the container is empty.
            oldest = self.__flows.peek_last_item()
            if oldest is None:
                break
            flow = oldest[1]
            flow.export_reason = 2
            self.exporter(flow)
        for classifier in self.user_classifiers.values():
            classifier.on_exit()

    def exporter(self, flow):
        """ export method for a flow trigger_type:0(inactive), 1(active), 2(flush) """
        # Terminate the flow in every classifier
        for classifier in self.user_classifiers.values():
            classifier.on_flow_terminate(flow)
        # Delete the flow register from the active flows collection
        del self.__flows[flow.key]
        # Decrease the number of active flows by 1
        self.current_flows -= 1
        # Add the expired flow register to the final flows collection
        self.__exports.append(flow)

    def inactive_watcher(self):
        """ inactive expiration management """
        # inactive_timeout is multiplied by 1000 before being compared with
        # the tick difference.
        limit = self.inactive_timeout * 1000
        while True:
            # Least recently used flow, or None when no flow is left
            oldest = self.__flows.peek_last_item()
            if oldest is None:
                break
            flow = oldest[1]
            if (self.current_tick - flow.end_time) < limit:
                # The least recently used flow is still within the timeout:
                # stop until the watcher is called again.
                break
            flow.export_reason = 0
            self.exporter(flow)

    def consume(self, pkt_info):
        """ consume a packet and update Streamer status """
        self.processed_packets += 1  # increment total processed packet counter
        # Obtain a flow hash key for identification of the flow
        key = get_flow_key(pkt_info)
        print("\nCONSUMING PACKET FROM FLOW:", key)
        if key in self.__flows:
            print("FLOW FOUND - UPDATING STATISTICS")
            # Status of the flow that the packet belongs to:
            # -1 active flow - 0 inactive flow - 1 active flow timeout expired
            # 2 flush remaining flows in LRU - 3 FIN flag detected
            # 4 RST flag detected - 5 FIN flag timer expired
            flow_status = self.__flows[key].update_and_check_flow_status(
                pkt_info, self.active_timeout, self.user_classifiers,
                self.user_metrics)
            if flow_status == 1:
                # Active timeout expired: export the old register and start
                # a new one for the current packet under the same key.
                self.exporter(self.__flows[key])
                flow = Flow(pkt_info, self.user_classifiers,
                            self.user_metrics, self.flow_cache)
                self.__flows[flow.key] = flow
                # The exporter decremented the counter for the old register;
                # count the replacement so the counter keeps matching the
                # number of stored flows.
                self.current_flows += 1
                # Reset the flow_cache entry without failing when the old
                # one is already gone.
                self.flow_cache.pop(flow.key, None)
                self.flow_cache[flow.key] = {}
                flow.create_new_flow_record(pkt_info, self.user_classifiers,
                                            self.user_metrics)
            elif flow_status in (3, 4):
                # FIN seen in both directions, or RST seen: export the flow
                self.exporter(self.__flows[key])
            elif flow_status == 5:
                # FIN flag timer expired
                self.exporter(self.__flows[key])
                print("****FLOW EXPORTED")
        else:
            # This packet belongs to a new flow
            print("FLOW NOT FOUND - CREATING NEW FLOW REGISTER")
            # Update flow counters
            self.current_flows += 1
            self.flows_number += 1
            # Create the new flow object and add it to the LRU
            flow = Flow(pkt_info, self.user_classifiers, self.user_metrics,
                        self.flow_cache)
            self.__flows[flow.key] = flow
            # Create the entry on the flow_cache with the flow key
            self.flow_cache[flow.key] = {}
            # Create the new bidirectional flow record
            flow.create_new_flow_record(pkt_info, self.user_classifiers,
                                        self.user_metrics)
            # Advance the streamer clock to the new flow's start time
            self.current_tick = flow.start_time
            # Export the least recently used flows whose inactive timeout
            # has been exceeded
            self.inactive_watcher()
        print(
            "*******************PACKET CONSUMED - MOVING TO NEXT*********************************"
        )

    def __iter__(self):
        """Yield exported flows while consuming packets from the source."""
        # Create the packet information generator
        pkt_info_gen = Observer(source=self.source)
        for pkt_info in pkt_info_gen:
            if pkt_info is not None:
                # Update an existing flow or create a new one
                self.consume(pkt_info)
                for export in self.__exports:
                    yield export
                self.__exports = []
        # Source exhausted: flush everything that is still stored
        self.terminate()
        for export in self.__exports:
            yield export
        self.__exports = []