def adjusted_rand_index():
    #The text file is updated by a stream of data
    #inputf=Streaming_AbstractGenerator.StreamAbsGen("USBWWAN_stream","USBWWAN")
    #inputf=Streaming_AbstractGenerator.StreamAbsGen("file","StreamingData.txt")
    #inputf=Streaming_AbstractGenerator.StreamAbsGen("Spark_Parquet","Spark_Streaming")
    #inputf=Streaming_AbstractGenerator.StreamAbsGen("AsFer_Encoded_Strings","NeuronRain")
    #inputf=Streaming_AbstractGenerator.StreamAbsGen("Socket_Streaming","localhost")
    inputf1 = Streaming_AbstractGenerator.StreamAbsGen(
        "TextHistogramPartition", [
            "/var/log/kern.log", "/var/log/syslog", "/var/log/ufw.log",
            "/var/log/dmesg", "/var/log/kern.log"
        ])
    histograms = []
    for p in inputf1:
        histograms.append(p)
    ari = adjusted_rand_score(
        tocluster(histograms[0], "Text")[:20000],
        tocluster(histograms[1], "Text")[:20000])
    print "Adjusted Rand Index of first two histogram set partitions(truncated):", ari
    prev = 0
    for n in range(1, len(histograms)):
        truncatedlen = int(
            min(len(histograms[prev]), len(histograms[n])) * 0.9)
        ari = adjusted_rand_score(
            tocluster(histograms[prev], "Text")[:truncatedlen],
            tocluster(histograms[n], "Text")[:truncatedlen])
        print "Adjusted Rand Index(truncated):", ari
        ami = adjusted_mutual_info_score(
            tocluster(histograms[prev], "Text")[:truncatedlen],
            tocluster(histograms[n], "Text")[:truncatedlen])
        print "Adjusted Mutual Info Index(truncated):", ami
        prev = n
    #################################################################
    histograms = []
    inputf2 = Streaming_AbstractGenerator.StreamAbsGen(
        "DictionaryHistogramPartition", "Streaming_SetPartitionAnalytics.txt")
    for p in inputf2:
        histograms.append(p)
    prev = 0
    print "histograms:", histograms
    for n in range(1, len(histograms)):
        truncatedlen = int(
            min(len(histograms[prev]), len(histograms[n])) * 0.9)
        ari = adjusted_rand_score(
            tocluster(histograms[prev], "Dict")[:truncatedlen],
            tocluster(histograms[n], "Dict")[:truncatedlen])
        print "Adjusted Rand Index (truncated):", ari
        ami = adjusted_mutual_info_score(
            tocluster(histograms[prev], "Dict")[:truncatedlen],
            tocluster(histograms[n], "Dict")[:truncatedlen])
        print "Adjusted Mutual Info Index (truncated):", ami
        prev = n
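#Hypothetical reconstruction of tocluster(), which adjusted_rand_index() above
#assumes but this snippet omits: flatten a set partition into one cluster label
#per element so that adjusted_rand_score()/adjusted_mutual_info_score() can
#compare the label sequences. The "Dict"/"Text" switch below is an assumption.
def tocluster(histogram, histogramtype):
    if histogramtype == "Dict":
        parts = histogram.values()
    else:
        parts = histogram
    labels = []
    for clusterid, part in enumerate(parts):
        labels += [clusterid] * len(part)
    return labels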
Example #2
def publish(datasource="USB"):
    p = Producer({'bootstrap.servers': 'localhost:9092'})
    if datasource == "USB":
        inputf = Streaming_AbstractGenerator.StreamAbsGen(
            "USBWWAN_stream", "USBWWAN")
    else:
        inputf = Streaming_AbstractGenerator.StreamAbsGen(
            "KingCobra", "KingCobra")
    for data in inputf:
        print "publishing data:", data, " in Kafka Topic"
        try:
            p.produce('neuronraindata', data.encode('utf-8'))
        except Exception as e:
            print "publish exception:", e
    #flush messages still queued in the producer buffer before returning
    p.flush()
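#A hypothetical consumer counterpart to publish() above, using the same
#confluent_kafka package; the topic name is taken from publish(), while the
#group id and offset policy are assumptions.
def subscribe():
    from confluent_kafka import Consumer
    c = Consumer({'bootstrap.servers': 'localhost:9092',
                  'group.id': 'neuronrain',
                  'auto.offset.reset': 'earliest'})
    c.subscribe(['neuronraindata'])
    while True:
        msg = c.poll(1.0)
        if msg is not None and msg.error() is None:
            print "consumed data:", msg.value().decode('utf-8')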
Example #3
def distinct_elements(pattern=None):
    #The text file is updated by a stream of data
    #inputf=Streaming_AbstractGenerator.StreamAbsGen("USBWWAN_stream","USBWWAN")
    inputf = Streaming_AbstractGenerator.StreamAbsGen("file",
                                                      "StreamingData.txt")
    #inputf=Streaming_AbstractGenerator.StreamAbsGen("Spark_Parquet","Spark_Streaming")
    #inputf=Streaming_AbstractGenerator.StreamAbsGen("AsFer_Encoded_Strings","NeuronRain")
    #inputf=Streaming_AbstractGenerator.StreamAbsGen("Socket_Streaming","localhost")
    supersetsize = 0
    randomsubset = []
    if pattern is None:
        for x in inputf:
            randint = random.randint(1, 2)
            supersetsize += 1
            if randint == 1:
                randomsubset.append(x)
    else:
        for x in inputf:
            supersetsize += 1
            if x.strip() == pattern.strip():
                randomsubset.append(x)
    #numeric (not lexicographic) minimum of the sampled stream values
    minimum = min(float(x.strip()) for x in randomsubset)
    print "minimum:", minimum
    size = (supersetsize / minimum) - 1
    return size
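#The estimator above looks like a variant of the order-statistics identity: for
#d values drawn uniformly from (0,1), E[min] ~ 1/(d+1), so 1/min - 1 estimates
#d. A minimal self-contained demo of that identity (hypothetical, not part of
#the original module):
def distinct_elements_demo(d=1000):
    import random
    stream = [random.random() for n in range(d)]
    print "estimated:", (1.0 / min(stream)) - 1, " actual:", d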
Example #4
def approximate_counting(pattern):
	#The text file is updated by a stream of data
	#inputf=Streaming_AbstractGenerator.StreamAbsGen("USBWWAN_stream","USBWWAN")
	inputf=Streaming_AbstractGenerator.StreamAbsGen("file","StreamingData.txt")
	#inputf=Streaming_AbstractGenerator.StreamAbsGen("Spark_Parquet","Spark_Streaming")
	#inputf=Streaming_AbstractGenerator.StreamAbsGen("AsFer_Encoded_Strings","NeuronRain")
	#inputf=Streaming_AbstractGenerator.StreamAbsGen("Socket_Streaming","localhost")
	k=0
	for i in inputf:
		if pattern.strip() == i.strip() :
			k += probabilistic_increment(k)
	count=math.pow(2,k) - 1
	print "approximate_counting(): count = ",count
	return count 
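#Hypothetical helper assumed by approximate_counting() above (a Morris counter):
#increment the exponent k with probability 2^-k, so that 2^k - 1 estimates the
#number of increments seen. Assumes the module's random/math imports.
def probabilistic_increment(k):
	if random.random() < math.pow(2, -k):
		return 1
	return 0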
Example #5
def getHashedLocation(inp, hashfn_index):
	#map an element to a bloom filter bit via the hashfn_index-th hash function
	#(definition header reconstructed from the call site below; string_hash_code()
	#comes from the original module)
	hash=(int(string_hash_code(inp),16)*hashfn_index) % bloomfiltersize
	#hash=(int(binascii.hexlify(inp),16)*hashfn_index) % bloomfiltersize
	#print "hash for [",inp,"] :",hash
	return hash

bloomfiltersize=10000
no_of_hashfns=50
bloom_bitset=[]

for i in xrange(bloomfiltersize):
	bloom_bitset.append(0)


#inputf=open("StreamingData.txt","r")
#inputf=Streaming_AbstractGenerator.StreamAbsGen("USBWWAN_stream","USBWWAN")
inputf=Streaming_AbstractGenerator.StreamAbsGen("Spark_Parquet","Spark_Streaming")
for i in inputf:
	for k in xrange(no_of_hashfns):
		bloom_bitset[getHashedLocation(i,k)]=1
print bloom_bitset
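#A minimal membership probe for the filter populated above (a sketch): an
#element is "possibly present" only if all of its hashed bits are set; false
#positives are possible, false negatives are not.
def bloom_query(element):
	for k in xrange(no_of_hashfns):
		if bloom_bitset[getHashedLocation(element, k)] == 0:
			return False
	return True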

#sample queries: some drawn from the input stream, some not in it

#for file storage
#query=["osoioiiee" ,"73885.399249226" ,"2292179968"]

#for HBase storage
#query=["osoioiiee" ,"880130065\x0A", "875310463\x0A"]

#for USBWWAN stream storage
Example #6
#rows and columns are assumed defined earlier in the original module
countminsketch = []
estimator = []
no_of_elements_added = 0
for m in xrange(rows):
    #build a fresh row per iteration; appending one shared row list would alias
    #every row and corrupt the per-row counts below
    rowvector = [0 for n in xrange(columns)]
    countminsketch.append(rowvector)
    estimator.append(0)
#print countminsketch
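#Hypothetical hash parameters and pairwise-independent hash family assumed by
#the sketch code below; the original module supplies its own getHash(), which
#this snippet omits. Assumes rows is defined and random is imported.
a = [random.randint(1, 2147483646) for r in xrange(rows)]
b = [random.randint(0, 2147483646) for r in xrange(rows)]
def getHash(element, row, a, b):
    #h(x) = (a*x + b) mod p, with prime p = 2^31 - 1
    return (a[row] * abs(hash(element)) + b[row]) % 2147483647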

#inputf=open("StreamingData.txt","r")
#inputf=Streaming_AbstractGenerator.StreamAbsGen("USBWWAN_stream","USBWWAN")
#inputf=Streaming_AbstractGenerator.StreamAbsGen("file","file")
#inputf=Streaming_AbstractGenerator.StreamAbsGen("Kafka","Kafka")
#inputf=Streaming_AbstractGenerator.StreamAbsGen("Spark_Parquet","Spark_Streaming")
inputf = Streaming_AbstractGenerator.StreamAbsGen("AsFer_Encoded_Strings",
                                                  "NeuronRain")
#add and populate sketch
for i in inputf:
    for row in xrange(rows):
        column = getHash(i, row, a, b) % columns
        countminsketch[row][column] += 1
        no_of_elements_added += 1
    row = 0
print countminsketch

#frequencies of each input - minimum of all hashed cells
no_of_elements_estimated = 0
no_of_elements_exact = 0
minsketch_dict = {}

for i in inputf:
    #frequency estimate = minimum of the element's hashed cells across all rows
    #(loop body is a sketch; the original snippet is truncated here)
    minsketch_dict[i] = min(countminsketch[row][getHash(i, row, a, b) % columns]
                            for row in xrange(rows))
print "count-min sketch estimates:", minsketch_dict
Example #7
    def __init__(self):
        #self.inputstream = Streaming_AbstractGenerator.StreamAbsGen("file","file")
        self.inputstream = Streaming_AbstractGenerator.StreamAbsGen(
            "Spark_Parquet", "Spark_Streaming")
        self.counter = 0
        self.element = ""
Example #8
                suffix - self.maxnumberdigits].query_nearest_neighbours(
                    nstr[(suffix - self.maxnumberdigits):])
            print "match=", match
            for x in match[0][1]:
                if x == nstr[(suffix - self.maxnumberdigits):]:
                    print "substringmatch=True"
                    substringmatch = True
            exists = exists & substringmatch
            substringmatch = False
            cnt += 1
        return exists


if __name__ == "__main__":
    #primesf=[2,3,5,7,11,13,17,19,23,29,31,37,41,43]
    primesf = Streaming_AbstractGenerator.StreamAbsGen("file",
                                                       "First100Primes.txt")
    unsorted = UnsortedSearch(primesf)
    unsorted.create_prefix_suffix_hashtables()
    #unsorted.print_unsorted_ntuples_hash()
    unsorted.print_digit_hashtables()
    print "======================================================"
    exists = unsorted.search_number(99455)
    print "Is Queried integer 99455 in unsorted array:", exists
    print "======================================================"
    exists = unsorted.search_number(43)
    print "Is Queried integer 43 in unsorted array:", exists
    print "======================================================"
    exists = unsorted.search_number(31)
    print "Is Queried integer 31 in unsorted array:", exists
    print "======================================================"
    exists = unsorted.search_number(17)
    print "Is Queried integer 17 in unsorted array:", exists
Example #9
    return rank


estimators = {}
for n in xrange(no_of_buckets):
    estimators[n] = 0

#The text file is updated by a stream of data
line = 0
cardinality = []
#inputf=open("StreamingData.txt","r")
#inputf=Streaming_AbstractGenerator.StreamAbsGen("USBWWAN_stream","USBWWAN")
#inputf=Streaming_AbstractGenerator.StreamAbsGen("file","file")
#inputf=Streaming_AbstractGenerator.StreamAbsGen("Spark_Parquet","Spark_Streaming")
#inputf=Streaming_AbstractGenerator.StreamAbsGen("AsFer_Encoded_Strings","NeuronRain")
inputf = Streaming_AbstractGenerator.StreamAbsGen("Socket_Streaming",
                                                  "localhost")
for i in inputf:
    line = line + 1
    if line < 20:
        #split into subsets of 20 each and compute similarly to a LogLog counter
        print "######################################"
        print i
        hashstring = getHash(i)
        hashlen = len(hashstring)
        k = int(math.log(no_of_buckets, 2))
        print "k=", k
        bucket = int(hashstring[0:k], 2)
        print "bucket=", bucket
        estimators[bucket] = max(estimators[bucket],
                                 getRank(hashstring[k + 1:]))
        print "estimators[bucket] = ", estimators[bucket]
Example #10
            return "High"
        elif mean > 0.5:
            return "Normal"
        elif mean > 0.4:
            return "Medium"
        elif mean > 0.3:
            return "Low"
        elif mean > 0.1:
            return "Lower"
        else:
            return "Lowest"


#############################################################################################
if __name__ == "__main__":
    schedrunqstream = Streaming_AbstractGenerator.StreamAbsGen(
        "OperatingSystem", "SchedulerRunQueue")
    runqcnt = 0
    for runq in schedrunqstream:
        if runqcnt > 10:
            break
        print "Scheduler Runqueue Stream:", runq
        runqcnt += 1
    #log_mapreducer("perf.data.schedscript","sched_stat_runtime")
    kernel_analytics_conf = open("/etc/kernel_analytics.conf", "w")
    weights = [
        0.01, 0.023, 0.056, 0.043, 0.099, 0.088, 0.033, 0.021, 0.12, 0.23,
        0.34, 0.45, 0.11, 0.56, 0.77, 0.21, 0.88, 0.92
    ]
    hiddenlayer = [0.8, 0.9, 0.3]
    inputlayer = [0.01, 0.01, 0.01]
    expectedoutput = [0.1, 0.1, 0.1]