Example No. 1
def incremental_stats(fnames, err):
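    """Stream URLs from each file, printing per-file line counts plus per-file and
    cumulative distinct-URL estimates (one tab-separated row per file)."""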
    hll = hyperloglog.HyperLogLog(err)  # err = relative error rate, e.g. 0.1 for 10%
    for fname in fnames:
        n_lines = 0
        hll_local = hyperloglog.HyperLogLog(err)
        for url in read_urls(open_file(fname)):
            n_lines += 1
            hll_local.add(url)
            hll.add(url)
        print("%s\t%d\t%d\t%d" % (fname, n_lines, len(hll_local), len(hll)))
    def _run_base(self,
                  n,
                  grammar_dir,
                  sample_strategy=SAMPLE_STANDARD,
                  reward=1,
                  discount=0.9):
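        """Draw n samples from the grammar with the chosen sampling strategy, tracking the
        HLL estimate of unique programs, the Good-Turing estimate of unseen mass (N1/N),
        and each sample's log-probability."""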
        assert sample_strategy in [
            SAMPLE_STANDARD, SAMPLE_UNIFORM, SAMPLE_TEMPERED
        ]

        print('Running strategy: ', sample_strategy)
        e = EngineTempered(grammar_dir,
                           choice_style=sample_strategy,
                           reward=reward,
                           discount=discount)

        countMap = {}
        N = 0
        N1 = 0  # keeps track of objects seen only once
        hll_counter = hyperloglog.HyperLogLog(0.01)

        uniques = []
        goodTuringEstimates = []
        log_prod_prob = []
        newly_seen = []

        for _ in tqdm(range(n)):
            program, log_prob = self.sample(e)

            if program not in countMap:
                countMap[program] = 0
                newly_seen.append(True)
            else:
                newly_seen.append(False)

            countMap[program] += 1

            if countMap[program] == 1:
                N1 += 1
            elif countMap[program] == 2:
                N1 -= 1

            hll_counter.add(program)

            N += 1
            uniques.append(len(hll_counter))
            goodTuringEstimates.append(N1 / N)
            log_prod_prob.append(log_prob)

        print('In N={}, generated #uniq ~ {}'.format(n, len(hll_counter)))

        return dict(goodTuringEstimates=goodTuringEstimates,
                    numUniques=uniques,
                    log_prod_probs=log_prod_prob,
                    newly_seen=newly_seen)
Example No. 3
def analyze_pcap(diz1,diz2,diz3,list_,soglia1,soglia2,soglia3,flag,verb):
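    """Update three dictionaries of HyperLogLog counters from a list of
    (src IP, src port, dst IP, dst port) tuples, print a report for each counter
    against its threshold, then reset the list and dictionaries."""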

    for elem in list_:

        # Unpack the current (src IP, src port, dst IP, dst port) tuple
        ip_s = elem[0]
        port_s = elem[1]
        ip_d = elem[2]
        port_d = elem[3]

        # Fill the first dictionary: distinct destination IPs seen per source IP
        if ip_s not in diz1:
            diz1[ip_s] = hyperloglog.HyperLogLog(0.05)
        diz1[ip_s].add(ip_d)

        # Fill the second dictionary: distinct destination ports per (src IP, src port, dst IP)
        if (ip_s, port_s, ip_d) not in diz2:
            diz2[(ip_s, port_s, ip_d)] = hyperloglog.HyperLogLog(0.05)
        diz2[(ip_s, port_s, ip_d)].add(port_d)

        # Fill the third dictionary: distinct source ports per (src IP, dst IP, dst port)
        if (ip_s, ip_d, port_d) not in diz3:
            diz3[(ip_s, ip_d, port_d)] = hyperloglog.HyperLogLog(0.05)
        diz3[(ip_s, ip_d, port_d)].add(port_s)


    flag[0] = print_report(diz1,1,soglia1,flag[0],verb)
    flag[1] = print_report(diz2,2,soglia2,flag[1],verb)
    flag[2] = print_report(diz3,3,soglia3,flag[2],verb)

    #Reset dictionaries and list
    list_.clear()
    diz1.clear()
    diz2.clear()
    diz3.clear()

    return flag
Example No. 4
def udp_scan_detect(flow, contacted):
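    """Record the 'IP:port' destination contacted by this flow's source address and flag a
    possible UDP scan when the destination port is not allowed and the source has contacted
    more distinct destinations than 1.75x the predicted baseline (predizione)."""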
    global predizione
    sem.acquire()
    if flow['IPV4_SRC_ADDR'] not in contacted:
        contacted[flow['IPV4_SRC_ADDR']] = hyperloglog.HyperLogLog(0.1)
    contacted[flow['IPV4_SRC_ADDR']].add(
        str(flow['IPV4_DST_ADDR']) + ':' + str(flow["L4_DST_PORT"]))
    sem.release()
    return (flow["L4_DST_PORT"] not in allowed_ports
            and len(contacted[flow['IPV4_SRC_ADDR']]) > predizione * 1.75)
Example No. 5
def get_background_capacity(homology, background_list, verbose):
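    """Estimate the number of distinct k-mers (k=homology) across the background
    sequences with a 1% error HLL and return that estimate padded by 10%."""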
    total_kmers_background = hyperloglog.HyperLogLog(
        0.01)  # An HLL object with 1% estimation error
    if background_list:
        if verbose:
            print('Estimating k-mer Storage Requirements')
        for i, seq in enumerate(background_list):
            if verbose:
                if ((i + 1) % 100) == 0:
                    print(' Estimating Background {}/{}'.format(
                        i + 1, len(background_list)))
            for kmer in utils.stream_min_kmers(seq, k=homology):
                total_kmers_background.add(kmer)
    return int(math.ceil(len(total_kmers_background) * 1.1))
Example No. 6
    def _run_base(self,
                  n,
                  out_dir,
                  grammar_dir,
                  sample_strategy=SAMPLE_STANDARD):
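        """Sample n programs with the chosen strategy, writing a shard to out_dir every
        SHARD_SIZE unique programs and reporting the HLL estimate of distinct programs."""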
        assert sample_strategy in [SAMPLE_STANDARD, SAMPLE_UNIFORM]

        e = EngineTempered(grammar_dir, choice_style=sample_strategy)
        os.makedirs(out_dir, exist_ok=True)

        countMap, rvOrderMap, labelMap = {}, {}, {}
        shardNum, shardedUnique = 0, 0

        hll_counter = hyperloglog.HyperLogLog(0.01)

        for _ in tqdm(range(n)):
            program, choices, rvOrder = self.sample(e)

            labelMap[program] = choices
            rvOrderMap[program] = rvOrder

            if program not in countMap:
                countMap[program] = 0
            countMap[program] += 1

            hll_counter.add(program)

            if len(labelMap) == SHARD_SIZE:
                # save shard and reset data dictionaries
                self.save_shard(out_dir, countMap, labelMap, rvOrderMap,
                                shardNum)

                shardNum += 1
                shardedUnique += SHARD_SIZE
                countMap = {}
                labelMap = {}
                rvOrderMap = {}

        print('In N={}, generated #uniq ~ {}'.format(n, len(hll_counter)))

        if len(countMap) > 0:  # save remaining samples
            self.save_shard(out_dir, countMap, labelMap, rvOrderMap, shardNum)
Example No. 7

MAX_64BIT_INT = 2**63 - 1
MAX_32BIT_INT = 2**31 - 1
MAX_16BIT_INT = 2**15 - 1

if __name__ == "__main__":
    max_val = 5 * MAX_16BIT_INT
    chunk_size = max_val // 500
    murmur64 = lambda x: mmh3.hash64(x)[0]

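    # Distinct-value counters to compare on the same stream of values.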
    methods = {
        "actual": DVESet(),
        "32bit hll m=256": hll.HyperLogLog(b=8),
        "32bit kmv k=256": kmv.KMinValues(k=2**8),
        "64bit kmv k=256": kmv.KMinValues(k=2**8, hasher=murmur64,
                                          hasher_max=MAX_64BIT_INT),
    }

    results = {f: [] for f in methods}

    x = np.arange(1, max_val, chunk_size)

    np.random.seed()
    widgets = ["Processing", Bar(), ETA()]
    _iter = iter(x)
    if ProgressBar:
        p = ProgressBar(maxval=len(x), widgets=widgets).start()
Example No. 8
    print('--ipv4only and --ipv6only cannot be combined. Please choose only one.')
    sys.exit()

if inputfile != '-' and not os.path.isfile(inputfile):
    print('The input file does not exist')
    sys.exit()

if not os.path.isfile(whitelistfile):
    print('The whitelist file does not exist')
    sys.exit()

# read whitelist file
whitelist = pickle.load(open(whitelistfile, "rb"))

# prepare HLLs
totalips = hyperloglog.HyperLogLog(hll_error_rate)
nonwhitelist = hyperloglog.HyperLogLog(hll_error_rate)
tldhll = dict()

# cache processed ips to speedup
ip_cache = dict()

if inputfile == '-':
    royparsereader = csv.reader(
        sys.stdin, delimiter=',', quoting=csv.QUOTE_MINIMAL)
else:
    royparsereader = csv.reader(
        open(inputfile), delimiter=',', quoting=csv.QUOTE_MINIMAL)

count = 0
match = 0
def update_func(new_values, state):
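    # Fold newly arrived values into the per-key HLL state (5% error), creating it on first use.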
    state = state or hyperloglog.HyperLogLog(0.05)

    for value in new_values:
        state.add(value)
    return state
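The (new_values, state) signature above matches the update function expected by Spark Streaming's updateStateByKey; a minimal, hypothetical wiring sketch (the StreamingContext ssc and the keyed DStream pairs are assumptions, not part of the example):

# Hypothetical usage of update_func (ssc and pairs are assumed, not from the example above)
ssc.checkpoint('/tmp/hll-state')  # updateStateByKey requires a checkpoint directory
per_key_hll = pairs.updateStateByKey(update_func)
per_key_hll.map(lambda kv: (kv[0], len(kv[1]))).pprint()  # key -> estimated distinct count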
Example No. 10
def update_count(values, old):
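    # Same per-key state-update pattern as above, with a 1% error HLL.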
    old = old or hyperloglog.HyperLogLog(0.01)
    for v in values:
        old.add(v)
    return old
Example No. 11
year, month, day = args.date.split('-')

delivered_log = f"/logs/archive/logs/daily/email_delivered_log/{year}/{month}/{day}/email_delivered_log-{year}{month}{day}.log.gz"

#stats = dict(
#    domain = dict(
#        newsletter_id = dict(
#            delivered = 0,
#            total_opens = 0,
#            unique_opens = hyperloglog.HyperLogLog(0.01)
#        )
#    )
#)

topstats = defaultdict(lambda: defaultdict(lambda: dict(
    delivered=0, total_opens=0, unique_opens=hyperloglog.HyperLogLog(0.1))))
stats = defaultdict(lambda: defaultdict(lambda: defaultdict(lambda: dict(
    delivered=0, total_opens=0, unique_opens=hyperloglog.HyperLogLog(0.1)))))

delivered = gzip.open(delivered_log)

hourly_file_path = f"/logs/archive/logs/hourly/email_open_log_extended/{year}/{month}/{day}"

print(listdir(hourly_file_path))

hourly_opens = [ hourly_file_path + '/' + f for f in listdir(hourly_file_path) if isfile(join(hourly_file_path, f))]

print(hourly_opens)

count = 0
for file in hourly_opens:
    fh = gzip.open(file, mode='r')
Example No. 12
movies.append(len(ratings.iloc[0:5000262, 0:2].movieId.unique()))

users2 = pd.DataFrame(users)
users2.to_csv("users.csv", index=False, header=False)
movies2 = pd.DataFrame(movies)
movies2.to_csv("movies.csv", index=False, header=False)
print("\n Memory size of users list in bytes= ", len(pickle.dumps(users)))
print("\n Memory size of movies list in bytes= ", len(pickle.dumps(movies)))

#cross check the result
len(ratings.groupby("movieId"))
len(ratings.groupby("userId"))

#%% HyperLogLog

hll_u = hyperloglog.HyperLogLog(0.01)  # accept 1% counting error
hll_m = hyperloglog.HyperLogLog(0.01)

for index, row in ratings.iterrows():
    hll_m.add(row['movieId'])
    hll_u.add(row['userId'])

print()
print("Number of unique movies seen= ", len(hll_m))
print("Number of unique users rated= ", len(hll_u))

print("\n Memory size of users HLL in bytes= ", len(pickle.dumps(hll_u)))
print("\n Memory size of movies HLL in bytes= ", len(pickle.dumps(hll_m)))

#%%Sampling Methods
import json
from pandas.io.json import json_normalize
import pandas as pd
import hyperloglog

tweets = []
for line in open('data/tweets.json.1', 'r', encoding='latin-1'):
    tweets.append(json.loads(line))
print('before data')
data = json_normalize(tweets)

count_hasht = hyperloglog.HyperLogLog(0.01)  # 1% error margin

for hashtags in data['entities.hashtags']:
    if hashtags != []:  # skip tweets with no hashtags before normalizing
        hashtag = json_normalize(hashtags)
        for index, hashtag_text in hashtag.iterrows():
            count_hasht.add(hashtag_text['text'].lower().encode("utf-8"))

print('The HyperLogLog contains ', len(count_hasht),
      ' unique items for error margin 0.01')

count_hasht = hyperloglog.HyperLogLog(0.1)  # 10% error margin

for hashtags in data['entities.hashtags']:
    if hashtags != []:  # skip tweets with no hashtags before normalizing
        hashtag = json_normalize(hashtags)
        for index, hashtag_text in hashtag.iterrows():
            count_hasht.add(hashtag_text['text'].lower().encode("utf-8"))
Example No. 14
x = list(sliced(list(range(rows_number)),
                1000))  # split the row indices into chunks of 1000

data_df = pd.read_csv('data-streaming-project.data',
                      encoding='utf-8',
                      delimiter='\t',
                      names=['user', 'movie', 'rating', 'timestamp'],
                      header=None)

start_time = time.time()

#HyperLogLog method
start_time = time.time()

hll_users = hyperloglog.HyperLogLog(0.01)  # accept 1% counting error
hll_movies = hyperloglog.HyperLogLog(0.01)  # accept 1% counting error

for chunk in x:  # iterate over the chunks of 1000 row indices
    for item in chunk:  # each item is a row index into the dataframe
        hll_users.add(data_df.loc[item, 'user'])
        hll_movies.add(data_df.loc[item, 'movie'])

unique_users_hyper = len(hll_users)
unique_movies_hyper = len(hll_movies)

end_time = time.time()

print("The memory used is:", process.memory_info().rss, "bytes")  # in bytes
print("Total execution time: {}".format(end_time - start_time))
Example No. 15
#-
# Cardinality estimation using HLL
# (approximate alternative to the built-in set data structure)
#-
import hyperloglog
# Initialize
# accept 1% counting error
ds = hyperloglog.HyperLogLog(0.01)
# Add (1 is added twice, so there are only three distinct values)
ds.add(1)
ds.add(1)
ds.add(2)
ds.add(6)

# Test: the estimate is exact for such a small set
assert len(ds) == 3
Example No. 16
    def _run_tempered(self, n, out_dir, grammar_dir, n_burn_in=N_BURN_IN, 
                      n_converge=N_CONVERGE, n_snapshot=N_SNAPSHOT):
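        """Sample n programs from a tempered engine: burn in n_burn_in samples, then
        alternate between drawing n_per_snap (frozen) samples and re-burning n_converge
        samples, sharding results to out_dir every SHARD_SIZE unique programs."""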
        
        e = EngineTempered(grammar_dir, choice_style=SAMPLE_TEMPERED)
        os.makedirs(out_dir, exist_ok=True)

        if n_snapshot is None:
            n_snapshot = n

        hll_counter = hyperloglog.HyperLogLog(0.01)

        countMap, rvOrderMap, labelMap = {}, {}, {}
        shardNum, shardedUnique = 0, 0

        # this is number of samples we will take per snapshot
        n_per_snap = int(n // n_snapshot)

        # burn X number of samples before we start taking any samples
        if n_burn_in > 0:
            print('Burning {} samples'.format(n_burn_in))
        e.burn(n_burn_in, show_progress=True) 

        pbar = tqdm(total=n)
        for _ in range(n_snapshot):
            for _ in range(n_per_snap):
                # freeze the graph here (no additional tempering),
                # but don't freeze it if we save every example (n_converge == 0)
                program, choices, rvOrder = self.sample(e, freeze=n_converge > 0)

                labelMap[program] = choices
                rvOrderMap[program] = rvOrder

                if program not in countMap:
                    countMap[program] = 0
                countMap[program] += 1
                
                hll_counter.add(program)

                if len(labelMap) == SHARD_SIZE:
                    # save shard and reset data dictionaries
                    self.save_shard(out_dir, countMap, labelMap, rvOrderMap, shardNum)

                    shardNum += 1
                    shardedUnique += SHARD_SIZE
                    countMap = {}
                    labelMap = {}
                    rvOrderMap = {}

                pbar.update()

            # wait n steps before saving any more samples
            if n_converge > 0:
                print('Burning {} samples'.format(n_converge))
            e.burn(n_converge)

        print('In N={}, generated #uniq ~ {}'.format(n, len(hll_counter)))
        #pickle.dump(data, open('sample_efficiency_temp_anneal.pkl', 'wb'))

        if len(countMap) > 0:  # save remaining samples
            self.save_shard(out_dir, countMap, labelMap, rvOrderMap, shardNum)

        pbar.close()
Example No. 17
        
    win = win + 1
    start = start + strmStep
    end = end + 1000
    n = n + strmStep
    print("Bytes for Unique Movies Dataframe: ",  winUniqueRcdB['unqMovies'].__sizeof__())
    print("Bytes for Unique Users Dataframe: ",  winUniqueRcdB['unqUsers'].__sizeof__())
#BQ2_2 Number of unique movies & users with HyperLogLog
if BQ2_2 == 1:
    start = 0
    end = 1000
    strmStep = 1000
    winUniqueRcdC = pd.DataFrame(data={'unqMovies': [], 'unqUsers': []}, index=None,
                                 columns=['unqMovies', 'unqUsers'])
    winUniqueRcdC.index.name = 'Window'
    hllMovies = hyperloglog.HyperLogLog(errorHll)
    hllUsers = hyperloglog.HyperLogLog(errorHll)
    win = 0
    n = 0
    while n < len(dfStrPr):
        winN = 'Window_' + str(win) 
        dfStrPrWin = dfStrPr[start: end]
        for m, row in dfStrPrWin.iterrows():
            # Movies
            mov = dfStrPrWin.loc[m, "movie"]
            hllMovies.add(str(mov))
            # Users
            usr = dfStrPrWin.loc[m, "user"]
            hllUsers.add(str(usr))
        winUniqueRcdC.loc[winN] = [len(hllMovies), len(hllUsers)]
        
import json
import math
import hyperloglog
from pympler.asizeof import asizeof
import numpy as np

users_dict = {}
tags_dict = {}
hll_users = hyperloglog.HyperLogLog(0.05)
hll_tags = hyperloglog.HyperLogLog(0.05)

for i in range(0, 46):
    data = f'tweets.json.{i}'
    with open(data, encoding='utf-8') as json_file:
        for row in json_file:

            # Parse each JSON row into a dictionary
            json_obj = json.loads(row)
            user_id = json_obj['user']['id']
            tags = json_obj['entities']['hashtags']

            for element in tags:
                for key, value in element.items():
                    if key == 'text':
                        tag = str(value)
                        if tag not in tags_dict:  # Add values to the dictionary
                            tags_dict[tag] = 1
                        else:
                            tags_dict[tag] += 1

                        hll_tags.add(str(tag))