Example #1
def lemmatizeText(text):
	processedWords = {}
	wordsHashMap = {}
	words = {}
	word = ""

	for c in text:
		if ('a' <= c <= 'z') or ('A' <= c <= 'Z') or (c == '\'') or (c == '-') or (c == '_'):
			word += c
		else:
			if word:
				word = word.lower()
				if word not in processedWords:
					if word in exceptions:
						if word in words:
							words[word] += 1
						else:
							words[word] = 1
						wordsHashMap[mmh3.hash64(word)[0]] = [word, words[word]]
					else:
						if len(word) > 3 and word not in stopWords:
							tag = nltk.pos_tag([word])              # !!! WARNING : takes A LOT OF TIME !!!
							if tag[0][1] in tags:
								if word in words:
									words[word] += 1
								else:
									words[word] = 1
								wordsHashMap[mmh3.hash64(word)[0]] = [word, words[word]]
					processedWords[word] = word
				else:
					if word in words:
						words[word] += 1
						wordsHashMap[mmh3.hash64(word)[0]] = [word, words[word]]
			word = ""
	return wordsHashMap
Example #2
    def fit(self, filepath):
        print '* Language model fitting'
        generator = create_generator(filepath,
                                     skip_spam=True,
                                     return_set=False,
                                     STOP_CONDITION=6300)

        for doc, _ in generator:
            n_gram_cnt = len(doc) - self.N + 1
            for num in xrange(n_gram_cnt):
                n_gram_3 = mmh3.hash64(''.join(doc[num:num + self.N -
                                                   1]).encode('utf-8'))
                n_gram_4 = mmh3.hash64(''.join(doc[num:num +
                                                   self.N]).encode('utf-8'))
                self.struct_3[n_gram_3] += 1
                self.struct_4[n_gram_4] += 1
            if n_gram_cnt > 0:
                n_gram_3 = mmh3.hash64(''.join(doc[1 -
                                                   self.N:]).encode('utf-8'))
                self.struct_3[n_gram_3] += 1

        summary_cnt_3 = float(sum(self.struct_3.values()))
        summary_cnt_4 = float(sum(self.struct_4.values()))

        for key, _ in self.struct_3.iteritems():
            self.struct_3[key] /= summary_cnt_3

        for key, _ in self.struct_4.iteritems():
            self.struct_4[key] /= summary_cnt_4

        self.eps = min(self.struct_3.values() + self.struct_4.values()) * 5e-1
        print '* Language model successfully fitted'
Example #3
    def __init__(self):
        # Define Supported hashes
        hashes = dict()
        hashes['md2'] = lambda x: self._get_md2_hash(x)
        hashes['md4'] = lambda x: self._get_hashlib_hash('md4', x)
        hashes['md5'] = lambda x: hashlib.md5(x).hexdigest()
        hashes['sha'] = lambda x: self._get_hashlib_hash('sha', x)
        hashes['sha1'] = lambda x: hashlib.sha1(x).hexdigest()
        hashes['sha256'] = lambda x: hashlib.sha256(x).hexdigest()
        hashes['sha224'] = lambda x: hashlib.sha224(x).hexdigest()
        hashes['sha384'] = lambda x: hashlib.sha384(x).hexdigest()
        hashes['sha512'] = lambda x: hashlib.sha512(x).hexdigest()
        hashes['sha3_224'] = lambda x: sha3.sha3_224(x).hexdigest()
        hashes['sha3_256'] = lambda x: sha3.sha3_256(x).hexdigest()
        hashes['sha3_384'] = lambda x: sha3.sha3_384(x).hexdigest()
        hashes['sha3_512'] = lambda x: sha3.sha3_512(x).hexdigest()
        hashes['mmh2'] = lambda x: str(mmhash.get_hash(x))
        hashes['mmh2_unsigned'] = lambda x: str(mmhash.get_unsigned_hash(x))
        hashes['mmh3_32'] = lambda x: str(mmh3.hash(x))
        hashes['mmh3_64_1'] = lambda x: str(mmh3.hash64(x)[0])
        hashes['mmh3_64_2'] = lambda x: str(mmh3.hash64(x)[1])
        hashes['mmh3_128'] = lambda x: str(mmh3.hash128(x))
        hashes['ripemd160'] = lambda x: self._get_hashlib_hash('ripemd160', x)
        hashes['whirlpool'] = lambda x: self._get_hashlib_hash('whirlpool', x)
        hashes['blake2b'] = lambda x: pyblake2.blake2b(x).hexdigest()
        hashes['blake2s'] = lambda x: pyblake2.blake2s(x).hexdigest()
        hashes['crc32'] = lambda x: str(zlib.crc32(x))
        hashes['adler32'] = lambda x: str(zlib.adler32(x))

        self._hashes = hashes
        self.hashes_and_checksums = self._hashes.keys()
        self.supported_hashes = HASHES
Example #4
        def convert(cont_df):
            task_states = []

            prev_end_time = cont_df.loc[0, "start_time"] * 1000
            container_id = mmh3.hash64(cont_df.loc[0, "container_id"])[1]
            app_id = mmh3.hash64(cont_df.loc[0, "app_du"])[1]

            sorted_df = cont_df.sort_values("time_stamp")
            for index, row in sorted_df.iterrows():
                this_end_time = row["time_stamp"] * 1000

                this_task_state = TaskState(
                    ts_start=prev_end_time,
                    ts_end=this_end_time,
                    workflow_id=app_id,
                    task_id=container_id,
                    resource_id=machine_id,
                    cpu_rate=row["cpu_util_percent"],
                    canonical_memory_usage=row["mem_util_percent"],
                    maximum_disk_bandwidth=row["disk_io_percent"],
                    network_in=row["net_in"],
                    network_out=row["net_out"])

                prev_end_time = this_end_time

                parquet_dict = this_task_state.get_parquet_dict()
                task_states.append(parquet_dict)
                if None in parquet_dict.values() or any(
                        np.isnan(v) for v in parquet_dict.values()
                        if isinstance(v, float)):
                    print(parquet_dict)
                    raise ArithmeticError(parquet_dict)

            return pd.DataFrame(task_states)
Example #5
    def assignObjectWallTime(self):
        """
        Tries to add an object and returns the wall time.
        """
        t = time.time()
        inputStr = str(np.random.random())
        num = mmh3.hash64(inputStr)
        num1 = num[0] & 4294967295
        num2 = num[0] >> 32
        num3 = num[1] & 4294967295
        num4 = num[1] >> 32
        
        num1 = num1 >> 12
        num2 = num2 >> 12
        num3 = num3 >> 12
        num4 = num4 >> 12
        counter = 0

        while ((self.serversArray[num1] == -1 or self.fullFlag[self.serversArray[num1]]) and
               (self.serversArray[num2] == -1 or self.fullFlag[self.serversArray[num2]]) and
               (self.serversArray[num3] == -1 or self.fullFlag[self.serversArray[num3]]) and
               (self.serversArray[num4] == -1 or self.fullFlag[self.serversArray[num4]])):
            counter += 1
            
            num = mmh3.hash64(inputStr + hex(counter))
            num1 = num[0] & 1048575 # Always positive
            num2 = num[0] >> 44
            num3 = num[1] & 1048575 # Always positive
            num4 = num[1] >> 44          
        
        return time.time() - t
Example #6
def debug_execution(config, execs, qemu_verbose=False, notifiers=True):
    log_info("Starting...")

    zero_hash = mmh3.hash64(("\xFF" * config.config_values['BITMAP_SHM_SIZE']))
    q = qemu(1337, config, debug_mode=True, notifiers=notifiers)
    q.start(verbose=qemu_verbose)
    q.set_payload(open(config.argument_values["payload"][0]).read())
    start = time.time()
    for i in range(execs):
        print("+----------------------------------------------+")
        current_hash = mmh3.hash64(q.send_payload())
        if zero_hash == current_hash:
            print("Hash: " + str(current_hash) + common.color.WARNING +
                  " (WARNING: Zero hash found!)" + common.color.ENDC)
        else:
            print("Hash: " + str(current_hash))
    end = time.time()
    print("Performance: " + str(execs / (end - start)) + "t/s")

    q.__del__()
    try:
        for i in range(512):
            if os.path.exists("/tmp/kAFL_printf.txt." + str(i)):
                os.remove("/tmp/kAFL_printf.txt." + str(i))
            else:
                break
    except:
        pass
    os.system("stty sane")
    return 0
Example #7
        def assign_workflow_ids(v):
            arr = []
            for i in v.keys():
                if v[i]:
                    arr.append(mmh3.hash64(v[i], signed=True)[0])
                else:
                    arr.append(
                        mmh3.hash64(uuid4().bytes, signed=True)
                        [0])  # Assign a UUID, collision chance is negligible.

            return pd.Series(arr)
Example #8
 def bloom_check(self, buffer):
     if self.ready == 0:
         print("Bloom filter was not created properly\n")
     else:
         hits = 0
         a = mmh3.hash64(buffer)[0]
         b = mmh3.hash64(buffer)[1]
         for i in range(self.hashes):
             x = (a + i * b) % (self.bits)
             if self.bf[x] == 1:
                 hits += 1
         if hits == self.hashes:
             return 1  # might be there
         else:
             return 0  # not there for sure
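The `(a + i * b) % bits` line above is the standard double-hashing trick: the two 64-bit halves returned by `mmh3.hash64` seed an arbitrary number of Bloom filter index functions. A minimal standalone sketch of the same derivation (the `hashes` and `bits` values are arbitrary assumptions):

import mmh3

def bloom_indices(buffer, hashes=7, bits=1 << 20):
    # One 128-bit MurmurHash3 call yields both double-hashing seeds;
    # index i is (a + i*b) mod bits, non-negative since bits > 0.
    a, b = mmh3.hash64(buffer)
    return [(a + i * b) % bits for i in range(hashes)]

print(bloom_indices("example"))  # seven indices in [0, 2**20)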
Example #9
    def _multi_hash(self, document):
        """ Generates a texts minhash signature using multi-hash method.

        Uses i random hashes for j permutations selecting the minimum hash value
        each time to build each texts hash signature.

        Slower but more stable than k smallest hash method.

        Args:
            document (list): List of document shingles.

        Returns:
            list: List of text signatures generated using the multi-hash method.

        """
        signature = []
        for seed in np.nditer(self._hash_seeds):
            self._min_value = None
            for shingle in document:
                if self.hash_bits == 64:
                    hash_value = mmh3.hash64(shingle, int(seed))[0]
                elif self.hash_bits == 32:
                    hash_value = mmh3.hash(shingle, int(seed))
                else:
                    hash_value = mmh3.hash128(shingle, int(seed))
                if self._min_value is None:
                    self._min_value = hash_value
                elif self._min_value > hash_value:
                    self._min_value = hash_value
            signature.append(self._min_value)
        return signature
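Because the `mmh3` functions take a seed, the multi-hash method is easy to reproduce standalone: one minimum per seed, each seed acting as an independent hash function. A minimal 64-bit sketch (the seed count and shingles are illustrative):

import mmh3

def minhash_signature(shingles, seeds=range(10)):
    # For each seed, keep the smallest hash over all shingles.
    return [min(mmh3.hash64(s, seed)[0] for s in shingles) for seed in seeds]

print(minhash_signature(["the quick", "quick brown", "brown fox"]))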
Example #10
 def update(self,instance,y):
     for aggKey in self._keys:
         key_for_update = hash64(str(tuple([key+'_'+instance[key] for key in aggKey])))  # hash for memory issue
         temp_list = self._counter_map[key_for_update]
         if len(temp_list) == self.mem_len:
             temp_list.popleft()
         temp_list.append((self.time,y))
Example #11
def compare_python_to_reference_murmur3_64(data: Any, seed: int = 0) -> None:
    """
    Checks the pure Python implementation of 64-bit murmur3 against the
    ``mmh3`` C-based module.

    Args:
        data: data to hash
        seed: seed

    Raises:
        AssertionError: if the two calculations don't match

    """
    assert mmh3, "Need mmh3 module"
    c_data = to_str(data)
    # noinspection PyUnresolvedReferences
    c_signed_low, c_signed_high = mmh3.hash64(c_data, seed=seed,
                                              x64arch=IS_64_BIT)
    py_data = to_bytes(c_data)
    py_signed_low, py_signed_high = pymmh3_hash64(py_data, seed=seed)
    preamble = (
        f"Hashing {data!r} with MurmurHash3/64-bit values from 128-bit "
        f"hash/seed={seed}"
    )
    if c_signed_low == py_signed_low and c_signed_high == py_signed_high:
        print(preamble + f" -> (low={c_signed_low}, high={c_signed_high}): OK")
    else:
        raise AssertionError(
            preamble +
            f"; mmh3 says {c_data!r} -> "
            f"(low={c_signed_low}, high={c_signed_high}), "
            f"Python version says {py_data!r} -> "
            f"(low={py_signed_low}, high={py_signed_high})")
Example #12
 def bloom_add(self, buffer):
     if self.ready == 0:
         print("Bloom filter was not created properly\n")
         return 1
     else:
         if self.no_of_elements < self.entries:
             a = mmh3.hash64(buffer)[0]
             b = mmh3.hash64(buffer)[1]
             for i in range(self.hashes):
                 x = (a + i * b) % (self.bits)
                 self.bf[x] = 1
             self.no_of_elements += 1  # count the element once, not once per hash
         else:
             print("Bloom filter capacity crossed\n")
             return -1
         return 0
Example #13
def benchmark(config):
    log_info("Starting...")

    q = qemu(1337, config, debug_mode=False)
    q.start(verbose=False)
    q.set_payload(open(config.argument_values["payload"][0]).read())
    print(mmh3.hash64(q.send_payload()))
    try:
        while True:
            start = time.time()
            execs = 0
            while (time.time() - start < REFRESH):
                q.set_payload(
                    open(config.argument_values["payload"][0]).read())
                q.send_payload()
                execs += 1
            end = time.time()
            stdout.write(common.color.FLUSH_LINE + "Performance: " +
                         str(execs / (end - start)) + "t/s")
            stdout.flush()
    except:
        print("\nExit")

    q.__del__()
    try:
        for i in range(512):
            if os.path.exists("/tmp/kAFL_printf.txt." + str(i)):
                os.remove("/tmp/kAFL_printf.txt." + str(i))
            else:
                break
    except:
        pass
    return 0
Example #14
    def _k_smallest_hash(self, document):
        """ Generates a texts minhash signature using k smallest neighbours method.

        Uses a single random hash to simulate a shuffle of each texts shingles.
        Then selecting i smallest minimum hash values for j permutations.

        Faster but less stable than multi hash method.

        Args:
            document (list): List of text shingles.

        Returns:
            list: List of text signatures generated using k smallest neighbours method.

        """
        signature = []
        # Uses a heap to make calculating n smallest values more efficient.
        heapq.heapify(signature)
        if len(document) <= self.permutations:
            raise ValueError(
                'N permutations must not be >= n shingles for k_smallest_values method'
            )
        for shingle in document:
            if self.hash_bits == 64:
                hashed_shingle = mmh3.hash64(shingle, self._hash_seeds)[0]
            elif self.hash_bits == 32:
                hashed_shingle = mmh3.hash(shingle, self._hash_seeds)
            else:
                hashed_shingle = mmh3.hash128(shingle, self._hash_seeds)
            heapq.heappush(signature, hashed_shingle)
        return heapq.nsmallest(self.permutations, signature)
Example #15
 def get_indexes(self, key):
     """
     Generates the indices corresponding to the given key
     """
     h1, h2 = mmh3.hash64(key)
     for i in xrange(self.num_hashes):
         yield (h1 + i * h2) % self.num_bytes
Example #16
    def insert(self, codelet):
        """
        Insert a codelet into the database.

        :param codelet: The codelet to insert.
        :type codelet: :py:class:`.Codelet`
        """
        query1 = """INSERT INTO code VALUES (?, ?, ?)
                    ON DUPLICATE KEY UPDATE code_id=code_id"""
        query2 = """INSERT INTO codelets VALUES
                    (DEFAULT, ?, ?, ?, ?, ?, ?, ?)"""
        query3 = "INSERT INTO authors VALUES (DEFAULT, ?, ?, ?)"

        hash_key = str(codelet.language) + ":" + codelet.code.encode("utf8")
        code_id = mmh3.hash64(hash_key)[0]

        with self._conn.cursor() as cursor:
            cursor.execute(query1, (code_id, codelet.language, codelet.code))
            if cursor.rowcount == 1:
                for sym_type, symbols in codelet.symbols.iteritems():
                    self._insert_symbols(cursor, code_id, sym_type, symbols)
            origin, url = self._decompose_url(cursor, codelet.url)
            cursor.execute(query2, (codelet.name, code_id, origin, url,
                                    codelet.rank, codelet.date_created,
                                    codelet.date_modified))
            codelet_id = cursor.lastrowid
            authors = [(codelet_id, a[0], a[1]) for a in codelet.authors]
            cursor.executemany(query3, authors)
Example #17
def run(encoding_method, files):
    path = './temp_idx/'
    if not os.path.exists(path):
        os.makedirs(path)
    index, url_list = {}, []
    current_partition_id = 0

    for doc_idx, doc in tqdm(files):
        doc_idx = int(doc_idx)
        text = get_doctext(doc_idx)
        url_list.append(doc['url'] + '\n')
        terms = set(extract_words(text))
        if len(terms) == 0:
            # print('Document {0} is empty'.format(doc_idx))
            continue
        for term in terms:
            key = mmh3.hash64(term)[0]
            if key in index:
                index[key].append(doc_idx)
            else:
                # Zero index is used for delta computation for first document in sequence
                index[key] = [0, doc_idx]
        if (doc_idx + 1) % index_partition_size == 0:
            filename = os.path.join(INDEX_PATH,
                                    'part{0:03d}'.format(current_partition_id))
            write_index_partition(filename, index, encoding_method)
            current_partition_id += 1
    filename = os.path.join(INDEX_PATH,
                            'part{0:03d}'.format(current_partition_id))
    write_index_partition(filename, index, encoding_method)

    with open(os.path.join(INDEX_PATH, 'encoding.ini'), 'w') as config_file:
        config_file.write(encoding_method)
    with open(os.path.join(INDEX_PATH, 'url_list'), 'w') as f:
        f.writelines(url_list)
Example #18
def hashing(item):
    h1 = murmur.hash64(item)
    hashes = []
    for i in range(1, k + 1):
        uniq = (h1[0] + i * h1[1]) % m
        hashes.append(uniq)
    return hashes
Example #19
    def might_contain(self, key, num_hash_functions, array):
        bit_size = len(array)
        if isinstance(key, int) and self.INT_MIN <= key <= self.INT_MAX:
            hash1, hash2 = mmh3.hash64(key.to_bytes(4, byteorder="little"))
        elif isinstance(key, int) and self.LONG_MIN <= key <= self.LONG_MAX:
            hash1, hash2 = mmh3.hash64(key.to_bytes(8, byteorder="little"))
        else:
            hash1, hash2 = mmh3.hash64(key)

        combined_hash = hash1
        for _ in range(num_hash_functions):
            index = (combined_hash & self.LONG_MAX) % bit_size
            if not array[index]:
                return False
            combined_hash += hash2
        return True
Example #20
 def update_min_hash_signature(word, min_hash_signature):
     root_hash = mmh3.hash64(pickle.dumps(word))[0]
     word_hashes = np.bitwise_xor(
         masks, root_hash
     )  # XOR root hash with k randomly generated integers to simulate k hash functions
     min_hash_signature = np.minimum(min_hash_signature, word_hashes)
     return min_hash_signature
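The XOR trick above replaces k independent hash functions with one root hash and k fixed random masks. A self-contained sketch of the whole signature update (mask count, seed, and words are assumptions):

import mmh3
import numpy as np

k = 16
rng = np.random.RandomState(42)
# k fixed 64-bit masks, generated once and reused for every word
masks = rng.randint(np.iinfo(np.int64).min, np.iinfo(np.int64).max,
                    size=k, dtype=np.int64)

signature = np.full(k, np.iinfo(np.int64).max, dtype=np.int64)
for word in ["alpha", "beta", "gamma"]:
    root_hash = mmh3.hash64(word.encode("utf-8"))[0]
    word_hashes = np.bitwise_xor(masks, root_hash)  # k simulated hash values
    signature = np.minimum(signature, word_hashes)
print(signature)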
Example #21
def process_file(tenant, file_path, pipeline, lines):
    line_counter = lines
    with open(file_path) as f:
        for line in f:
            uuid, tids = line.strip().split(LINE_DELIMITER)
            hashed_bytes = hash64(uuid)[0].to_bytes(HASH_BYTES,
                                                    byteorder=BYTE_ORDER,
                                                    signed=HASH_SIGNED)
            tenant_bytes = int(tenant).to_bytes(1,
                                                byteorder=BYTE_ORDER,
                                                signed=HASH_SIGNED)
            redis_key, redis_field = tenant_bytes + hashed_bytes[:KEY_BYTES], hashed_bytes[
                KEY_BYTES:KEY_BYTES + FIELD_BYTES]

            redis_value = bytearray()
            for tid in tids.split(TID_DELIMITER):
                redis_value.extend(
                    int(tid).to_bytes(TID_BYTES,
                                      byteorder=BYTE_ORDER,
                                      signed=HASH_SIGNED))
            pipeline.hset(redis_key, redis_field, bytes(redis_value))

            line_counter += 1
            if line_counter % PIPELINE_SIZE == 0:
                pipeline.execute()
                logging.info(line_counter)

    pipeline.execute()
    return line_counter
Example #22
    def copy_bitmap(self,
                    shm,
                    num,
                    size,
                    bitmap,
                    payload,
                    payload_size,
                    effector_mode=False):
        new_hash = mmh3.hash64(bitmap)
        if not (self.crashed or self.kasan or self.timeout):
            if new_hash in self.lookup.non_finding:
                if effector_mode:
                    shm.seek(size * num)
                    shm.write(bitmap)
                    return True
                else:
                    shm.seek((size * num) + len(bitmap))
                    return False

        if not (self.crashed or self.kasan
                or self.timeout) and not self.check_for_unseen_bits(bitmap):
            self.lookup.non_finding[new_hash] = None
            return False
        if not (self.timeout):
            bitmap = self.verifiy_input(payload, bitmap, payload_size)
            shm.seek(size * num)
            shm.write(bitmap)
        self.lookup.non_finding[new_hash] = None
        return True
Example #24
File: core.py Project: ufwt/kAFL
def debug_execution(config, execs, qemu_verbose=False, notifiers=True):
    log_debug("Starting debug execution...(%d traces)" % execs)

    payload_file = config.argument_values["input"]
    zero_hash = mmh3.hash64(("\x00" * config.config_values['BITMAP_SHM_SIZE']))
    q = qemu(1337, config, debug_mode=True, notifiers=notifiers)
    q.start()
    start = time.time()
    for i in range(execs):
        log_debug("Launching payload %d/%d.." % (i + 1, execs))
        if i % 3 == 0:
            q.set_payload(read_binary_file(payload_file))
        # time.sleep(0.01 * rand.int(0, 9))
        # a = str(q.send_payload())
        # hexdump(a)
        result = q.send_payload()
        current_hash = result.hash()
        if zero_hash == current_hash:
            log_debug("Feedback Hash: " + str(current_hash) +
                      common.color.WARNING + " (WARNING: Zero hash found!)" +
                      common.color.ENDC)
        else:
            log_debug("Feedback Hash: " + str(current_hash))
            #log_debug("Full hexdump:\n" + hexdump(result.copy_to_array()))
        if result.is_crash():
            q.restart()

    q.shutdown()
    end = time.time()
    print("Performance: " + str(execs / (end - start)) + "t/s")

    return 0
Example #25
def compute(filename, *, k=21, scaled=1000, output=None, **kwargs):
    import mmh3
    import screed

    # compute the actual hashes to insert by breaking down the sequence
    # into k-mers and applying MurmurHash to each one; here, the only
    # interesting thing that is done by add() is to keep only the
    # hashes smaller than max_hash, where max_hash =  2^64 / scaled.

    mh = ScaledMinHash(k=k, scaled=scaled, filename=filename)
    name = None
    with screed.open(filename) as f:
        for record in f:
            if name is None:
                name = record.name

            for kmer in canonical_kmers(record.sequence, k):
                h = mmh3.hash64(kmer, seed=42)[0]

                # convert to unsigned int if negative
                if h < 0:
                    h += 2**64

                mh.add(h)

    mh.name = name
    if output is None:
        output = sys.stdout

    mh.save(output)
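The `scaled` parameter works as a hash cutoff: only hashes below 2**64 / scaled are kept, so roughly one in every `scaled` k-mers survives. A hedged sketch of the check that `add()` presumably applies (inferred from the comment above, not taken from ScaledMinHash itself):

import mmh3

scaled = 1000
max_hash = 2**64 // scaled  # keep ~1/scaled of all k-mers

def keep_kmer(kmer):
    h = mmh3.hash64(kmer, seed=42)[0]
    if h < 0:        # convert to unsigned, as in compute()
        h += 2**64
    return h < max_hash

print(keep_kmer("ACGTACGTACGTACGTACGTA"))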
Example #26
    def copy_bitmap(self,
                    shm,
                    num,
                    size,
                    bitmap,
                    payload,
                    payload_size,
                    effector_mode_hash=None,
                    apply_patches=True):
        if self.crashed or self.kasan or self.timeout:
            shm.seek(size * num)
            shm.write(bitmap)
            return True

        new_hash = mmh3.hash64(bitmap)

        if effector_mode_hash and effector_mode_hash != new_hash:
            shm.seek(size * num)
            shm.write(bitmap)
            return True

        if self.lookup.check_value(new_hash):
            shm.seek((size * num) + len(bitmap))
            return False

        if not (self.timeout) and not self.check_for_unseen_bits(bitmap):
            self.lookup.set_value(new_hash)
            return False
        shm.seek(size * num)
        shm.write(bitmap)
        self.lookup.set_value(new_hash)

        return True
Example #27
 def find(self, item):
     key = mmh3.hash64(item)[0]
     bucket_idx = key % self.buckets_count
     target_bucket = self.buckets[bucket_idx]
     term_idx = TermDictionary.__binary_search(target_bucket[:, 0], key)
     offset, n_bytes = target_bucket[term_idx, 1], target_bucket[term_idx, 2]
     return offset, n_bytes
Example #28
    def get_hash(cls, item, case_sensitive):
        """ Returns two hash values computed using single hash function on item
        ...
        Parameters
        ----------
        item : object
            The input element to add or check
        case_sensitive: bool
            Flag indicates if item value is case sensitive

        Returns
        -------
        tuple
            Two hash values computed using the mmh3 hash function
        """
        try:
            # If case_sensitive is False, convert item to lowercase so the
            # add and exists methods ignore case differences
            if not case_sensitive:
                item = item.lower()

            # Hashing is compute intensive. Hence we compute it only once.
            # A non-cryptographic hash function is fast and serves the purpose of a Bloom filter.
            # mmh3.hash64 hash function returns two hash values for input item
            hash_vals = mmh3.hash64(item)
            return hash_vals
        except Exception as e:
            logger.log(str(e))
            sys.exit(1)
Example #29
def exec_tree(query_tree, Term_dict, InvIndexEncoded, num_docID):
    if(query_tree is None):
        return set()
    if(query_tree.is_operator):
        S1 = exec_tree(query_tree.left, Term_dict, InvIndexEncoded, num_docID)
        S2 = exec_tree(query_tree.right, Term_dict, InvIndexEncoded, num_docID)
        op = query_tree.value
        if(op == '!'):
            S = set(range(num_docID))
            return S - S2
        elif(op == '&'):
            return S1 & S2
        elif(op == '|'):
            return S1 | S2
        else:
            print "ERROR"
            return None
    else:
        ###query_tree.is_term == True
        term_hash = mmh3.hash64(query_tree.value)[0]
        if(term_hash in Term_dict):
            substr = InvIndexEncoded[Term_dict[term_hash][0] : Term_dict[term_hash][0] + Term_dict[term_hash][1]]
            return set(encoder.decode(substr))
        else:
            return set()
Example #30
	def readHash(self):
		hll = Hll(self.p)
		x = sys.stdin.readline().rstrip('\n')
		while x:
			hll.AddItem64(mmh3.hash64(str(x))[0])
			x = sys.stdin.readline().rstrip('\n')
		print hll.Count64()
Example #31
 def _indexes(self, key):
     """
     Generates the indices corresponding to the given key
     """
     h1, h2 = mmh3.hash64(key)
     for i in xrange(self.num_hashes):
         yield (h1 + i * h2) % self.num_bytes
Example #32
def client():
    nodes_ = []
    # position servers on the hash ring
    for i in range(len(servers)):
        nodes_.append(Node(servers[i], 123 * i, 100))

    # now reading data in the csv and pushing it to selected server
    with open('causes-of-death.csv', mode='r') as csvfile:
        csvreader_ = csv.reader(csvfile, delimiter=',')
        row_number_ = 0
        for row_ in csvreader_:
            if row_number_ == 0:
                # skip header
                row_number_ += 1
                continue
            #print(', '.join(row_))
            key_ = "%s:%s:%s" % (row_[0], row_[2], row_[3])
            hash_ = mmh3.hash64(key_)[0]
            node_ = determine_responsible_node(nodes_, key_)
            #print("{} - {} - {}".format(hash_, str(node_), ','.join(row_)))
            payload_ = {'{}'.format(hash_): ','.join(row_)}
            #print(payload_)
            # prepare http request and post payload
            req_ = urllib.request.Request("{}/api/v1/entries".format(
                node_.name_))
            req_.add_header('Content-Type', 'application/json; charset=utf-8')
            json_data_ = json.dumps(payload_)
            json_data_bytes_ = json_data_.encode('utf-8')
            req_.add_header('Content-Length', len(json_data_bytes_))
            resp_ = urllib.request.urlopen(req_, json_data_bytes_)
            print("{} - {}".format(resp_.reason, payload_))
Example #33
 def murmur3_64bit(obj):
     """
     Use murmur3_64bit for 64 bit hash by passing this method:
     hasher=DeepHash.murmur3_64bit
     """
     obj = obj.encode('utf-8')
     # This version of murmur3 returns two 64bit integers.
     return mmh3.hash64(obj, MURMUR_SEED)[0]
Example #35
def find_lsh_buckets(hash_signature, lsh_band_width, lsh_num_buckets):
    bands = [
        tuple(hash_signature[i:i + lsh_band_width])
        for i in range(0, len(hash_signature), lsh_band_width)
    ]
    lsh_hashes = [(mmh3.hash64(pickle.dumps(row))[0] % lsh_num_buckets)
                  for row in bands]
    return lsh_hashes
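A quick usage sketch: a 12-value signature with a band width of 4 yields 3 bucket ids, each in [0, lsh_num_buckets) because Python's % is non-negative for a positive modulus (the inputs are illustrative):

signature = list(range(12))
buckets = find_lsh_buckets(signature, lsh_band_width=4, lsh_num_buckets=1000)
assert len(buckets) == 3 and all(0 <= b < 1000 for b in buckets)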
Example #36
 def update_min_hash_signature(self, word, min_hash_signature):
     root_hash = mmh3.hash64(word.encode("ascii", "ignore"))[0]
     # root_hash = mmh3.hash64(pickle.dumps(word))[0]  # For MinHashing shingles
     word_hashes = np.bitwise_xor(
         self._masks, root_hash
     )  # XOR root hash with k randomly generated integers to simulate k hash functions, can add bitroll if there's time
     min_hash_signature = np.minimum(min_hash_signature, word_hashes)
     return min_hash_signature
Example #37
 def gen_features(self,instance,logtime,D):
     # generate features based on instance's attribute.
     # For each key, we generate hash((bin(logtime-time[i]),i,lastY[i])) % D
     for aggKey in self._keys:
         key_for_feature = hash64(str(tuple([key+'_'+instance[key] for key in aggKey])))
         for idx, content in enumerate(self._counter_map[key_for_feature]):
             time, lastY = content
             val = int(log((logtime - time).total_seconds() + 1.))
             yield abs(hash(str(aggKey)+'_'+str(idx)+'_'+str((val,lastY)))) % D , 1.
Example #38
def murmurhash3_64(item, seed = 0):
    """
    Murmurhash 3 for 64-bit integers (returns the first of a tuple of two)
    """
    if type(item) is not str: 
        item = str(item)
    if type(seed) is not int:
        seed = int(seed)
    return mmh3.hash64(item, seed=seed)[0]
Example #39
 def params_stand(self, infos, res_id):
     """params to stand DB"""
     infos['issue_time'] = infos.pop("create_time")
     infos['res_id'] = res_id
     hstr = "%snaviappfeedback" % infos['res_id']
     infos['mid'] = mmh3.hash64(hstr)[0]    #unique
     infos['intelligence_source'] = 31
     infos['dispatch_flag'] = 0
     infos['update_time'] = infos['commit_time']
Example #40
def hash64(key, seed):
    """
    Wrapper around mmh3.hash64 to get us single 64-bit value.

    This also does the extra work of ensuring that we always treat the
    returned values as big-endian unsigned long, like smhasher used to
    do.
    """
    hash_val = mmh3.hash64(key, seed)[0]
    return struct.unpack('>Q', struct.pack('q', hash_val))[0]
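Note that `struct.pack('q', ...)` writes native byte order while `'>Q'` reads big-endian, so on a little-endian host this wrapper byte-swaps the value in addition to dropping the sign. A small check of that behaviour (illustrative input, assumes a little-endian machine):

import struct
import mmh3

signed = mmh3.hash64('example', 0)[0]
unsigned = signed & 0xFFFFFFFFFFFFFFFF  # plain two's-complement reinterpretation
swapped = struct.unpack('>Q', struct.pack('q', signed))[0]
assert swapped == int.from_bytes(unsigned.to_bytes(8, 'little'), 'big')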
Example #41
        def go(self):
            hll = Hll(self.p)
            hashvalues = []
            for x in range(0, int(self.n)):
                hashvalues.append(mmh3.hash64(str(x))[0])
            debut = int(round(time.time() * 1000))
            for i in range(0, int(self.n)):
                hll.AddItem64(hashvalues[i])
            fin = int(round(time.time() * 1000))
            print hll.Count64()
            print "time = " + str(fin - debut) + "ms"
Example #42
def find_match(threadName, size, ohash):
  global match
  global solution
  global found_by
  while (match == 0):
    rands = str(bytearray(os.urandom(size)))
    h1 = mmh3.hash64(rands)
    if (h1 == ohash):
      solution = rands
      match = 1
      found_by = threadName
Example #43
def sim_shi4_mm3(text):
    # NB: It makes quite little sense to use both 64bit numbers to compare
    # hashes as pairwise Hamming distance using high 64bit is highly correlated
    # with the distance computed using low 64bit. It's actually expected, but
    # it means, that summing these distances is not linear and should be avoided.
    # -- https://gist.github.com/darkk/e2b2762c4fe053a3cf8a299520f0490e
    i1, i2 = itertools.tee(WORD_RE.finditer(text))
    for _ in xrange(3): # 4 words per shingle
        next(i2, None)
    mm = [mmh3.hash64(text[m1.start():m2.end()]) for m1, m2 in itertools.izip(i1, i2)]
    return (simhash.compute([_[0] & 0xffffffffffffffff for _ in mm]),
            simhash.compute([_[1] & 0xffffffffffffffff for _ in mm]))
Example #44
def tx_partition(app, txid):
  """ Return a blob hash for a given application and transaction ID.

  Args:
    app: A string specifying the application ID.
    txid: An integer specifying the transaction ID.
  Returns:
    A bytearray that can be used as the transaction partition key.
  """
  murmur_int = mmh3.hash64(app + str(txid))[0]
  # Distribute the integer range evenly across the byte ordered token range.
  return bytearray(struct.pack('<q', murmur_int))
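A usage sketch: whatever the inputs, the partition key is always the 8-byte little-endian encoding of the signed 64-bit hash (the application ID and transaction ID here are illustrative):

key = tx_partition('guestbook', 42)
assert isinstance(key, bytearray) and len(key) == 8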
Example #45
def compressor_worker():
    while not q.empty():
        w_base, w_rel_base, w_f = q.get()

        w_rel_base = '' if w_rel_base == '.' else w_rel_base

        abs_path = os.path.join(w_base, w_f)
        rel_path = os.path.join(w_rel_base, w_f)

        extension = os.path.splitext(rel_path)[1][1:]

        raw_filestring = open(abs_path).read()
        compressed_filestring = lzo.compress(raw_filestring, options.compression)

        len_raw = len(raw_filestring)
        len_compressed = len(compressed_filestring)

        compression_factor = (float(len_compressed) / len_raw) if len_raw else 0
        compression_used = False

        # NB: the trailing "and False" unconditionally disables compression
        if compression_factor < options.cutoff and False:
            compression_used = True

        string_final = compressed_filestring if compression_used else raw_filestring
        len_final = len(string_final)
        adler32_final = lzo.adler32(string_final)

        compressed_data_chunks.append({
            'path': rel_path,
            'path_mmh3': mmh3.hash64(rel_path)[0],
            'adler32': adler32_final,
            'size_before': len_raw,
            'size_after': len_final,
            'factor': compression_factor,
            'compression': 1 if compression_used else 0,
            'extension_str': extension,
            'extension': extensions[extension] if extension in extensions else 0,
            'data': string_final
        })

        if options.verbose:
            print('\t'.join((
                'Y' if compression_used else 'N',
                extension,
                '%.02f' % (compression_factor * 100.0),
                str(len_raw / 1024),
                str(len_final / 1024),
                str(adler32_final),
                rel_path
            )))

        q.task_done()
Example #46
def map_sketch(input, options):
  '''
  Returns a list of sketches (x,r, pos, d)
  x = hashed sketch
  r = ID of original sequence
  pos = index of kmer starting position in original sequence
  d = count of kmers extracted
  '''
  id_seq = input.split('\t')
  # pp.pprint(type(id_seq[0]))
  sketches = [(mmh3.hash64(i[0])[0], (int(id_seq[0]), i[1], len(id_seq[1]) - options.kmer + 1)) for i in gen_kmers(id_seq[1], options)]
  # if (mmh3.hash64(i)[0] % options.mod == 0)
  return sketches
Example #47
def hash_line(line, n, size, order=1):
    line = line.strip().lower().split()
    res = []
    for w in ngrams(line, order):
        h1, h2 = mmh3.hash64(w)
        for s in range(n):
            hashval = (h1 + s * h2) % size
            res.append(int(hashval))
        # res.append(hash("%s\t%s" % (s, w)) % size)
    res = list(set(res))
    res.sort()
    # print ("%d => %d" %(len(line), len(res)))
    return res
Example #48
def test_compute_librarylink_hash(inputdata, expected):
    bits128 = mmh3.hash64(inputdata)
    bits64 = bits128[0]
    hexbits64 = hex(bits64)
    hexbits128 = [ hex(x) for x in bits128 ]
    octets = struct.pack('!q', bits64)
    octets_raw = [ hex(c) for c in octets ]
    octets_rawer = [ hex(c)[2:].zfill(2) for c in octets ]
    encoded = base64.urlsafe_b64encode(octets).rstrip(b"=")
    encoded_unsafe = base64.b64encode(octets).rstrip(b"=")
    assert encoded == expected, (encoded, expected)
    encoded = simple_hashstring(inputdata).encode('ascii')
    assert encoded == expected, (encoded, expected)
Example #49
    def search(self, tree, page=1):
        """
        Search the database for a query and return the *n*\ th page of results.

        :param tree: The query to search for.
        :type tree: :py:class:`~.query.tree.Tree`
        :param page: The result page to display.
        :type page: int

        :return: The total number of results, and the *n*\ th page of results.
        :rtype: 2-tuple of (long, list of :py:class:`.Codelet`\ s)
        """
        query1 = "SELECT 1 FROM cache WHERE cache_id = ?"
        query2 = """SELECT cdata_codelet, cache_count_mnt, cache_count_exp
                    FROM cache
                    INNER JOIN cache_data ON cache_id = cdata_cache
                    WHERE cache_id = ?
                    ORDER BY cdata_index ASC"""
        query3 = "INSERT INTO cache VALUES (?, ?, ?, DEFAULT)"
        query4 = "INSERT INTO cache_data VALUES (?, ?, ?)"

        cache_id = mmh3.hash64(str(page) + ":" + tree.serialize())[0]

        with self._conn.cursor() as cursor:
            cursor.execute(query1, (cache_id,))
            cache_hit = cursor.fetchall()
            if cache_hit:
                cursor.execute(query2, (cache_id,))
                rows = cursor.fetchall()
                num_results = rows[0][1] * (10 ** rows[0][2]) if rows else 0
                ids = [row[0] for row in rows]
            else:
                ids, num_results = self._search_with_query(cursor, tree, page)
                num_exp = max(len(str(num_results)) - 3, 0)
                num_results = int(round(num_results, -num_exp))
                num_mnt = num_results / (10 ** num_exp)
                cursor.execute(query3, (cache_id, num_mnt, num_exp))
                cdata = [(cache_id, c_id, i) for i, c_id in enumerate(ids)]
                cursor.executemany(query4, cdata)
            codelet_gen = self._get_codelets_from_ids(cursor, ids, tree)
            return (num_results, list(codelet_gen))
Example #50
    def post(self):
        status, dinfo = self.storinfo()
        if status == "error":
            self.jsonError({"msg":status})
            return 

        # insert into source_db
        try:
            #sourcedb_result = yield self.db_s.ns_map_infoplat.rf_naviapp_feedback.update(\
            #                  {"ugc_id":dinfo['ugc_id']}, dinfo, upsert=True, w=True)
            sourcedb_result = yield self.db_s.ns_map_infoplat.rf_naviapp_feedback.insert(dinfo)
        except pymongo.errors.ConnectionFailure:
            self.jsonError({"msg": "connect mongodb timeout"})
            return 
        '''
        dinfo['res_id'] = sourcedb_result.get("upserted", "ERROR")
        if dinfo['res_id'] == "ERROR":
            self.jsonError({"msg": "insert source_db ERROR, ugc_id is duplicated"})
            return 
        '''

        dinfo['res_id'] = sourcedb_result
        hstr = "%snaviappfeedback" % dinfo['res_id']
        dinfo['mid'] = mmh3.hash64(hstr)[0]    #unique
        dinfo['intelligence_source'] = 31
        dinfo['dispatch_flag'] = 0

        #form geom in mongodb
        for geo_field in ("siwei_link1_list", "siwei_link2_list", "current_path_list", "current_track_list"):
            dinfo[geo_field] = self.form_dict_geom(dinfo[geo_field])

        # insert into stand_db
        try:
            standdb_result = yield self.db_r.info.inte_naviapp_feedback.insert(dinfo)
        except pymongo.errors.PyMongoError as e:
            self.jsonError({"msg": e})
            return 
        
        self.jsonOk()

        '''
Example #51
  def __init__(self, url):
    self.url = urlnorm.norm(url)
    self.tld = get_tld(url).encode('ascii','ignore')

    self.crawl_data = {}
    self.crawl_data['failure'] = False
    self.crawl_data['url'] = self.url
    self.crawl_data['id'] = mmh3.hash64(self.url)[0]

    self.cassandra_cluster = ['127.0.0.1']
    self.keyspace = 'crawlr'
    self.cluster = Cluster(self.cassandra_cluster)
    self.session = self.cluster.connect(self.keyspace)
    self.session.row_factory = dict_factory

    # Prepared Cassandra queries.
    self.check_prepped_stmt = self.session.prepare(
      """
        SELECT id FROM pages WHERE id = ?;
      """)
    self.add_fail_prepped_stmt = self.session.prepare(
      """
        UPDATE failure_counts SET failures = failures + 1 WHERE id = ?;
      """)
    self.del_fail_prepped_stmt = self.session.prepare(
      """
        DELETE failures FROM failure_counts where id = ?;
      """)
    self.add_crawl_prepped_stmt = self.session.prepare(
      """
        INSERT INTO pages (
          id,
          url,
          crawled_at,
          failure,
          title, 
          body,
          internal_links,
          outbound_links)
        VALUES (?, ?, ?, ?, ?, ?, ?, ?)
      """)
Example #52
def simple_hashstring(obj, bits=64):
    '''
    Creates a simple hash in brief string form from obj
    bits is an optional bit width, defaulting to 64, and should be in multiples of 8 with a maximum of 64

    >>> from bibframe.contrib.datachefids import simple_hashstring
    >>> simple_hashstring("The quick brown fox jumps over the lazy dog")
    'bBsHvHu8S-M'
    >>> simple_hashstring("The quick brown fox jumps over the lazy dog", bits=48)
    'B7x7vEvj'
    '''
    #Useful discussion of techniques here: http://stackoverflow.com/questions/1303021/shortest-hash-in-python-to-name-cache-files
    #Use MurmurHash3
    #Get a 64-bit integer, the first half of the 128-bit tuple from mmh and then bit shift it to get the desired bit length
    basis = mmh3.hash64(str(obj))[0] >> (64-bits)
    if bits == 64:
        raw_hash = struct.pack('!q', basis)
    else:
        raw_hash = struct.pack('!q', basis)[:-int((64-bits)/8)]
    hashstr = base64.urlsafe_b64encode(raw_hash).rstrip(b"=")
    return hashstr.decode('ascii')
Example #53
def simple_hashstring(obj, bits=48):
    '''
    Creates a simple hash in brief string form from obj
    bits is an optional bit width, defaulting to 48, and should be in multiples of 8

    >>> from datachef.ids import simple_hashstring
    >>> simple_hashstring("The quick brown fox jumps over the lazy dog")
    B7x7vEvj
    '''
    #Useful discussion of techniques here: http://stackoverflow.com/questions/1303021/shortest-hash-in-python-to-name-cache-files

    #Abandoned idea of using MD5 and truncating
    #raw_hash = hashlib.md5(title).digest()
    #Abandoned Adler32 for MurmurHash3
    #raw_hash = struct.pack('i', zlib.adler32(title[:plain_len]))
    #Use MurmurHash3
    #Get a 64-bit integer, the first half of the 128-bit tuple from mmh and then bit shift it to get the desired bit length
    basis = mmh3.hash64(str(obj))[0] >> (64-bits)
    raw_hash = struct.pack('l', basis)[:-int((64-bits)/8)]
    hashstr = base64.urlsafe_b64encode(raw_hash).rstrip(b"=")
    return hashstr.decode('ascii')
Example #54
CARD = 100000
NB = 1000
STEP = 500

values = []

# An array of CARD/STEP cardinalities
for i in range(CARD/STEP):
    values.append([])
x = 0


for x in range(NB):
    hll = Hll(14)
    for j in range(CARD):
        hll.AddItem64(mmh3.hash64(str(random.randrange(0,CARD*10)))[0])
        # Every STEP items
        if j%STEP == 0:
            count = hll.Count64()
            x1 =  min(valuesdata, key=lambda x:abs(x-count))
            y1 = data[x1]
            if valuesdata.index(x1)+1 != len(valuesdata):
               x2 = valuesdata[valuesdata.index(x1)+1]
               y2 = data[x2]
               inter = interpolation(count,x1,y1,x2,y2)
               values[j/STEP].append(inter)
            else:
               values[j/STEP].append(count)
         

Example #55
CARD = 100000
NB = 100
STEP = 500

# An array of CARD/STEP cardinalities
values = []
for i in range(CARD/STEP):
    values.append([])
x = 0
z = 0
# Run NB tests
for x in range(NB):
    print x
    hll = Hll(14)
    for j in range(CARD):
        hll.AddItem64(mmh3.hash64(str(z))[0])
        z+=1
        # Every STEP items
        if j%STEP == 0:
            count = hll.Count64()
            values[j/STEP].append(count)


for i in range(len(values)):
    sum = 0
    for j in range(len(values[i])):
        sum = sum + values[i][j]
    avg = sum / len(values[i])
    
    mid = percentile(values[i],0.50)
    one = percentile(values[i],0.01)
Example #56
File: vk.py Project: asash/vkpy
def vk_hash(string):
    return mmh3.hash64(string)[0]
Example #57
 def _indexes(self, key):
     h1, h2 = mmh3.hash64(key)
     for i in xrange(self.num_hashes):
         yield (h1 + i * h2) % self.num_bits
Example #58
 def test_hash64(self):
     h1, h2 = mmh3.hash64('hello')
     assert (h1, h2) == (-3758069500696749310, 6565844092913065241)
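By default `mmh3.hash64` returns the two signed 64-bit halves of the 128-bit x64 hash; recent mmh3 versions also accept a `signed` flag (used as `signed=True` in Example #7 above) to get the unsigned equivalents. A quick sketch of the relationship:

import mmh3

h1, h2 = mmh3.hash64('hello')                # signed, the default
u1, u2 = mmh3.hash64('hello', signed=False)  # unsigned variant
assert u1 == h1 & 0xFFFFFFFFFFFFFFFF
assert u2 == h2 & 0xFFFFFFFFFFFFFFFF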
Example #59
def generate_hash(s):
    """Generates hash for the string argument."""

    k1, k2 = mmh3.hash64(s)
    anded = 0xFFFFFFFFFFFFFFFF
    return '%016x%016x' % (k1 & anded, k2 & anded)
Example #60
def main(argv):
	input = ["book", "surprise", "and", "nightmare", "or", "caravans", "not", "pray"]
	actions = ["and", "or", "not"]
	result = []
	folderHashMap = {}
	wordsHashMap = {}
	indexDirect = {}
	indexReversed = {}
	if len(argv) > 0:
		# print("give something to search for!")
		argv = [x.lower() for x in argv]
		input = argv

	for i in range(1, len(input)):
		if input[i] in actions and input[i-1] in actions:
			return 1  # two consecutive operators: invalid query

	currentDirName = os.path.basename(os.path.abspath(os.curdir))
	
	print('file : ' + '.\\output\\' + currentDirName + '.folderHashMap')
	with open('.\\output\\' + currentDirName + '.folderHashMap', 'r') as infile:
		folderHashMap = json.load(infile)

	print('file : ' + '.\\output\\' + currentDirName + '.wordsHashMap')
	with open('.\\output\\' + currentDirName + '.wordsHashMap', 'r') as infile:
		wordsHashMap = json.load(infile)

	print('file : ' + '.\\output\\' + currentDirName + '.indexDirect')
	with open('.\\output\\' + currentDirName + '.indexDirect', 'r') as infile:
		indexDirect = json.load(infile)

	print('file : ' + '.\\output\\' + currentDirName + '.indexReversed\n')
	with open('.\\output\\' + currentDirName + '.indexReversed', 'r') as infile:
		indexReversed = json.load(infile)

	if len(folderHashMap) > 0 and len(wordsHashMap) > 0 and len(indexDirect) > 0 and len(indexReversed) > 0 :
		if len(input) > 1:
			# remove and operations on 0 position
			for i in range(0, len(input)):
				if i == 0 and input[i] in actions:
					input.pop(0)
					i = 0
				else:
					break
			if len(input) > 1:
				# add missing operations
				for i in range(1, len(input)):
					if input[i] not in actions and input[i-1] not in actions:
						input.insert(i, "and")
						i += 2
				action = input[1]
				# array of arrays like [[f1, nrOfOccurrences1], [f2, nrOfOccurrences2], ...]
				wordKey1=str(mmh3.hash64(input[0])[0])
				wordKey2=str(mmh3.hash64(input[2])[0])
				if wordKey1 in indexReversed:
					files_1 = [item[0] for item in indexReversed[wordKey1]]  
				else:
					files_1=[]
					action="or"

				if wordKey2 in indexReversed:	
					files_2 = [item[0] for item in indexReversed[wordKey2]]
				else:
					files_2=[]
					action="or"

				f1_len = len(files_1)
				f2_len = len(files_2)
				
				if action == "and":
					if f1_len > f2_len:
						result = [item for item in files_2 if item in files_1]
					else:
						result = [item for item in files_1 if item in files_2]
				elif action == "or":
					if f1_len > f2_len:
						aux = files_1
						result = [item for item in files_2 if item not in aux]
						result += aux
					else:
						aux = files_2
						result = [item for item in files_1 if item not in aux]
						result += aux
				elif action == "not":
					result = [item for item in files_1 if item not in files_2]
				else:
					print("what the ???\nsomthing is definitely wrong here!")

				print(input[0] + " " + input[1] + " " + input[2])
				for r in result:
					print(folderHashMap[str(r)])
				print("\n")

				size = len(input)
				if size > 3:
					for i in range(3, size-1): 
						if input[i].lower() in actions and input[i+1] not in actions:
							action = input[i]
							# array of arrays like [[f1, nrOfOccurrences1], [f2, nrOfOccurrences2], ...]
							files_1 = result
							wordKey2=str(mmh3.hash64(input[i+1])[0])
							if wordKey2 in indexReversed:
								files_2 = [item[0] for item in indexReversed[wordKey2]]
							else:
								files_2=[]
								action="or"

							f1_len = len(files_1)
							f2_len = len(files_2)
							
							if action == "and":
								if f1_len > f2_len:
									result = [item for item in files_2 if item in files_1]
								else:
									result = [item for item in files_1 if item in files_2]
							elif action == "or":
								if f1_len > f2_len:
									aux = files_1
									result = [item for item in files_2 if item not in aux]
									result += aux
								else:
									aux = files_2
									result = [item for item in files_1 if item not in aux]
									result += aux
							elif action == "not":
								result = [item for item in files_1 if item not in files_2]
							else:
								print("what the ???\nsomthing is definitely wrong here!")
							
							print("previous " + input[i] + " " + input[i+1])
							for r in result:
								print(folderHashMap[str(r)])
							print("\n")
			else:
				if input[0] not in actions:
					key = str(mmh3.hash64(input[0])[0])
					if key in indexReversed:
						result = [item[0] for item in indexReversed[key]]
		else:
			if input[0] not in actions:
				key = str(mmh3.hash64(input[0])[0])
				if key in indexReversed:
					result = [item[0] for item in indexReversed[key]]
	else:
		print("sorry, have no data to search in.")

	print("Files for \"" + str(input) + "\" query : ")
	for r in result:
		print(folderHashMap[str(r)])