def get_feature_res(cursor, feature, extra_selector=""):
    cursor.execute("SELECT DISTINCT(user_id) from {}".format(table_name))
    cb_total = 0.0
    num_vals = 0.0
    cb_count = 0.0
    fp_to_count_cross = {}
    fp_to_count_single = {}
    data = cursor.fetchall()
    for user_id, in data:
        cb_prints = []
        cursor.execute("SELECT image_id from {} where user_id='{}' {}".format(
            table_name, user_id, extra_selector))
        ids = [x for x, in cursor.fetchall()]
        for image_id in ids:
            cb_prints.append(
                Fingerprint(cursor, image_id, table_name,
                            Fingerprint_Type.CROSS, feature))
            single_fp = Fingerprint(cursor, image_id, table_name,
                                    Fingerprint_Type.SINGLE, feature)
            if single_fp in fp_to_count_single:
                fp_to_count_single[single_fp] += 1
            else:
                fp_to_count_single.update({single_fp: 1})
        if len(ids) > 1:
            cb_total += 1.0
            if is_all_same(cb_prints):
                cb_count += 1.0
                fp = cb_prints[0]
                if fp in fp_to_count_cross:
                    fp_to_count_cross[fp] += 1
                else:
                    fp_to_count_cross.update({fp: 1})
    cb_distinct = float(len(fp_to_count_cross))
    cb_unique = 0.0
    for _, count in fp_to_count_cross.items():
        if count == 1:
            cb_unique += 1.0
    single_distinct = float(len(fp_to_count_single))
    single_unique = 0.0
    for _, count in fp_to_count_single.items():
        if count == 1:
            single_unique += 1.0
    cb_total = max(cb_total, 1.0)
    single_distinct = max(single_distinct, 1.0)
    cb_distinct = max(cb_distinct, 1.0)
    frmt = "{:3.1f}%"
    return (frmt.format(single_unique / single_distinct * 100),
            frmt.format(cb_count / cb_total * 100),
            frmt.format(cb_unique / cb_distinct * 100))
def __cross_helper(self, b1, b2, cursor, table_name, attrs, extra_selector):
    cursor.execute("SELECT user_id FROM {} WHERE browser='{}' {}".format(
        table_name, b1, extra_selector))
    tuids = [uid for uid, in cursor.fetchall()]
    uids = []
    for uid in tuids:
        cursor.execute(
            "SELECT user_id FROM {} WHERE user_id='{}' AND browser='{}' {}".
            format(table_name, uid, b2, extra_selector))
        for uid, in cursor.fetchall():
            uids.append(uid)
    if len(uids) == 0:
        return None
    fp_to_count = {}
    num_cross_browser = 0.0
    for uid in uids:
        cursor.execute(
            "SELECT image_id FROM {} WHERE browser='{}' AND user_id='{}'".
            format(table_name, b1, uid))
        image1_id = cursor.fetchone()[0]
        cursor.execute(
            "SELECT image_id FROM {} WHERE browser='{}' AND user_id='{}'".
            format(table_name, b2, uid))
        image2_id = cursor.fetchone()[0]
        fp_1 = Fingerprint(cursor, image1_id, table_name,
                           Fingerprint_Type.CROSS, attrs, b2)
        fp_2 = Fingerprint(cursor, image2_id, table_name,
                           Fingerprint_Type.CROSS, attrs, b1)
        if fp_1 == fp_2:
            num_cross_browser += 1
            if fp_1 in fp_to_count:
                fp_to_count[fp_1] += 1
            else:
                fp_to_count[fp_1] = 1
    entropy = 0.0
    num_distinct = max(float(len(fp_to_count)), 1.0)
    num_unique = 0.0
    for _, count in fp_to_count.items():
        if count == 1:
            num_unique += 1.0
        P = float(count) / float(num_cross_browser)
        entropy -= P * math.log(P, 2)
    num_uids = max(float(len(uids)), 1.0)
    num_cross_browser = max(num_cross_browser, 1.0)
    return (int(num_uids), num_cross_browser / num_uids,
            num_unique / num_cross_browser, entropy, num_cross_browser)
def get_fingerprints_countermeasure(self, countermeasure):
    fps = self.collection.find({'countermeasure': countermeasure})
    fp_objects = []
    for fingerprint in fps:
        fp_objects.append(Fingerprint(fingerprint))
    return fp_objects
def get_fingerprints_experiments(
        cur, min_nb_fingerprints, attributes,
        id_file="./data/consistent_extension_ids.csv"):
    """
    Returns a list of the fingerprints to use for the experiment.
    We keep only fingerprints whose associated user has at least
    min_nb_fingerprints and has no inconsistency.
    """
    with open(id_file, "r") as f:
        # skip the header line
        f.readline()
        ids_query = []
        for line in f.readlines():
            ids_query.append("'" + line.replace("\n", "") + "'")
        ids_query = ",".join(ids_query)

    cur.execute(
        "SELECT *, NULL as canvasJS FROM extensionDataScheme WHERE \
        id in (" + ids_query + ") and \
        id in (SELECT id FROM extensionDataScheme GROUP BY \
        id having count(*) > " + str(min_nb_fingerprints) + ")\
        ORDER by counter ASC")
    fps = cur.fetchall()
    fp_set = []
    for fp in fps:
        try:
            fp_set.append(Fingerprint(attributes, fp))
        except Exception as e:
            print(e)

    return fp_set
def run(self):
    if len(self.fp_vectors) == 0:
        return
    self.timer.start()
    while True:
        with self.lock:
            if self.die is True:
                return
        try:
            msg = self.input_queue.get(timeout=1)
        except:
            continue
        with self.lock:
            for vector in self.fp_map:
                if Fingerprint.cmp_fv(
                        msg, vector[0], self.fingerprint['features']) is True:
                    # check if vector already matched
                    if vector[1] is None:
                        vector[1] = msg
                        vector[2] = self.slice
                        self.matched_vectors += 1
        self.input_queue.task_done()
def get_all_fingerprints(self):
    fps = self.collection.find()
    fp_objects = []
    for fingerprint in fps:
        fp_objects.append(Fingerprint(fingerprint))
    return fp_objects
def create_fingerprints(peaks, fan_value=15):
    """
    Create fingerprints for all the peaks.

    fingerprint = hash:time
    hash = (f1, f2, t2 - t1)
    time = t1
    """
    prints = []
    peaks = list(peaks)
    for i in range(len(peaks)):
        for j in range(1, fan_value):
            if (i + j) < len(peaks):
                f1 = peaks[i][0]
                f2 = peaks[i + j][0]
                t1 = peaks[i][1]
                t2 = peaks[i + j][1]
                t_delta = t2 - t1
                # Hashes must be within 200s of each other
                if t_delta >= 0 and t_delta <= 200:
                    h = '{},{},{}'.format(f1, f2, t_delta)
                    p = Fingerprint(h, t1)
                    prints.append(p)
    return list(set(prints))
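A minimal usage sketch of the peak-pairing above; the namedtuple is a hypothetical stand-in for the project's Fingerprint class, and the peak values are illustrative only.

from collections import namedtuple

# Hypothetical stand-in: create_fingerprints() only needs a hashable
# (hash, offset) pair, which this namedtuple provides.
Fingerprint = namedtuple('Fingerprint', ['hash', 'offset'])

# Peaks are (frequency, time) pairs; each peak is paired with up to
# fan_value - 1 later peaks whose time delta lies between 0 and 200.
peaks = [(440, 0), (880, 3), (220, 5), (660, 400)]
for fp in sorted(create_fingerprints(peaks, fan_value=3)):
    print(fp.hash, fp.offset)   # e.g. 440,880,3 0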
def detect(self, X, y=None, threshold=None):
    """Predict whether samples of X are anomalous or not.

    Parameters
    ----------
    X : np.array of shape=(n_samples,)
        Flows for fitting FlowPrint.

    y : Ignored

    threshold : float, default=None
        Minimum required threshold to consider point benign.
        If None is given, use FlowPrint default.

    Returns
    -------
    result : np.array of shape=(n_samples,)
        Prediction of samples in X: +1 if benign, -1 if anomalous.
    """
    # Get best match for each fingerprint
    prediction = self.predict(X, default=Fingerprint())
    # Compute match score between each best match
    prediction = np.asarray(
        [x.compare(fp) for x, fp in zip(X, prediction)])
    # Return whether matching score is high enough
    return (prediction >= (threshold or self.threshold)) * 2 - 1
def __single_helper(self, b, cursor, table_name, attrs, extra_selector):
    cursor.execute("SELECT image_id FROM {} WHERE browser='{}' {}".format(
        table_name, b, extra_selector))
    image_ids = [uid for uid, in cursor.fetchall()]
    if len(image_ids) == 0:
        return None
    fp_to_count = {}
    for uid in image_ids:
        fp = Fingerprint(cursor, uid, table_name, Fingerprint_Type.SINGLE,
                         attrs)
        if fp in fp_to_count:
            fp_to_count[fp] += 1
        else:
            fp_to_count[fp] = 1
    num_distinct = max(float(len(fp_to_count)), 1.0)
    num_unique = 0.0
    for _, count in fp_to_count.items():
        if count == 1:
            num_unique += 1.0
    num_uids = max(len(image_ids), 1.0)
    return int(num_uids), num_unique / num_uids
def merge_fingerprints(self, fingerprints, threshold=1): """Merge fingerprints based on similarity. Parameters ---------- fingerprints : list List of fingerprints to merge. Returns ------- result : list Merged fingerprints """ #################################################################### # Case default: all fingerprints are different # #################################################################### result = np.asarray(fingerprints) # Retrieve unique fingerprints unique = sorted(set(fingerprints)) #################################################################### # Case 1: all fingerprints are equal # #################################################################### if threshold <= 0: # Create one big merged fingerprint out of all unique fingerprints result[:] = Fingerprint(set().union(*unique)) #################################################################### # Case 2: Merge fingerprints by 0 < threshold < 1 # #################################################################### elif threshold < 1: # Initialise fingerprinting pairs to merge pairs = set([ # Define pairs (fp1, fp2) # For each combination of pairs for fp1, fp2 in self.score_combinations(unique, threshold) # Where similarity >= threshold if fp1.compare(fp2) >= threshold ]) # Create mapping of original fingerprint -> merged fingerprint mapping = dict() # Loop over all fingerprints to be merged for fp1, fp2 in pairs: # Create merged fingerprint fp_merged = mapping.get(fp1, fp1).merge(mapping.get(fp2, fp2)) # Set mappings mapping[fp1] = fp_merged mapping[fp2] = fp_merged # Apply mapping result = np.array([mapping.get(fp, fp) for fp in fingerprints]) #################################################################### # Return merged fingerprints # #################################################################### return result
def load(self, *files, store=True, parameters=False): """Load fingerprints from files. Parameters ---------- file : string Files from which to load fingerprints. store : boolean, default=True If True, store fingerprints in FlowPrint object parameters : boolean, default=False If True, also update FlowPrint parameters from file Returns ------- result : dict of Fingerprint -> label Fingerprints imported from file. """ # Initialise fingerprints fingerprints = dict() # Loop over all files for file in files: # Open input file with open(file, 'r') as infile: # Load fingerprints data = json.load(infile) # Store parameters if necessary if parameters: self.batch = data.get('batch', self.batch) self.window = data.get('window', self.window) self.correlation = data.get('correlation', self.correlation) self.similarity = data.get('similarity', self.similarity) self.threshold = data.get('threshold', self.threshold) # Add fingerprints for fp, label in data.get('fingerprints'): # Transform json to Fingerprint fp = Fingerprint().from_dict(fp) # Get label label = fingerprints.get(fp, set()) | set([label]) # Set fingerprint fingerprints[fp] = label # Store fingerprints if necessary if store: for k, v in fingerprints.items(): self.fingerprints[k] = self.fingerprints.get(k, set()) | v # Return fingerprints return fingerprints
def check(origin, plagiarized):
    with open(origin, "r") as file:
        origin = file.read()
    with open(plagiarized, "r") as file:
        plagiarism = file.read()

    text_length = min(len(origin.split()), len(plagiarism.split()))
    if text_length < 60:
        raise NotImplementedError("Compare texts with at least 60 words.")

    window = max(text_length // 21, 3)
    kgram = window - 1
    base = 11 if text_length < 250 else 23 if text_length < 600 else 101
    modulo = max(round(text_length * 5, -3), 1000)

    fprint = Fingerprint(kgram_len=kgram, window_len=window, base=base,
                         modulo=modulo)
    first = fprint.generate(str=origin)
    second = fprint.generate(str=plagiarism)

    similar = [x for x in first if x in second]
    similar_grams = Counter([
        element[0]
        for element in first
        for sec in second
        if sec[0] == element[0]
    ])

    print("Identical substring hashes:")
    pprint(similar)
    print("\nIdentical grams:")
    pprint(similar_grams)
def carrega_txt():
    f = Fingerprint()
    file = open("teste_1.txt", 'r')
    list_content = file.read().strip().split("|")
    list_valid = []
    for item in list_content:
        if item.strip():
            try:
                list_valid.append(int(item))
            except ValueError:
                pass
    print(list_valid)
    f.uploadCharacteristics(0x01, list_valid)
    f.uploadCharacteristics(0x02, list_valid)
    print(f.getTemplateCount())
    print("Create Template -> " + str(f.createTemplate()))
    print("Store Template -> " + str(f.storeTemplate()))
    print(f.getTemplateCount())
def get_identity():
    window = Window("Roblox", "https://www.roblox.com/account/signupredir")
    fp = Fingerprint(
        user_agent=user_agent,
        protochain_hash="5d76839801bc5904a4f12f1731a7b6d1",
        sec_fetch=True,
        content_type_value="application/x-www-form-urlencoded; charset=UTF-8",
        accept_language_value="en-US,en;q=0.9",
        jsbd_gen=lambda w: dict(HL=random.randint(1, 5),
                                NCE=True,
                                DT=w.title,
                                NWD="undefined",
                                DA=None,
                                DR=None,
                                DMT=random.randint(1, 40),
                                DO=None,
                                DOT=random.randint(30, 50)),
        DNT="unknown",
        L="en-US",
        D=24,
        PR=1,
        S="1920,1080",
        AS="1920,1040",
        SS=True,
        LS=True,
        IDB=True,
        B=False,
        ODB=True,
        CPUC="unknown",
        PK="Win32",
        JSF="Arial,Arial Black,Arial Narrow,Book Antiqua,Bookman Old Style,Calibri,Cambria,Cambria Math,Century,Century Gothic,Century Schoolbook,Comic Sans MS,Consolas,Courier,Courier New,Garamond,Georgia,Helvetica,Impact,Lucida Bright,Lucida Calligraphy,Lucida Console,Lucida Fax,Lucida Handwriting,Lucida Sans,Lucida Sans Typewriter,Lucida Sans Unicode,Microsoft Sans Serif,Monotype Corsiva,MS Gothic,MS PGothic,MS Reference Sans Serif,MS Sans Serif,MS Serif,Palatino Linotype,Segoe Print,Segoe Script,Segoe UI,Segoe UI Light,Segoe UI Semibold,Segoe UI Symbol,Tahoma,Times,Times New Roman,Trebuchet MS,Verdana,Wingdings,Wingdings 2,Wingdings 3",
        P="Chrome PDF Plugin,Chrome PDF Viewer,Native Client",
        T="0,false,false",
        H="8",
        SWF=False)
    return fp, window
class Worker(object): def __init__(self, db): self.fgp_db = db self.fgp_api = Fingerprint() def mic_recognize(self, limit=None): if limit is None: limit = 10 print('Microphone listening for: {} seconds'.format(limit)) self.mic = AudioHelper() result = set() mic_data = self.mic.recognize(limit=limit) for num_channels, channel in enumerate(mic_data): hashes = self.fgp_api.fingerprint(channel, frame_rate=self.mic.samplerate, verbose=True, plot=True) result |= set(hashes) return result def fingerprint_worker(self, file_path, limit=None, grid_only=False, verbose=False, plot=False): #st = time.time() song_name, extension = os.path.splitext(file_path) # print('Fingerprinting: ', song_name, '\nFile extension: ', extension) # using different extraction method for mp3 if extension is '.mp3' or '.mpeg': # print(file_path) num_channels, frame_rate, audio_data = hlp.retrieve_audio_mpeg( file_path, limit) else: num_channels, frame_rate, audio_data = hlp.retrieve_audio( file_path, limit) #print('from fingerprint worker\n frame rate {}, data {}'.format(frame_rate, channels)) result = set() for num_channels, channel in enumerate(audio_data): # print('Channel number:', num_channels+1) hashes = self.fgp_api.fingerprint(channel, frame_rate=frame_rate, verbose=verbose, plot=plot) if grid_only: return self.fgp_api.fingerprint(channel, frame_rate=frame_rate, grid_only=grid_only, plot=plot) result |= set(hashes) #ft = time.time() - st #print('Elapsed fingerprinting time: ', ft) #print('Generated {} hashes'.format(len(result))) return song_name, result def insert_wav_to_db(self, song_n): #db.connect() song_name, list_hash = self.fingerprint_worker(song_n, limit=None) print('Song name: ', song_name) print('Number of generated hashes: ', len(list_hash)) self.fgp_db.insert_song(song_name, 1) for h in list_hash: self.fgp_db.insert_fingerprint(h[0], song_name, h[1]) def get_max_track_frequency(self, list_tracks): """Interates through a list of tuples (track, frequency of track) and returns the maximum value""" max_t_frequ = 0 for t in list_tracks.keys(): if list_tracks[t] > max_t_frequ: max_t_frequ = list_tracks[t] return max_t_frequ def align_matches_weighted(self, list_matches): candidates = dict() for tup in list_matches: track_name, time_delta = tup if time_delta not in candidates: candidates[time_delta] = dict() if track_name not in candidates[time_delta]: candidates[time_delta][track_name] = 1 else: candidates[time_delta][track_name] += 1 weighted_candidates = [] # each candidate is a tuple of (weight, (k,v)) # default weight = 1 # formula = (e ^ -(|time_delta|)) + max time delta value over a candidate list for k, v in candidates.items(): cand_weight = float(math.e**(-abs(k))) * 1000 max_t_freq = self.get_max_track_frequency(v) cand_tup = (cand_weight + max_t_freq, k, v) weighted_candidates.append(cand_tup) weighted_candidates = sorted(weighted_candidates, key=lambda weight: weight[0]) res = [elem for elem in weighted_candidates if elem[0] > 100.0] # escape case where list of candidates is empty if len(res) == 0: return { 'song id': 0, 'song name': 'No results found', 'is fingerprinted': 0 }, candidates, res prime_candidate = res[-1] prime_weight = prime_candidate[0] max_count = 0 query_track = '' # query the track with most hits for k, v in prime_candidate[2].items(): if v > max_count: max_count = v query_track = k query_hit, id, name, is_fng = self.fgp_db.get_song_by_name(query_track) # cut-off weight for candidates CUT_OFF_WEIGHT_1 = 368.87944117144235 CUT_OFF_WEIGHT_2 = 1010 if prime_weight <= 
CUT_OFF_WEIGHT_2 and max_count <= 10: track = { 'song id': 0, 'song name': 'No results found', 'is fingerprinted': 0, } return track, candidates, res track = { 'song id': id, 'song name': name, 'is fingerprinted': int(is_fng), } return track, candidates, res def fingerprint_songs(self, user_path='', num_tracks=None): dir_structure = self.build_dir_map(user_path) # get fingerprinted files number_fgp, already_fingerprinted = self.get_wavs_by_fgp(1) #print(already_fingerprinted) #print('Number of fingerprints=', number_fgp) song_counter = 0 # go through each file in the directory for file in dir_structure.keys(): # don't re-fingerprint files if file in already_fingerprinted: print('Skipping: {}'.format(file)) continue if song_counter == num_tracks: print('Added {} tracks to database.'.format(song_counter)) self.fgp_db.connection.close() return # path of dir + actual file path = dir_structure[file] + '\\' + file # avoid invalid extensions _pth, ext = os.path.splitext(path) if ext not in VALID_EXT: continue # insert song returns true if it managed, false otherwise res = self.fgp_db.insert_song(file, 1) if res: song_counter += 1 # generate and insert hashes _, list_hashes = self.fingerprint_worker(path) formatted_list = [] for h in list_hashes: # db.insert_fingerprint(h[0], file, h[1]) formatted_list.append((h[0], file, h[1])) res = self.fgp_db.dump_fingerprints(formatted_list) # stop everything in case of failure if not res: self.fgp_db.delete_songs([file]) print('Fingerprinting failed for: {}'.format([file])) return else: print('Fingerprinting skipped') continue print('Number of wavs: ', song_counter) def get_wavs_by_fgp(self, is_fgp=0): res = list(self.fgp_db.get_songs_by_fgp_status(is_fgp)) clean_list = [] for elem in res: temp = str(elem)[2:-3] clean_list.append(temp) # print(clean_list) number_of_tracks = len(clean_list) return number_of_tracks, clean_list ###################################################################### # # GRIDHASH ALGORITHM # ###################################################################### ##### DIRECTORY STRUCTURE METHODS ##### def _get_dir_structure(self, dir_path): """Returns all files from a specified directory""" files = [] for (dirpath, dirname, filenames) in os.walk(dir_path): files.append([dirpath, filenames]) return files def has_valid_extension(self, path_to_file): """Checks if file extension is valid Valid extensions: '.wav', '.ogg', '.mp3', '.flac', '.grid', '.mpeg' """ path, ext = os.path.splitext(path_to_file) if ext in VALID_EXT: return True return False def build_dir_map(self, root): """creates a dictionary directory structure. It maps files to their relative path. 
file.wav -> c//dir/dir2/dir_with_wavs Attributes: root - where to start looking Return: map - dictionary structure """ dir_struct = self._get_dir_structure(root) map = dict() for tup in dir_struct: current_directory = tup[0] files_in_dir = tup[1] for f in files_in_dir: path = os.path.join(current_directory, f) # add key if not already in dict and if file has a valid extension if f not in map and self.has_valid_extension(path): map[f] = current_directory return map ##### IO METHODS ##### def export_file(self, file_name, data, dest_dir=''): """Stores gridHash file to specified location Attributes: file_name - name of file data - information to package to the file dest_dir - file path """ name = file_name[:-4] + CUSTOM_EXT path = os.path.join(dest_dir, name) with open(path, mode='wb') as f: try: min_data = self.get_minHash(data) pickle.dump(min_data, f) f.close() print('Exported: {}'.format(name)) return True except: print('Export failed: {}'.format(name)) return False def load_grid(self, file_name, local_dir=''): """Loads gridHash file from specified location. Attributes: file_name - name of file to load local_dir - load path Return: data - retrieved information """ path = os.path.join(local_dir, file_name) filename, ext = os.path.splitext(path) if ext != CUSTOM_EXT: path = path[:-len(ext)] + CUSTOM_EXT with open(path, 'rb') as f: data = pickle.load(f) return data ##### minHash generators ###### def get_minHash(self, input_set): """Generates minHash object from input set Attributes: input_set - list of strings to minHash Returns: minHash object """ min_h = MinHash() for itm in input_set: min_h.update(itm.encode('utf8')) return min_h def export_many(self, files_in, files_out, limit=0): """Exports multiple gridHash objects""" # initialize counter for files to be indexed counter = 0 # build directory maps dir_map = self.build_dir_map(files_in) indexed = self.build_dir_map(files_out) # if no number of files is specified, process all files if limit == 0: limit = len(dir_map.keys()) print( 'Info:\n', 'There are {} available audio files.\n'.format( len(dir_map.keys())), 'There are {} available gridHash files.\n'.format( len(indexed.keys()))) # go file by file for tr in dir_map.keys(): if counter < limit: # check if the file has not already been exported pre = tr[:-4] + CUSTOM_EXT if pre not in indexed.keys(): _path = os.path.join(dir_map[tr], tr) # ensure a valid extension if self.has_valid_extension(_path): set_data = self.fingerprint_worker(_path, grid_only=True, plot=False) #print(tr, set_data) # generate gridhash res = self.export_file(tr, set_data, dest_dir=files_out) if res: counter += 1 else: return else: print('Skipping: {} file already exists'.format(tr)) print('Exported {} grids'.format(counter)) def compute_jaccard(self, s1, s2, grid_folder): """Computes jaccard distance between two gridHash files""" dir_map = self.build_dir_map(grid_folder) c1 = None c2 = None for itm in dir_map.keys(): if itm == s1: c1 = self.load_grid(itm, local_dir=grid_folder) if itm == s2: c2 = self.load_grid(itm, local_dir=grid_folder) sim = c1.jaccard(c2) return sim
from fingerprint import Fingerprint

fp = Fingerprint()
fp.clear_database()
def assign_nearest(self, X, y): """Set unassigned labels to that of nearest neighbours. Parameters ---------- X : np.array of shape=(n_flows,) Array of original flows. y : np.array of shape=(n_flows,) and dtype=int Array of fingerprints. Returns ------- result : np.array of shape=(n_flows,) and dtype=int Array of Fingerprints. Without any -1 labels. """ #################################################################### # Sort flows and fingerprints by timestamp # #################################################################### # Sort flows by time sort_time = np.argsort(X) sort_orig = np.argsort(sort_time) # Sort by time X = X[sort_time] y = y[sort_time] # Get timestamps timestamps = np.asarray([x.time_start for x in X]) #################################################################### # Assign closest fingerprints in time # #################################################################### # Get blocks of unassigned fingerprint indices blocks = list() block = list() for i, fingerprint in enumerate(y): if fingerprint and block: blocks.append(np.asarray(block)) block = list() elif not fingerprint: block.append(i) if block: blocks.append(np.asarray(block)) # For each block of unassigned fingerprints compute new labels for block in blocks: # Get indices before and after block before = min(block) - 1 after = max(block) + 1 # Get timestamps before and after block ts_before = X[before].time_start if before >= 0 else float('inf') ts_after = X[after].time_start if after < X.shape[0] else float( 'inf') # Get fingerprints before and after block fp_before = y[before] if before >= 0 else Fingerprint() fp_after = y[after] if after < X.shape[0] else Fingerprint() # Assign new fingerprints per block block_before = abs(timestamps[block] - ts_before) <\ abs(timestamps[block] - ts_after ) y[block[block_before]] = fp_before y[block[~block_before]] = fp_after # Return fingerprints in original order return y[sort_orig]
def __init__(self, db):
    self.fgp_db = db
    self.fgp_api = Fingerprint()
# -*- coding: utf-8 -*-
"""The main module for finding the similarity ratio between two strings."""
from fingerprint import Fingerprint
from fingerprint.fingerprint import FingerprintException

FINGERPRINT = Fingerprint(kgram_len=4, window_len=3, base=101, modulo=256)


def find_similarity_ratio(f_string: str, s_string: str) -> float:
    """
    Take two strings and find the similarity between them using
    Rabin fingerprinting and winnowing (Stanford).

    Args:
        `f_string`: first string.
        `s_string`: second string.

    Returns:
        `float`: the similarity ratio between the two strings.
    """
    try:
        f_string_fingerprint = FINGERPRINT.generate(str=f_string)
        s_string_fingerprint = FINGERPRINT.generate(str=s_string)
    except (FingerprintException, IndexError):
        return 0

    f_string_only_hashes = [element[0] for element in f_string_fingerprint]
    s_string_only_hashes = [element[0] for element in s_string_fingerprint]

    common_hashes = set(f_string_only_hashes).intersection(
        set(s_string_only_hashes))
    minimal_length_of_string_hashes = len(
        min(f_string_only_hashes, s_string_only_hashes, key=len))
    # Similarity: share of common hashes relative to the shorter hash list.
    return len(common_hashes) / minimal_length_of_string_hashes
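A short usage sketch, assuming the ratio completed above (common hashes over the shorter hash list); the strings are illustrative values, not test data.

if __name__ == "__main__":
    # Illustrative inputs; any two reasonably long strings work.
    first = "the quick brown fox jumps over the lazy dog"
    second = "the quick brown fox leaps over a lazy dog"
    # A float in [0, 1]; higher means more shared winnowed hashes.
    print(find_similarity_ratio(first, second))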
def _fit_single_batch_(self, X, y=None): """Create fingerprints for a given batch of flows. Parameters ---------- X : array-like of shape=(n_samples_batch,) Samples (Flow objects) from which to generate fingerprints. y : array-like of shape=(n_samples_batch,), optional Labels corresponding to X. If given, they will be encorporated into each fingerprint. Returns ------- np.array of shape=(n_samples,) Resulting fingerprints corresponding to each flow. """ #################################################################### # Create fingerprints # #################################################################### # Create clustering instance cluster = Cluster() # Cluster flows into network destinations cluster.fit(X, y) # Find cliques in clusters cliques = CrossCorrelationGraph( window=self.window, # Set window size correlation=self.correlation # Set correlation threshold ).fit_predict(cluster) # Get cliques # Transform cliques to fingerprints fingerprints = list( Fingerprint(c) # Cast to fingerprint for c in cliques if len(c) > 1 # Only select cliques > 1 ) #################################################################### # Assign fingerprints per flow # #################################################################### # Get network destination per flow destinations = cluster.predict(X) # Get destination id per flow translation = cluster.cluster_dict() # Get destinations for each id destinations = [translation.get(d) for d in destinations] # Get fingerprint per network destination mapping_fingerprints = dict() # Map destination to largest fingerprint by (#destinations, #flows) for fingerprint in sorted(fingerprints): for destination in fingerprint: mapping_fingerprints[destination] = fingerprint # Apply mapping prediction = np.array([ mapping_fingerprints.get( x.destination, mapping_fingerprints.get(x.certificate, Fingerprint())) for x in X ]) #################################################################### # Handle unknown and similar fingerprints # #################################################################### # For unknown results assign nearest neighbour prediction = self.assign_nearest(X, prediction) # Merge similar fingerprints prediction = self.merge_fingerprints(prediction, self.similarity) # Return prediction return prediction
def getRes(b1, b2, cursor, quiet, attrs="hashes, langs", extra_selector="",
           fp_type=Fingerprint_Type.CROSS):
    if not quiet:
        print 'extra_selector="{}"'.format(extra_selector)
    global mask
    tuids = []
    uids = []
    cursor.execute("SELECT COUNT(DISTINCT(ip)) FROM {}".format(table_name))
    if not quiet:
        print 'ip', cursor.fetchone()[0]
    cursor.execute(
        "SELECT COUNT(DISTINCT(user_id)) FROM {}".format(table_name))
    if not quiet:
        print 'user', cursor.fetchone()[0]
    cursor.execute("SELECT user_id FROM {} WHERE browser='{}' {}".format(
        table_name, b1, extra_selector))
    for uid, in cursor.fetchall():
        tuids.append(uid)
    if not quiet:
        print b1, len(tuids)
    for uid in tuids:
        cursor.execute(
            "SELECT user_id FROM {} WHERE user_id='{}' AND browser='{}' {}".
            format(table_name, uid, b2, extra_selector))
        for uid, in cursor.fetchall():
            uids.append(uid)
    if not quiet:
        print b1, 'and', b2, len(uids)
    if len(uids) == 0:
        return None

    # uids is the list of users that use both b1 and b2
    hash_all = {}
    hash_long = []
    fp_to_count = {}
    hash_all_unique = {}
    stability = {}
    diff = {}
    index = []
    uid_stability = {}
    for uid in uids:
        cursor.execute(
            "SELECT image_id FROM {} WHERE browser='{}' AND user_id='{}'".
            format(table_name, b1, uid))
        image1_id = cursor.fetchone()[0]
        cursor.execute(
            "SELECT image_id FROM {} WHERE browser='{}' AND user_id='{}'".
            format(table_name, b2, uid))
        image2_id = cursor.fetchone()[0]
        fp_1 = Fingerprint(cursor, image1_id, table_name, fp_type, attrs)
        fp_2 = Fingerprint(cursor, image2_id, table_name, fp_type, attrs)
        try:
            if quiet:
                # Unpacking None raises here, which skips the per-feature
                # stability bookkeeping below when running in quiet mode.
                _, opps = None
            cursor.execute("SELECT fonts FROM {} WHERE image_id='{}'".format(
                table_name, image1_id))
            hashes_1 = list(cursor.fetchone()[0])
            cursor.execute("SELECT fonts FROM {} WHERE image_id='{}'".format(
                table_name, image2_id))
            hashes_2 = list(cursor.fetchone()[0])
            if mask is None:
                mask = [1 for _ in range(len(hashes_1))]
            if len(hashes_1) == len(hashes_2):
                s1 = ""
                s2 = ""
                uid_stability.update({uid: []})
                for i in range(len(hashes_1)):
                    if i not in hash_all:
                        hash_all.update({i: []})
                    if i not in hash_all_unique:
                        hash_all_unique.update({i: Set()})
                    if i not in diff:
                        diff.update({i: 0.0})
                    hash1_val = hashes_1[i]
                    hash2_val = hashes_2[i]
                    s1 += hash1_val
                    s2 += hash2_val
                    if hash1_val == hash2_val:
                        hash_all[i].append(hash1_val)
                        hash_all_unique[i].add(hash1_val)
                    else:
                        diff[i] += 1.0 / len(uids)
                        uid_stability[uid].append([hash1_val, hash2_val])
        except:
            pass
        if fp_1 == fp_2:
            hash_long.append(fp_1)
            index.append(uid)
            if fp_1 in fp_to_count:
                fp_to_count[fp_1] += 1
            else:
                fp_to_count.update({fp_1: 1})

    # Mask out any feature that differed between the two browsers
    for i, d in diff.items():
        if d > 0.0:
            mask[i] = 0

    num_distinct = float(len(fp_to_count))
    num_unique = 0.0
    for _, count in fp_to_count.items():
        if count == 1:
            num_unique += 1.0
    num_cross_browser = float(len(hash_long))
    num_uids = float(len(uids))
    if not quiet:
        for i, d in diff.items():
            print "{}: instability: {}".format(i, d)
        for u, s in uid_stability.items():
            print "{}: {}".format(u, s)
        print 'Cross_browser', num_cross_browser
        print 'Cross_browser rate', num_cross_browser / num_uids
        print 'Cross_browser unique', num_unique / num_distinct
        print num_unique, num_distinct
    return int(num_uids), "{:3.1f}%".format(
        num_cross_browser / num_uids * 100), "{:3.1f}%".format(
            num_unique / num_distinct * 100)
print("Create Template -> " + str(f.createTemplate())) print("Store Template -> " + str(f.storeTemplate())) print(f.getTemplateCount()) def limpa_db(self): f = Fingerprint() print("Depois " + str(f.getTemplateCount())) f.limpa_bd() print("Antes " + str(f.getTemplateCount())) def enroll(self): pass f = Fingerprint() resposta = int( input( "1 - Registra_digital\n2 - Passa digital\n3 - Limpa bd\n4 - Dump API")) if (resposta == 1): f.registra_digital() elif (resposta == 2): f.valida_digital() elif (resposta == 3): f.limpa_bd() elif (resposta == 4): f.dump_bd() else: print("dunga burro aperta direito")
def __getRes(self, b1, b2, cursor, quiet, rate, table_name, attrs="", extra_selector=""): if not quiet: print('extra_selector="{}"'.format(extra_selector)) tuids = [] uids = [] cursor.execute("SELECT user_id FROM {} WHERE browser='{}' {}".format( table_name, b1, extra_selector)) for uid, in cursor.fetchall(): tuids.append(uid) if not quiet: print(b1, len(tuids)) for uid in tuids: cursor.execute( "SELECT user_id FROM {} WHERE user_id='{}' AND browser='{}' {}" .format(table_name, uid, b2, extra_selector)) for uid, in cursor.fetchall(): uids.append(uid) if not quiet: print(b1, 'and', b2, len(uids)) #uids is the list of users uses both b1 and b2 hash_all = {} hash_long = [] fp_to_count = {} hash_all_unique = {} index = [] uid_stability = {} instability = {} mask = [1 for _ in range(28)] if len(uids) == 0: return 0, mask for uid in uids: cursor.execute( "SELECT image_id FROM {} WHERE browser='{}' AND user_id='{}'". format(table_name, b1, uid)) image1_id = cursor.fetchone()[0] cursor.execute( "SELECT image_id FROM {} WHERE browser='{}' AND user_id='{}'". format(table_name, b2, uid)) image2_id = cursor.fetchone()[0] try: # Feature to mask feature = "hashes" cursor.execute("SELECT {} FROM {} WHERE image_id='{}'".format( feature, table_name, image1_id)) hashes_1 = cursor.fetchone()[0].split("&")[:28] cursor.execute("SELECT {} FROM {} WHERE image_id='{}'".format( feature, table_name, image2_id)) hashes_2 = cursor.fetchone()[0].split("&")[:28] if len(hashes_1) == len(hashes_2): uid_stability.update({uid: []}) for i in range(len(hashes_1)): if i not in instability: instability.update({i: 0.0}) hash1_val = hashes_1[i] hash2_val = hashes_2[i] if hash1_val != hash2_val: instability[i] += 1.0 / len(uids) except: pass for index, i in instability.items(): if i > rate: mask[index] = 0 for uid in uids: cursor.execute( "SELECT image_id FROM {} WHERE browser='{}' AND user_id='{}'". format(table_name, b1, uid)) image1_id = cursor.fetchone()[0] cursor.execute( "SELECT image_id FROM {} WHERE browser='{}' AND user_id='{}'". format(table_name, b2, uid)) image2_id = cursor.fetchone()[0] fp_1 = Fingerprint(cursor, image1_id, table_name, Fingerprint_Type.CROSS, attrs, b2, mask) fp_2 = Fingerprint(cursor, image2_id, table_name, Fingerprint_Type.CROSS, attrs, b1, mask) if fp_1 == fp_2: hash_long.append(fp_1) if fp_1 in fp_to_count: fp_to_count[fp_1] += 1 else: fp_to_count.update({fp_1: 1}) num_distinct = max(float(len(fp_to_count)), 1.0) num_unique = 0.0 for _, count in fp_to_count.items(): if count == 1: num_unique += 1.0 num_cross_browser = max(float(len(hash_long)), 1.0) num_uids = max(float(len(uids)), 1.0) if not quiet: for i, d in instability.items(): print("{}: instability: {}".format(i, d)) print('Cross_browser', num_cross_browser) print('Cross_browser rate', num_cross_browser / num_uids) print('Cross_browser unique', num_unique / num_distinct) print(num_unique, num_distinct) return num_cross_browser / num_uids * num_unique / num_cross_browser * 100, mask
def fingerprint_function(url):
    f = Fingerprint(kgram_len=4, window_len=1, base=10, modulo=1000)
    return f.generate(str=url)
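A one-line usage sketch; the URL is an illustrative value, and the slicing assumes generate() returns a sequence of (hash, position) pairs, as it is used elsewhere in this section.

# Winnowed (hash, position) pairs for the URL string.
hashes = fingerprint_function("https://example.com/some/long/path")
print(list(hashes)[:5])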
def get_consistent_ids(cur): """ Returns a list of user ids having only consistent fingerprints """ batch_size = 5000 attributes = Fingerprint.INFO_ATTRIBUTES + Fingerprint.HTTP_ATTRIBUTES + \ Fingerprint.JAVASCRIPT_ATTRIBUTES + Fingerprint.FLASH_ATTRIBUTES counter_to_os = dict() counter_to_browser = dict() id_to_oses = dict() id_to_browsers = dict() id_to_nb_inconsistencies = dict() id_to_nb_fps = dict() cur.execute('SELECT max(counter) as nb_fps from extensionDataScheme') nb_fps = cur.fetchone()["nb_fps"] + 1 for i in range(0, nb_fps, batch_size): print(i) sql = "SELECT * FROM extensionDataScheme where counter < %s and counter > %s" cur.execute(sql, (i + batch_size, i)) fps = cur.fetchall() for fp_dict in fps: try: fp = Fingerprint(attributes, fp_dict) counter_to_os[fp.getCounter()] = fp.getOs() counter_to_browser[fp.getCounter()] = fp.getBrowser() counter = fp.getCounter() if fp.getId() in id_to_oses: id_to_oses[fp.getId()].add(fp.getOs()) else: id_to_oses[fp.getId()] = set() id_to_oses[fp.getId()].add(fp.getOs()) if fp.getId() in id_to_browsers: id_to_browsers[fp.getId()].add(fp.getBrowser()) else: id_to_browsers[fp.getId()] = set() id_to_browsers[fp.getId()].add(fp.getBrowser()) if len(id_to_browsers[fp.getId()]) > 1 or len( id_to_oses[fp.getId()]) > 1: id_to_nb_inconsistencies[fp.getId()] = 100000000 if counter_to_os[counter] == "Android" or counter_to_os[counter] == "iOS" or \ counter_to_os[counter] == "Windows Phone" or counter_to_os[counter] == "Firefox OS" or \ counter_to_os[counter] == "Windows 95": id_to_nb_inconsistencies[fp.getId()] = 10000000000 if counter_to_browser[counter] == "Safari" or counter_to_browser[counter] == "IE" or \ counter_to_browser[counter] == "Edge" or counter_to_browser[counter] == "Googlebot": id_to_nb_inconsistencies[fp.getId()] = 10000000 if fp.hasPlatformInconsistency(): if fp.getId() in id_to_nb_inconsistencies: id_to_nb_inconsistencies[fp.getId()] += 5 else: id_to_nb_inconsistencies[fp.getId()] = 5 if fp.getId() in id_to_nb_fps: id_to_nb_fps[fp.getId()] += 1 else: id_to_nb_fps[fp.getId()] = 1 # Seems weird but made on purpose ! if fp.getId() not in id_to_nb_inconsistencies: id_to_nb_inconsistencies[fp.getId()] = 0 except: id_to_nb_inconsistencies[fp_dict["id"]] = 1000000 user_id_consistent = [ x for x in id_to_nb_fps if float(id_to_nb_inconsistencies[x]) / float(id_to_nb_fps[x]) < 0.02 ] # we remove user that poison their canvas # we select users that changed canvas too frequently cur.execute( "SELECT id, count(distinct canvasJSHashed) as count, count(canvasJSHashed) as \ nb_fps FROM extensionDataScheme group by id having count(distinct canvasJSHashed)/count(canvasJSHashed) > 0.35 \ and count(canvasJSHashed) > 5 order by id") rows = cur.fetchall() poisoner_ids = [row["id"] for row in rows] user_id_consistent = [ user_id for user_id in user_id_consistent if user_id not in poisoner_ids ] return user_id_consistent
def getRes(b1, b2, cursor, quiet, attrs="hashes, langs", extra_selector="",
           fp_type=Fingerprint_Type.CROSS):
    if not quiet:
        print('extra_selector="{}"'.format(extra_selector))
    global mask
    global b_mask
    mask = None
    global instability
    tuids = []
    uids = []
    cursor.execute("SELECT COUNT(DISTINCT(ip)) FROM {}".format(table_name))
    if not quiet:
        print('ip', cursor.fetchone()[0])
    cursor.execute(
        "SELECT COUNT(DISTINCT(user_id)) FROM {}".format(table_name))
    if not quiet:
        print('user', cursor.fetchone()[0])
    cursor.execute("SELECT user_id FROM {} WHERE browser='{}' {}".format(
        table_name, b1, extra_selector))
    for uid, in cursor.fetchall():
        tuids.append(uid)
    if not quiet:
        print(b1, len(tuids))
    for uid in tuids:
        cursor.execute(
            "SELECT user_id FROM {} WHERE user_id='{}' AND browser='{}' {}".
            format(table_name, uid, b2, extra_selector))
        for uid, in cursor.fetchall():
            uids.append(uid)
    if not quiet:
        print(b1, 'and', b2, len(uids))
    if len(uids) == 0:
        return None

    # uids is the list of users that use both b1 and b2
    hash_all = {}
    hash_long = []
    fp_to_count = {}
    hash_all_unique = {}
    index = []
    uid_stability = {}
    instability = {}
    for uid in uids:
        cursor.execute(
            "SELECT image_id FROM {} WHERE browser='{}' AND user_id='{}'".
            format(table_name, b1, uid))
        image1_id = cursor.fetchone()[0]
        cursor.execute(
            "SELECT image_id FROM {} WHERE browser='{}' AND user_id='{}'".
            format(table_name, b2, uid))
        image2_id = cursor.fetchone()[0]
        fp_1 = Fingerprint(cursor, image1_id, table_name, fp_type, attrs, b2)
        fp_2 = Fingerprint(cursor, image2_id, table_name, fp_type, attrs, b1)
        try:
            # Feature to mask
            feature = "fonts"
            cursor.execute("SELECT {} FROM {} WHERE image_id='{}'".format(
                feature, table_name, image1_id))
            hashes_1 = cursor.fetchone()[0]
            cursor.execute("SELECT {} FROM {} WHERE image_id='{}'".format(
                feature, table_name, image2_id))
            hashes_2 = cursor.fetchone()[0]
            if mask is None:
                mask = [1 for _ in range(len(hashes_1))]
            if len(hashes_1) == len(hashes_2):
                s1 = ""
                s2 = ""
                uid_stability.update({uid: []})
                for i in range(len(hashes_1)):
                    if i not in hash_all:
                        hash_all.update({i: []})
                    if i not in hash_all_unique:
                        hash_all_unique.update({i: Set()})
                    if i not in instability:
                        instability.update({i: 0.0})
                    hash1_val = hashes_1[i]
                    hash2_val = hashes_2[i]
                    s1 += hash1_val
                    s2 += hash2_val
                    if hash1_val == hash2_val:
                        hash_all[i].append(hash1_val)
                        hash_all_unique[i].add(hash1_val)
                    else:
                        instability[i] += 1.0 / len(uids)
                        uid_stability[uid].append([hash1_val, hash2_val])
        except:
            pass
        if fp_1 == fp_2:
            hash_long.append(fp_1)
            index.append(uid)
            if fp_1 in fp_to_count:
                fp_to_count[fp_1] += 1
            else:
                fp_to_count.update({fp_1: 1})

    print('hashall:' + str(len(hash_all)))
    # Mask out features whose instability exceeds the threshold
    for index, i in instability.items():
        if i > 0.001:
            mask[index] = 0

    num_distinct = max(float(len(fp_to_count)), 1.0)
    num_unique = 0.0
    for _, count in fp_to_count.items():
        if count == 1:
            num_unique += 1.0
    num_cross_browser = float(len(hash_long))
    num_uids = max(float(len(uids)), 1.0)
    if not quiet:
        for i, d in instability.items():
            print("{}: instability: {}".format(i, d))
        print('Cross_browser', num_cross_browser)
        print('Cross_browser rate', num_cross_browser / num_uids)
        print('Cross_browser unique', num_unique / num_distinct)
        print(num_unique, num_distinct)
    return int(num_uids), "{:3.1f}%".format(
        num_cross_browser / num_uids * 100), "{:3.1f}%".format(
            num_unique / num_distinct * 100)
def limpa_db(self):
    f = Fingerprint()
    print("Depois " + str(f.getTemplateCount()))
    f.limpa_bd()
    print("Antes " + str(f.getTemplateCount()))
def get_fingerprint(self, fingerprint_id):
    return Fingerprint(
        self.collection.find({"_id": ObjectId(fingerprint_id)})[0])
def on_message(client, userdata, message):
    # MQTT callback: dispatch on topic (header restored from the
    # client.on_message assignment below).
    global search_thread
    if message.topic == "enroll/begin":
        fp.abort = True
        data = json.loads(message.payload)
        search_thread.join()
        fp.enroll(data['identificacion'])
        if fp.abort:
            client.publish("enroll/abort", "")
        client.publish("search/finished", "")
        fp.abort = False
    if message.topic == "delete":
        fp.abort = True
        search_thread.join()
        fp.delete(message.payload)
        client.publish("search/finished", "")
        fp.abort = False
    if message.topic == "search/finished":
        search_thread = threading.Thread(target=fp.search)
        search_thread.start()


fp = Fingerprint()
client = paho.Client("routine")
client.connect("localhost")
client.on_message = on_message
client.subscribe("search/finished")
client.subscribe("enroll/begin")
client.subscribe("delete")
client.loop_forever()