def gen_new_matrix_of_users(dictio_of_skypes, dictio_of_users, dictio_of_values):
    """Materialize a (num_users x num_skypes) uint8 matrix on disk.

    Each row is one user; each column is one skype handle, holding the
    value from dictio_of_values for the skypes that user mentions.

    Side effect: the values of dictio_of_skypes are overwritten in place
    with the column index assigned to each skype.
    """
    num_users = len(dictio_of_users)
    num_skypes = len(dictio_of_skypes)
    # Assign every skype a column index (mutates the caller's dict).
    for col, skype_id in enumerate(dictio_of_skypes.keys()):
        dictio_of_skypes[skype_id] = col
    print("ESTIMATED SIZE OF MATRIX: %f GB" % (num_users * num_skypes * 1 / (1024**3)))
    matrix_map = np.memmap('skype_files/skype_matrix_map.dat', dtype=np.uint8,
                           mode='w+', shape=(num_users, num_skypes))
    status.create_numbar(100, num_users)
    for row, user in enumerate(dictio_of_users.keys()):
        status.update_numbar(row, num_users)
        vec = np.zeros((num_skypes, 1), dtype=np.uint8)
        for skype_id in dictio_of_users[user]:
            vec[dictio_of_skypes[skype_id]] = dictio_of_values[skype_id]
        matrix_map[row] = np.squeeze(vec)[:]
    status.end_numbar()
    print("Flushing...")
    matrix_map.flush()
def process_skypes_distance(dictio_of_users, dictio_of_skypes):
    """Fill the upper triangle of skype_dis_score.dat with the pairwise
    dot products of user skype vectors read from skype_matrix_map.dat."""
    tic = time.time()
    num_users = len(dictio_of_users)
    num_skypes = len(dictio_of_skypes)
    matrix_map = np.memmap('skype_files/skype_matrix_map.dat', dtype=np.uint8,
                           shape=(num_users, num_skypes))
    #matrix_map = np.array(matrix_map)
    print(matrix_map.shape)
    print("ESTIMATED SIZE OF MATRIX: %f GB" % (num_users * num_users * 4 / (1024**3)))
    status.create_numbar(100, num_users)
    score_map = np.memmap('skype_files/skype_dis_score.dat', dtype=np.uint32,
                          mode='w+', shape=(num_users, num_users))
    for a in range(num_users):
        status.update_numbar(a, num_users)
        # Widen to uint32 before the dot product so it cannot overflow uint8.
        vec_a = np.array(matrix_map[a], dtype=np.uint32)
        for b in range(a + 1, num_users):
            vec_b = np.array(matrix_map[b], dtype=np.uint32)
            score_map[a][b] = np.dot(vec_a, vec_b)
    status.end_numbar()
    print("Flushing...")
    score_map.flush()
    print("SECONDS: %f" % (time.time() - tic))
def gen_vector_for_pairs(word_dictio, ind_features, weights):
    """Build the Keras input matrix: one row per user pair.

    Row layout: [u1 word weights | u2 word weights | pair score], where
    each half has len(word_dictio) slots indexed via word_dictio.

    Bug fix: the second half of the vector was filled from u1's features
    again (copy-paste error) instead of u2's; it now reads ind_features[u2].
    """
    words_per_user = len(word_dictio)
    num_words = words_per_user * 2 + 1  # two users + one slot for the pair score
    num_pairs = len(weights)
    print("ESTIMATED SIZE OF MATRIX: %f GB" % (num_words * num_pairs * 4 / (1024 ** 3)))
    matrix_map = np.memmap('input_keras.dat', dtype=np.single, mode='w+',
                           shape=(num_pairs, num_words))
    status.create_numbar(100, num_pairs)
    for i in range(0, num_pairs):
        status.update_numbar(i, num_pairs)
        v = np.zeros((num_words))
        u1, u2 = weights[i][0], weights[i][1]
        # First half: u1's (word, weight) pairs; pairs start at index 3
        # of the feature row and alternate word/weight.
        for j in range(len(ind_features[u1][4::2])):
            a = ind_features[u1][2 * j + 3]  # word
            b = ind_features[u1][2 * j + 4]  # weight
            word_index = int(word_dictio[a])
            v[word_index] = b
        # Second half: same layout, but for u2 (previously read u1 by mistake).
        for j in range(len(ind_features[u2][4::2])):
            a = ind_features[u2][2 * j + 3]
            b = ind_features[u2][2 * j + 4]
            word_index = int(word_dictio[a])
            v[words_per_user + word_index] = b
        v[num_words - 1] = weights[i][2]  # pair score in the last slot
        print(v.shape, matrix_map.shape, matrix_map[i].shape)
        matrix_map[i] = v[:]
    status.end_numbar()
def join_all_results(self):
    """Merge every (filename, prefix) result CSV from self.list_files into
    one nested dict: {user_a: {user_b: {prefix: score}}}."""
    dictio_of_results = dict()
    tic = time.time()
    toc = time.time()
    for indi, tup in enumerate(self.list_files):
        filename, prefix = tup[0], tup[1]
        print("[-] Going for file: %d - %s" % (indi, filename))
        # Drop the CSV header row, then sort by concatenated user ids.
        lst_results = read_csv_list(filename)[1:]
        filelen = len(lst_results)
        print("[+] Sorting list")
        lst_results = sorted(lst_results, key=lambda x: x[0] + x[1], reverse=False)
        status.create_numbar(100, filelen)
        for indj, entry in enumerate(lst_results):
            status.update_numbar(indj, filelen)
            inner = dictio_of_results.setdefault(entry[0], dict())
            inner.setdefault(entry[1], dict())[prefix] = float(entry[2])
        status.end_numbar()
        print("[+] Ended with file: %d - %s in %d seconds" % (indi, filename, time.time() - tic))
    return dictio_of_results
def get_max():
    """Scan the global dictio_of_results and return the largest score
    together with its (k1, k2, k3) key path."""
    maximum = 0
    k1s = k2s = k3s = None
    total = len(dictio_of_results)
    status.create_numbar(100, total)
    for indi, (k1, per_user) in enumerate(dictio_of_results.items()):
        status.update_numbar(indi, total)
        for k2, per_metric in per_user.items():
            for k3, value in per_metric.items():
                if value > maximum:
                    maximum = value
                    k1s, k2s, k3s = k1, k2, k3
    status.end_numbar()
    return maximum, k1s, k2s, k3s
def get_valid_btc_addresses(dictio_of_btcs):
    """Probe every BTC address via get_first_seen() and write a
    (address, valid-flag) CSV to btc_files/valid_btcs.csv.

    Valid = 1 when get_first_seen returns a value, 0 when it returns None.
    Cleanup: removed the dead locals `tic` and `num_requests`, which were
    assigned but never used.
    """
    results = []
    total = len(dictio_of_btcs)
    status.create_numbar(100, total)
    for indi, addr in enumerate(dictio_of_btcs.keys()):
        status.update_numbar(indi, total)
        appears = 0 if get_first_seen(addr) is None else 1
        results.append((addr, appears))
        time.sleep(1)  # crude rate limit for the remote lookup
    status.end_numbar()
    gen_csv_from_tuples("btc_files/valid_btcs.csv", ["BTC", "Valid"], results)
def gen_dictio_from_csv():
    """Load combination.csv into the global dictio_of_results as
    {user_a: {user_b: {metric_header: value}}} and return the data rows."""
    global dictio_of_results
    raw = read_csv_list("combination.csv")
    headers, rows = raw[0][2:], raw[1:]
    total = len(rows)
    status.create_numbar(100, total)
    for indi, row in enumerate(rows):
        status.update_numbar(indi, total)
        inner = dictio_of_results.setdefault(row[0], {}).setdefault(row[1], {})
        # Columns after the two user ids line up with the metric headers.
        for indj, value in enumerate(row[2:]):
            inner[headers[indj]] = value
    status.end_numbar()
    return rows
def new_read_results_euc(dictio_of_users, dictio_of_skypes):
    """Export skype pair scores to CSV: dot-product scores (descending),
    then the matching euclidean scores for the same pairs (ascending)."""
    print("New user extraction...")
    tic = time.time()
    num_users = len(dictio_of_users)
    lst_users = list(dictio_of_users.keys())
    pairs = []
    dis_map = np.memmap('skype_files/skype_dis_score.dat', dtype=np.uint32,
                        mode='r', shape=(num_users, num_users))
    status.create_numbar(100, num_users)
    for a in range(num_users):
        status.update_numbar(a, num_users)
        for b in range(a + 1, num_users):
            score = dis_map[a][b]
            if score > 0:
                pairs.append((lst_users[a], lst_users[b], score))
    status.end_numbar()
    print("SECONDS: %f" % (time.time() - tic))
    print("Old length: %d" % (len(pairs)))
    # All appended scores are already > 0, so this filter is a no-op kept
    # to preserve the original's printed lengths.
    pairs = [x for x in pairs if x[2] > 0]
    print("New length: %d" % (len(pairs)))
    sortedl = sorted(pairs, key=lambda x: x[2], reverse=True)
    print("SECONDS: %f" % (time.time() - tic))
    gen_csv_from_tuples("skype_files/new_results_skype_dis.csv",
                        ["IdAuthor1", "IdAuthor2", "Score"], sortedl)
    pairs = []
    euc_map = np.memmap('skype_files/skype_euc_score.dat', dtype=float,
                        mode='r', shape=(num_users, num_users))
    dic_inv = {user: indi for indi, user in enumerate(lst_users)}
    status.create_numbar(100, len(sortedl))
    for indi, row in enumerate(sortedl):
        status.update_numbar(indi, len(sortedl))
        u1, u2 = row[0], row[1]
        pairs.append((u1, u2, euc_map[dic_inv[u1]][dic_inv[u2]]))
    status.end_numbar()
    print("SECONDS: %f" % (time.time() - tic))
    sortedl = sorted(pairs, key=lambda x: x[2], reverse=False)
    print(sortedl[:10])
    print("SECONDS: %f" % (time.time() - tic))
    gen_csv_from_tuples("skype_files/new_results_skype_euc.csv",
                        ["IdAuthor1", "IdAuthor2", "Score"], sortedl)
def read_results_euc(dictio_of_users, dictio_of_links):
    """Dump every pairwise link euclidean score, sorted ascending, to
    link_files/results_link_euc.csv."""
    tic = time.time()
    num_users = len(dictio_of_users)
    lst_users = list(dictio_of_users.keys())
    euc_map = np.memmap('link_files/link_euc_score.dat', dtype=float,
                        mode='r', shape=(num_users, num_users))
    pairs = []
    status.create_numbar(100, num_users)
    for a in range(num_users):
        status.update_numbar(a, num_users)
        # Only the upper triangle was written, so only read a < b.
        for b in range(a + 1, num_users):
            pairs.append((lst_users[a], lst_users[b], euc_map[a][b]))
    status.end_numbar()
    print("SECONDS: %f" % (time.time() - tic))
    sortedl = sorted(pairs, key=lambda x: x[2], reverse=False)
    print(sortedl[:10])
    print("SECONDS: %f" % (time.time() - tic))
    gen_csv_from_tuples("link_files/results_link_euc.csv",
                        ["IdAuthor1", "IdAuthor2", "Score"], sortedl)
def store_dictio_to_file():
    """Flatten the global dictio_of_results into combination.csv.

    One row per (user_a, user_b) with a column for every metric in the
    global lst_res crossed with the "_euc"/"_dis" suffixes; -1 marks a
    metric that is missing for that pair.

    Fix: status.update_numbar() was called without the matching
    status.create_numbar()/end_numbar() that every sibling function uses
    around its loop; both calls are now in place.
    """
    global dictio_of_results
    lst = []
    headers = ['user_a', 'user_b'] + [x + y for x in lst_res for y in ["_euc", "_dis"]]
    total = len(dictio_of_results)
    status.create_numbar(100, total)
    for indi, k1 in enumerate(list(dictio_of_results.keys())):
        status.update_numbar(indi, total)
        for k2 in dictio_of_results[k1].keys():
            temp_lst = [k1, k2]
            for k3 in lst_res:
                for k4 in ["_euc", "_dis"]:
                    # -1 is the sentinel for "metric not computed for this pair".
                    temp_lst.append(dictio_of_results[k1][k2].get(k3 + k4, -1))
            lst.append(tuple(temp_lst))
    status.end_numbar()
    gen_csv_from_tuples("combination.csv", headers, lst)
def process_emails_euclidean(dictio_of_users, dictio_of_emails):
    """Compute pairwise euclidean distances between user email vectors,
    write them into email_euc_score.dat and a sorted CSV."""
    tic = time.time()
    num_users = len(dictio_of_users)
    num_emails = len(dictio_of_emails)
    matrix_map = np.memmap('email_files/email_matrix_map.dat', dtype=np.uint8,
                           shape=(num_users, num_emails))
    lst_users = list(dictio_of_users.keys())
    lst_res = []
    print(matrix_map.shape)
    print("ESTIMATED SIZE OF MATRIX: %f GB" % (num_users * num_users * 4 / (1024**3)))
    status.create_numbar(100, num_users)
    score_map = np.memmap('email_files/email_euc_score.dat', dtype=float,
                          mode='w+', shape=(num_users, num_users))
    for a in range(num_users):
        status.update_numbar(a, num_users)
        # Widen to int32 so the subtraction cannot wrap around in uint8.
        vec_a = np.array(matrix_map[a], dtype=np.int32)
        for b in range(a + 1, num_users):
            vec_b = np.array(matrix_map[b], dtype=np.int32)
            euc_score = np.linalg.norm(vec_a - vec_b)
            lst_res.append((lst_users[a], lst_users[b], euc_score))
            score_map[a][b] = euc_score
    lst_res = sorted(lst_res, key=lambda x: x[2], reverse=False)
    gen_csv_from_tuples("email_files/results_email_euc_alt.csv",
                        ["IdAuthor1", "IdAuthor2", "Score"], lst_res)
    status.end_numbar()
    print("Flushing...")
    score_map.flush()
    print("SECONDS: %f" % (time.time() - tic))
def join_euc_results_csv():
    """Stream each per-metric "*_euc" result CSV into the global
    dictio_of_results as {user_a: {user_b: {metric + "_euc": score}}}.

    Relies on the module-level globals lst_res (metric names) and
    dictio_len (pre-computed file lengths) — presumably populated
    elsewhere in this module; verify before reuse.
    """
    global dictio_of_results
    limit = 1000000  # 1 million — only used by the commented-out early-exit below
    #dictio_of_results = {}
    for res in lst_res:
        tic = time.time()
        filename = res + "_files/new_results_" + res + "_euc.csv"
        # Check if file exists
        if not check_file_attr(filename):
            continue
        print("[-] Getting file length...")
        #filelen = file_len(filename)
        # Length comes from the dictio_len global instead of rescanning the file.
        filelen = dictio_len[res]
        indi = 0
        status.create_numbar(100, filelen)
        with open(filename) as f:
            for line in csv.reader(f, delimiter=',', quotechar="'", quoting=csv.QUOTE_MINIMAL):
                indi += 1
                status.update_numbar(indi, filelen)
                if indi == 1:  # The first line of the csv are titles
                    continue
                x = tuple(line)
                # Row layout: (user_a, user_b, score)
                i = (x[0], x[1], float(x[2]))
                #if (i[2] > limit): #If we surpass the limit we stop searching since it is ordered
                #break
                if not i[0] in dictio_of_results.keys():
                    dictio_of_results[i[0]] = {}
                if not i[1] in dictio_of_results[i[0]].keys():
                    dictio_of_results[i[0]][i[1]] = {}
                dictio_of_results[i[0]][i[1]][res + "_euc"] = i[2]
        status.end_numbar()
        print("[-] SECONDS: %f" % (time.time() - tic))
        print("[-] Length of dictionary: %d" % (len(dictio_of_results)))
def gen_bin_matrix_of_users(dictio_of_links, dictio_of_users):
    """Write a binary (num_users x num_links) membership matrix to disk:
    cell [u, l] is 1 iff user u references link l.

    Side effect: the values of dictio_of_links are overwritten in place
    with the column index assigned to each link.
    """
    num_users = len(dictio_of_users)
    num_links = len(dictio_of_links)
    # Assign every link a column index (mutates the caller's dict).
    for col, link in enumerate(dictio_of_links.keys()):
        dictio_of_links[link] = col
    print("ESTIMATED SIZE OF MATRIX: %f GB" % (num_users * num_links * 1 / (1024 ** 3)))
    matrix_map = np.memmap('link_files/link_matrix_map.dat', dtype=np.uint8,
                           mode='w+', shape=(num_users, num_links))
    status.create_numbar(100, num_users)
    for row, user in enumerate(dictio_of_users.keys()):
        status.update_numbar(row, num_users)
        indicator = np.zeros((num_links, 1), dtype=np.uint8)
        for link in dictio_of_users[user]:
            indicator[dictio_of_links[link]] = 1
        matrix_map[row] = np.squeeze(indicator)[:]
    status.end_numbar()
    print("Flushing...")
    matrix_map.flush()
def join_all_results_alt(self):
    """Streaming variant of join_all_results: read each result CSV row by
    row instead of loading it whole, storing two scores per user pair:
    {user1: {user2: {prefix+"coin": x, prefix+"uniq": y}}}.
    """
    dictio_of_results = dict()
    tic = time.time()
    toc = time.time()
    for indi, tup in enumerate(self.list_files):
        filename, prefix = tup[0], tup[1]
        print("[-] Going for file: %d - %s" % (indi, filename))
        # Hard-coded progress-bar total; presumably an upper bound on the
        # row count of the largest input file — TODO confirm / compute.
        filelen = 102800000
        with open(filename, 'r') as f:
            indj = 0
            status.create_numbar(100, filelen)
            for line in csv.reader(f, delimiter=',', quotechar="'", quoting=csv.QUOTE_MINIMAL):
                indj += 1
                if indj == 1:  # The first line of the csv are titles
                    continue
                entry = tuple(line)
                #lst_results = read_csv_list(filename)[1:]
                # order_users presumably returns the pair in canonical order —
                # verify against the class definition.
                user1, user2 = self.order_users(entry)
                status.update_numbar(indj, filelen)
                # '-1' inside an id marks an invalid/unknown user; skip the pair.
                if '-1' in user1 or '-1' in user2:
                    continue
                if not user1 in dictio_of_results.keys():
                    dictio_of_results[user1] = dict()
                if not user2 in dictio_of_results[user1].keys():
                    dictio_of_results[user1][user2] = dict()
                dictio_of_results[user1][user2][prefix + "coin"] = float(entry[2])
                dictio_of_results[user1][user2][prefix + "uniq"] = float(entry[3])
            status.end_numbar()
    #notify_mail("[+] Ended with file: %d - %s in %d seconds" % (indi, filename, time.time() - tic))
    return dictio_of_results
def link_value_removal_2(dictio_of_users, dictio_of_values):
    """Return links that point at one of the known forums and are referenced
    only by users of that same forum (no cross-forum references)."""
    print("[+] Highlighting links to other forums...")
    dictio_of_sites = {
        "hackforums.net": 0,
        "mpgh.net": 4,
        "raidforums.com": 12,
        "antichat.ru": 10,
        "blackhatworld.com": 8,
        "garage4hackers.com": 7,
        "greysec.net": 6,
        "stresserforums.net": 5,
        "kernelmode.info": 1,
        "safeskyhacks.com": 13,
        "offensivecommunity.net": 3
    }
    link_list = []
    count_external_refs = 0
    total = len(dictio_of_values)
    status.create_numbar(100, total)
    for index, (link, users) in enumerate(dictio_of_values.items()):
        status.update_numbar(index, total)
        for link_site, site_num in dictio_of_sites.items():
            if link_site not in link:
                continue
            # User ids end in "...[<site-number>]"; extract that number and
            # look for any referencing user from a different forum.
            has_external = False
            for user in users:
                user_site = int(user[len(user) - user[::-1].find('['):-1])
                if site_num != user_site:
                    has_external = True
                    break
            if not has_external:
                link_list.append(link)
            break  # only the first matching site is considered
    status.end_numbar()
    return link_list
def get_most_important():
    """For the top-100 weighted pairs, collect the values both users share
    under each metric in lst_res2 and write them to croos_val.csv."""
    lst = read_csv_list("weighted_average.csv")[1:]
    dictio_lst = {}
    status.create_numbar(100, len(lst_res2))
    for indi, metric in enumerate(lst_res2):
        status.update_numbar(indi, len(lst_res2))
        dictio_lst[metric] = read_csv_list(metric + "_files/user_to_" + metric + ".csv")[1:]
    status.end_numbar()
    final_lst = []
    num = 100  # only the top-ranked pairs are cross-validated
    status.create_numbar(100, num)
    for indi, row in enumerate(lst[:num]):
        status.update_numbar(indi, num)
        userilist = [row[0], row[1], row[2]]
        for key in lst_res2:
            u1, u2 = find_user_list(row[0], row[1], dictio_lst[key])
            if u1 is None or u2 is None:
                continue
            # u1/u2 are sets of values; keep what both users have in common.
            userilist += list(u1.intersection(u2))
        final_lst.append(tuple(userilist))
    status.end_numbar()
    gen_csv_from_tuples("croos_val.csv", ['user_a', 'user_b', 'metric', 'similar_vals'], final_lst)
def clean_usernames():
    """Filter similar-username candidates by post count, then pull each
    surviving user's posts from the crimebb database and store their top
    TF-IDF terms in tfidf.csv (with a 100-user preview in tfidf_prov.csv).

    Assumes: similar_usernames_full.csv rows are (username, forum, ...),
    all_posts_all_users.csv rows are (username, forum, post_count), and
    db_username/db_password are module globals — TODO confirm schemas.
    """
    lst = read_csv_list("similar_usernames_full.csv")[1:]
    lst2 = read_csv_list("all_posts_all_users.csv")
    # Drop rows with an empty username.
    lst = [x for x in lst if x[0] != '']
    lst2 = [x for x in lst2 if x[0] != '']
    total = 0
    count = 0
    dictio = {}
    results = []
    threshold = 20  # minimum post count for a (user, forum) to be kept
    for i in lst2:
        #print (i[0:2], i[2])
        dictio[i[0]] = {}
    for i in lst2:
        #print (i[0:2], i[2])
        dictio[i[0]][i[1]] = int(i[2])
    # Candidates with no post-count data at all.
    not_both = [x for x in lst if x[0] not in dictio.keys()]
    print(len(dictio), len(list(set([x[0] for x in lst]))), len(not_both), len(not_both)+ len(dictio))
    for i in lst:
        boolean = True
        for j in i[1:]:
            # boolean can only turn False when i[0] is present in dictio,
            # so the del below never raises KeyError for this row.
            if i[0] in dictio.keys() and dictio[i[0]][j] < threshold:
                boolean = False
        if boolean:
            #print (i)
            results += [i]
        else:
            del dictio[i[0]]
    # Tally how many kept candidates span 3+ / 4+ forums.
    for i in results:
        if (len (i) > 3):
            count += 1
        if (len (i) > 4):
            total += 1
    print ("At least two: %d" % (len(dictio)), "At least three: %d" % (count) , "At least four: %d" % (total))
    conn = psycopg2.connect(database="crimebb", user=db_username, password=db_password, host="127.0.0.1", port="5432")
    print("Database Connected....")
    rows_processed = []
    status.create_numbar(100, len(dictio))
    for indi, i in enumerate(dictio.keys()):
        #print (multiprocessing.current_process(), "%0.2f %%" % ( indi * 100 / len(lst)))
        status.update_numbar(indi, len(dictio))
        for key in dictio[i].keys():
            cur = conn.cursor()
            # Parameterized query: i is the username, key the forum/site id.
            cur.execute("""SELECT "Post"."Content" from "Post" JOIN "Member" ON "Post"."Author" = "Member"."IdMember" WHERE ("Member"."Username" = %s) AND "Member"."Site" = %s;""", (i, int(key)))
            rows = [row[0] for row in cur.fetchall()]
            #print (rows[0])
            tfidf = tf_idf(rows)
            tfidf = sorted(tfidf, key=lambda x: x[-1], reverse=True)
            #print(i, key, tfidf[:3])
            # NOTE(review): the comprehension reuses `i` as its inner variable;
            # in Python 3 it does not leak, so the outer username `i` is intact.
            tfidf = [i for j in tfidf[:50] for i in j]
            dictio[i][key] = tuple(tfidf)
            #print (i[0], j, count)
        # Checkpoint: dump a 100-user preview once the 101st user is reached.
        if indi == 100:
            k = list(dictio.keys())
            rows_processed = [(user, forum) + dictio[user][forum] for user in k[:100] for forum in dictio[user].keys()]
            gen_csv_from_tuples("tfidf_prov.csv", [""], rows_processed)
    status.end_numbar()
    # Final dump of every processed (user, forum) with its TF-IDF terms.
    rows_processed = [(user, forum) + dictio[user][forum] for user in dictio.keys() for forum in dictio[user].keys()]
    gen_csv_from_tuples("tfidf.csv", [""], rows_processed)
    conn.close()