예제 #1
0
def gen_new_matrix_of_users(dictio_of_skypes, dictio_of_users,
                            dictio_of_values):
    """Materialize a (num_users x num_skypes) uint8 feature matrix on disk.

    Row u corresponds to one user, column s to one skype handle; cell
    (u, s) holds dictio_of_values[s] when the user mentions that skype,
    else 0.  The matrix is written to 'skype_files/skype_matrix_map.dat'.

    NOTE: dictio_of_skypes is mutated in place — its values are replaced
    by column indexes (callers relying on the old values must copy first).
    """
    num_users = len(dictio_of_users)
    num_skypes = len(dictio_of_skypes)
    # Replace each skype's value with its column index in the matrix.
    for indk, skype in enumerate(dictio_of_skypes):
        dictio_of_skypes[skype] = indk
    print("ESTIMATED SIZE OF MATRIX: %f GB" % (num_users * num_skypes * 1 /
                                               (1024**3)))
    matrix_map = np.memmap('skype_files/skype_matrix_map.dat',
                           dtype=np.uint8,
                           mode='w+',
                           shape=(num_users, num_skypes))
    status.create_numbar(100, num_users)
    for ind, user in enumerate(dictio_of_users):
        status.update_numbar(ind, num_users)
        # Build the row directly as a 1-D vector.  The original used an
        # (N, 1) array plus np.squeeze, which yields a 0-d array (and
        # breaks the row assignment) when num_skypes == 1.
        base = np.zeros(num_skypes, dtype=np.uint8)
        for skype in dictio_of_users[user]:
            base[dictio_of_skypes[skype]] = dictio_of_values[skype]
        matrix_map[ind] = base
    status.end_numbar()
    print("Flushing...")
    matrix_map.flush()
예제 #2
0
def process_skypes_distance(dictio_of_users, dictio_of_skypes):
    """Fill the upper triangle of 'skype_dis_score.dat' with the dot
    product (shared-skype score) of every pair of user rows read from
    the on-disk user x skype matrix, and report elapsed time.
    """
    start = time.time()
    user_count = len(dictio_of_users)
    skype_count = len(dictio_of_skypes)

    # Existing matrix produced earlier; default memmap mode ('r+').
    user_matrix = np.memmap('skype_files/skype_matrix_map.dat',
                            dtype=np.uint8,
                            shape=(user_count, skype_count))

    print(user_matrix.shape)
    print("ESTIMATED SIZE OF MATRIX: %f GB" % (user_count * user_count * 4 /
                                               (1024**3)))
    status.create_numbar(100, user_count)
    scores = np.memmap('skype_files/skype_dis_score.dat',
                       dtype=np.uint32,
                       mode='w+',
                       shape=(user_count, user_count))
    for row in range(user_count):
        status.update_numbar(row, user_count)
        # Widen to uint32 so the dot product accumulates without
        # overflowing the uint8 storage type.
        vec_a = np.array(user_matrix[row], dtype=np.uint32)
        for col in range(row + 1, user_count):
            vec_b = np.array(user_matrix[col], dtype=np.uint32)
            scores[row][col] = np.dot(vec_a, vec_b)
    status.end_numbar()
    print("Flushing...")
    scores.flush()
    print("SECONDS: %f" % (time.time() - start))
예제 #3
0
def gen_vector_for_pairs(word_dictio, ind_features, weights):
    """Build the Keras input matrix for user pairs.

    For each pair (u1, u2, w) in `weights`, emits one row of length
    2 * len(word_dictio) + 1: the first half holds u1's word weights,
    the second half u2's, and the final element the pair weight w.
    Rows are written to the memmap file 'input_keras.dat'.

    In ind_features[u], positions 3, 5, 7, ... hold words and
    4, 6, 8, ... hold the corresponding weights.
    """
    words_per_user = len(word_dictio)
    num_words = words_per_user * 2 + 1
    num_pairs = len(weights)
    print("ESTIMATED SIZE OF MATRIX: %f GB" %
          (num_words * num_pairs * 4 / (1024 ** 3)))
    matrix_map = np.memmap('input_keras.dat', dtype=np.single, mode='w+',
                           shape=(num_pairs, num_words))
    status.create_numbar(100, num_pairs)
    for i in range(num_pairs):
        status.update_numbar(i, num_pairs)
        v = np.zeros((num_words))
        u1, u2 = weights[i][0], weights[i][1]
        for j in range(len(ind_features[u1][4::2])):
            word = ind_features[u1][2 * j + 3]
            weight = ind_features[u1][2 * j + 4]
            v[int(word_dictio[word])] = weight
        # BUG FIX: the second half of the vector must come from u2's
        # features; the original copied u1's features twice.
        for j in range(len(ind_features[u2][4::2])):
            word = ind_features[u2][2 * j + 3]
            weight = ind_features[u2][2 * j + 4]
            v[words_per_user + int(word_dictio[word])] = weight
        v[num_words - 1] = weights[i][2]
        matrix_map[i] = v[:]
    status.end_numbar()
예제 #4
0
    def join_all_results(self):
        """Merge all per-metric CSV result files into one nested dict.

        For each (filename, prefix) pair in self.list_files, reads the
        CSV (header skipped) and records
        dictio_of_results[user1][user2][prefix] = float(score).
        Returns the merged dictionary.
        """
        dictio_of_results = dict()
        tic = time.time()
        for indi, tup in enumerate(self.list_files):
            filename, prefix = tup[0], tup[1]
            print("[-] Going for file: %d - %s" % (indi, filename))

            lst_results = read_csv_list(filename)[1:]  # drop header row
            filelen = len(lst_results)
            print("[+] Sorting list")
            # Sort by the concatenation of the two user-id columns.
            lst_results = sorted(lst_results, key=lambda x: x[0] + x[1])
            status.create_numbar(100, filelen)
            for indj, entry in enumerate(lst_results):
                status.update_numbar(indj, filelen)
                # setdefault replaces the original double `in .keys()`
                # membership checks.
                inner = dictio_of_results.setdefault(entry[0], dict())
                inner.setdefault(entry[1], dict())[prefix] = float(entry[2])
            status.end_numbar()

            print("[+] Ended with file: %d - %s in %d seconds" %
                  (indi, filename, time.time() - tic))

        return dictio_of_results
예제 #5
0
def get_max():
    """Scan the global dictio_of_results and return the largest score
    together with the (k1, k2, k3) key path that holds it; keys are
    (None, None, None) when no score exceeds zero."""
    best_value = 0
    best_keys = (None, None, None)
    total = len(dictio_of_results)
    status.create_numbar(100, total)
    for indi, (k1, inner) in enumerate(dictio_of_results.items()):
        status.update_numbar(indi, total)
        for k2, metrics in inner.items():
            for k3, value in metrics.items():
                if value > best_value:
                    best_value = value
                    best_keys = (k1, k2, k3)
    status.end_numbar()
    return (best_value,) + best_keys
예제 #6
0
def get_valid_btc_addresses(dictio_of_btcs):
    """Check each BTC address for a first-seen timestamp and record
    whether it appears on-chain.

    Writes (address, 1-or-0) tuples to 'btc_files/valid_btcs.csv'.
    Sleeps one second per address to rate-limit the remote lookup.
    """
    results = []
    total = len(dictio_of_btcs)
    status.create_numbar(100, total)
    for indi, addr in enumerate(dictio_of_btcs):
        status.update_numbar(indi, total)
        # 1 if the address has ever been seen, else 0.
        appears = int(get_first_seen(addr) is not None)
        results.append((addr, appears))
        time.sleep(1)

    status.end_numbar()
    gen_csv_from_tuples("btc_files/valid_btcs.csv", ["BTC", "Valid"], results)
예제 #7
0
def gen_dictio_from_csv():
    """Load 'combination.csv' into the global dictio_of_results.

    CSV layout: user1, user2, metric1, metric2, ...; the header row
    supplies the metric names.  After loading,
    dictio_of_results[u1][u2][metric_name] = value.
    Returns the raw header-stripped row list.
    """
    global dictio_of_results
    lst = read_csv_list("combination.csv")
    headers, lst = lst[0][2:], lst[1:]
    status.create_numbar(100, len(lst))
    for indi, row in enumerate(lst):
        status.update_numbar(indi, len(lst))
        # setdefault replaces the original double `in .keys()` checks,
        # and the inner dict lookup is hoisted out of the column loop.
        inner = dictio_of_results.setdefault(row[0], {}).setdefault(row[1], {})
        for indj, value in enumerate(row[2:]):
            inner[headers[indj]] = value
    status.end_numbar()
    return lst
예제 #8
0
def new_read_results_euc(dictio_of_users, dictio_of_skypes):
    """Export pairwise skype scores to CSV.

    Phase 1: scan the upper triangle of the on-disk dot-product matrix
    ('skype_dis_score.dat') and write all non-zero pairs, sorted by
    score descending, to 'new_results_skype_dis.csv'.
    Phase 2: look up the euclidean score ('skype_euc_score.dat') for
    those same pairs and write them, sorted ascending, to
    'new_results_skype_euc.csv'.
    """
    print("New user extraction...")
    tic = time.time()
    num_users = len(dictio_of_users)
    lst_users = list(dictio_of_users.keys())
    lst = []
    res2 = np.memmap('skype_files/skype_dis_score.dat',
                     dtype=np.uint32,
                     mode='r',
                     shape=(num_users, num_users))
    status.create_numbar(100, num_users)
    for i in range(num_users):
        status.update_numbar(i, num_users)
        # Vectorized scan: np.nonzero on the row tail replaces the
        # original per-element Python loop (same pairs, same order).
        tail = res2[i, i + 1:]
        for off in np.nonzero(tail)[0]:
            lst.append((lst_users[i], lst_users[i + 1 + off], tail[off]))
    status.end_numbar()
    print("SECONDS: %f" % (time.time() - tic))
    print("Old length: %d" % (len(lst)))
    # Only non-zero scores were collected above, so the original
    # re-filter (x[2] > 0) was a no-op; both length reports are kept
    # to preserve the console output.
    print("New length: %d" % (len(lst)))
    sortedl = sorted(lst, key=lambda x: x[2], reverse=True)
    print("SECONDS: %f" % (time.time() - tic))
    gen_csv_from_tuples("skype_files/new_results_skype_dis.csv",
                        ["IdAuthor1", "IdAuthor2", "Score"], sortedl)

    lst = []
    res3 = np.memmap('skype_files/skype_euc_score.dat',
                     dtype=float,
                     mode='r',
                     shape=(num_users, num_users))
    # Inverse index: user id -> row/column in the score matrices.
    dic_inv = {user: indi for indi, user in enumerate(lst_users)}
    status.create_numbar(100, len(sortedl))

    for indi, row in enumerate(sortedl):
        status.update_numbar(indi, len(sortedl))
        u1, u2 = row[0], row[1]
        ind1, ind2 = dic_inv[u1], dic_inv[u2]
        lst.append((u1, u2, res3[ind1][ind2]))
    status.end_numbar()
    print("SECONDS: %f" % (time.time() - tic))
    sortedl = sorted(lst, key=lambda x: x[2], reverse=False)
    print(sortedl[:10])
    print("SECONDS: %f" % (time.time() - tic))
    gen_csv_from_tuples("skype_files/new_results_skype_euc.csv",
                        ["IdAuthor1", "IdAuthor2", "Score"], sortedl)
예제 #9
0
def read_results_euc(dictio_of_users, dictio_of_links):
    """Dump every upper-triangle euclidean link score to
    'link_files/results_link_euc.csv', sorted ascending by score."""
    tic = time.time()

    total = len(dictio_of_users)
    users = list(dictio_of_users.keys())
    pairs = []
    scores = np.memmap('link_files/link_euc_score.dat', dtype=float,
                       mode='r', shape=(total, total))
    status.create_numbar(100, total)
    for a in range(total):
        status.update_numbar(a, total)
        for b in range(a + 1, total):
            pairs.append((users[a], users[b], scores[a][b]))
    status.end_numbar()
    print("SECONDS: %f" % (time.time() - tic))
    ordered = sorted(pairs, key=lambda row: row[2])
    print(ordered[:10])
    print("SECONDS: %f" % (time.time() - tic))
    gen_csv_from_tuples("link_files/results_link_euc.csv",
                        ["IdAuthor1", "IdAuthor2", "Score"], ordered)
예제 #10
0
def process_emails_euclidean(dictio_of_users, dictio_of_emails):
    """Compute pairwise euclidean distances between user email vectors.

    Reads the user x email presence matrix from disk, fills the upper
    triangle of 'email_euc_score.dat' with ||row_i - row_j|| and writes
    the (user_i, user_j, score) triples, sorted ascending, to
    'results_email_euc_alt.csv'.
    """
    tic = time.time()
    num_users = len(dictio_of_users)
    num_emails = len(dictio_of_emails)
    matrix_map = np.memmap('email_files/email_matrix_map.dat',
                           dtype=np.uint8,
                           shape=(num_users, num_emails))
    lst_users = list(dictio_of_users.keys())
    lst_res = []
    print(matrix_map.shape)
    print("ESTIMATED SIZE OF MATRIX: %f GB" % (num_users * num_users * 4 /
                                               (1024**3)))
    status.create_numbar(100, num_users)
    res2 = np.memmap('email_files/email_euc_score.dat',
                     dtype=float,
                     mode='w+',
                     shape=(num_users, num_users))
    for row in range(num_users):
        status.update_numbar(row, num_users)
        # Widen to int32 so the subtraction below cannot wrap around
        # in uint8 arithmetic.
        vec_a = np.array(matrix_map[row], dtype=np.int32)
        for col in range(row + 1, num_users):
            vec_b = np.array(matrix_map[col], dtype=np.int32)
            distance = np.linalg.norm(vec_a - vec_b)
            lst_res.append((lst_users[row], lst_users[col], distance))
            res2[row][col] = distance
    lst_res = sorted(lst_res, key=lambda entry: entry[2])
    gen_csv_from_tuples("email_files/results_email_euc_alt.csv",
                        ["IdAuthor1", "IdAuthor2", "Score"], lst_res)
    status.end_numbar()
    print("Flushing...")
    res2.flush()

    print("SECONDS: %f" % (time.time() - tic))
예제 #11
0
def join_euc_results_csv():
    """Merge every per-metric euclidean result CSV into the global
    dictio_of_results.

    For each metric name in the global lst_res, reads
    '<res>_files/new_results_<res>_euc.csv' (skipping missing files and
    the header row) and stores
    dictio_of_results[user1][user2]['<res>_euc'] = float(score).
    """
    global dictio_of_results
    for res in lst_res:
        tic = time.time()
        filename = res + "_files/new_results_" + res + "_euc.csv"
        # Skip metrics whose result file does not exist.
        if not check_file_attr(filename):
            continue
        print("[-] Getting file length...")
        # Precomputed line count, used only to size the progress bar.
        filelen = dictio_len[res]
        indi = 0
        status.create_numbar(100, filelen)
        with open(filename) as f:
            for line in csv.reader(f,
                                   delimiter=',',
                                   quotechar="'",
                                   quoting=csv.QUOTE_MINIMAL):
                indi += 1
                status.update_numbar(indi, filelen)
                if indi == 1:
                    # The first line of the csv holds the column titles.
                    continue
                user1, user2, score = line[0], line[1], float(line[2])
                # setdefault replaces the original double `in .keys()`
                # membership checks; unused `limit` and dead commented
                # code removed.
                inner = dictio_of_results.setdefault(user1, {})
                inner.setdefault(user2, {})[res + "_euc"] = score

            status.end_numbar()
            print("[-] SECONDS: %f" % (time.time() - tic))
            print("[-] Length of dictionary: %d" % (len(dictio_of_results)))
예제 #12
0
def gen_bin_matrix_of_users(dictio_of_links, dictio_of_users):
    """Materialize a binary (num_users x num_links) uint8 matrix on disk.

    Cell (u, l) is 1 when user u posted link l, else 0.  The matrix is
    written to 'link_files/link_matrix_map.dat'.

    NOTE: dictio_of_links is mutated in place — its values are replaced
    by column indexes (callers relying on the old values must copy first).
    """
    num_users = len(dictio_of_users)
    num_links = len(dictio_of_links)
    # Replace each link's value with its column index in the matrix.
    for indk, link in enumerate(dictio_of_links):
        dictio_of_links[link] = indk
    print("ESTIMATED SIZE OF MATRIX: %f GB" % (num_users * num_links * 1 /
                                               (1024 ** 3)))
    matrix_map = np.memmap('link_files/link_matrix_map.dat', dtype=np.uint8,
                           mode='w+', shape=(num_users, num_links))
    status.create_numbar(100, num_users)
    for ind, user in enumerate(dictio_of_users):
        status.update_numbar(ind, num_users)
        # Build the row directly as a 1-D vector.  The original used an
        # (N, 1) array plus np.squeeze, which yields a 0-d array (and
        # breaks the row assignment) when num_links == 1.
        base = np.zeros(num_links, dtype=np.uint8)
        for link in dictio_of_users[user]:
            base[dictio_of_links[link]] = 1
        matrix_map[ind] = base
    status.end_numbar()
    print("Flushing...")
    matrix_map.flush()
예제 #13
0
    def join_all_results_alt(self):
        """Stream-merge every per-metric CSV file into one nested dict.

        Unlike join_all_results, rows are read line-by-line instead of
        being loaded and sorted, and each row contributes two scores:
        '<prefix>coin' (column 2) and '<prefix>uniq' (column 3).  User
        pairs are normalized via self.order_users; pairs whose ids
        contain '-1' are skipped.  Returns the merged dictionary.
        """
        dictio_of_results = dict()
        for indi, tup in enumerate(self.list_files):
            filename, prefix = tup[0], tup[1]
            print("[-] Going for file: %d - %s" % (indi, filename))
            # Hard-coded row-count estimate, used only to size the
            # progress bar.
            filelen = 102800000
            with open(filename, 'r') as f:
                status.create_numbar(100, filelen)
                reader = csv.reader(f, delimiter=',', quotechar="'",
                                    quoting=csv.QUOTE_MINIMAL)
                for indj, line in enumerate(reader, start=1):
                    if indj == 1:
                        # Header row: column titles only.
                        continue
                    entry = tuple(line)
                    user1, user2 = self.order_users(entry)
                    status.update_numbar(indj, filelen)
                    if '-1' in user1 or '-1' in user2:
                        continue

                    per_user = dictio_of_results.setdefault(user1, dict())
                    pair = per_user.setdefault(user2, dict())
                    pair[prefix + "coin"] = float(entry[2])
                    pair[prefix + "uniq"] = float(entry[3])

                status.end_numbar()
        return dictio_of_results
예제 #14
0
def link_value_removal_2(dictio_of_users, dictio_of_values):
    """Collect links that point at a known forum and are only ever
    referenced from inside that same forum.

    A link is kept when every user who posted it carries a site tag
    (the number inside the trailing '[N]' of the user id) equal to the
    forum the link points at; links with any external reference are
    dropped.  Returns the list of kept links.
    """
    print("[+] Highlighting links to other forums...")
    # Known forum domains mapped to their internal site numbers.
    dictio_of_sites = {
        "hackforums.net": 0,
        "mpgh.net": 4,
        "raidforums.com": 12,
        "antichat.ru": 10,
        "blackhatworld.com": 8,
        "garage4hackers.com": 7,
        "greysec.net": 6,
        "stresserforums.net": 5,
        "kernelmode.info": 1,
        "safeskyhacks.com": 13,
        "offensivecommunity.net": 3
    }
    link_list = []
    total = len(dictio_of_values)
    status.create_numbar(100, total)
    for index, (link, users) in enumerate(dictio_of_values.items()):
        status.update_numbar(index, total)
        for link_site, site_num in dictio_of_sites.items():
            if link_site not in link:
                continue
            # any() short-circuits exactly like the original flag+break:
            # stop at the first poster whose site tag differs.
            ext_reference = any(
                int(user[len(user) - user[::-1].find('['):-1]) != site_num
                for user in users)
            if not ext_reference:
                link_list.append(link)
            break

    status.end_numbar()
    return link_list
예제 #15
0
def get_most_important():
    """Cross-validate the top-100 weighted-average user pairs.

    For each of the 100 highest-ranked rows in 'weighted_average.csv',
    looks up both users in every per-metric user table (globals
    lst_res2 / find_user_list) and appends the values the two users
    share.  Writes the result to 'croos_val.csv'.
    """
    ranked = read_csv_list("weighted_average.csv")[1:]
    per_metric = {}
    status.create_numbar(100, len(lst_res2))
    for pos, metric in enumerate(lst_res2):
        status.update_numbar(pos, len(lst_res2))
        per_metric[metric] = read_csv_list(
            metric + "_files/user_to_" + metric + ".csv")[1:]
    status.end_numbar()
    rows_out = []
    top_n = 100
    status.create_numbar(100, top_n)
    for pos, row in enumerate(ranked[:top_n]):
        status.update_numbar(pos, top_n)
        record = [row[0], row[1], row[2]]
        for metric in lst_res2:
            u1, u2 = find_user_list(row[0], row[1], per_metric[metric])
            if u1 is None or u2 is None:
                continue
            record.extend(u1.intersection(u2))
        rows_out.append(tuple(record))
    status.end_numbar()
    gen_csv_from_tuples("croos_val.csv",
                        ['user_a', 'user_b', 'metric', 'similar_vals'],
                        rows_out)
예제 #16
0
def clean_usernames():
	"""Filter similar usernames by post count and export per-site TF-IDF.

	Reads the similar-username candidates and per-(user, site) post
	counts, keeps only usernames whose post count on every listed site
	meets the threshold, then pulls each surviving user's posts from
	the CrimeBB database, computes TF-IDF per (user, site) and writes
	the flattened top-50 entries per pair to 'tfidf.csv' (with a
	provisional snapshot 'tfidf_prov.csv' written once after 100 users).

	NOTE(review): relies on module-level helpers read_csv_list, tf_idf,
	gen_csv_from_tuples and credentials db_username/db_password.
	"""
	lst = read_csv_list("similar_usernames_full.csv")[1:]
	lst2 = read_csv_list("all_posts_all_users.csv")
	# Drop rows whose username column is empty.
	lst = [x for x in lst if x[0] != '']
	lst2 = [x for x in lst2 if x[0] != '']
	total = 0
	count = 0
	dictio = {}
	results = []
	# Minimum posts per site for a username to be kept.
	threshold = 20
	# dictio[username][site] = post count (built in two passes).
	for i in lst2:
		#print (i[0:2], i[2])
		dictio[i[0]] = {}
	for i in lst2:
		#print (i[0:2], i[2])
		dictio[i[0]][i[1]] = int(i[2])	
	
	# Usernames in the similarity list that have no post counts at all.
	not_both = [x for x in lst if x[0] not in dictio.keys()]
	print(len(dictio), len(list(set([x[0] for x in lst]))), len(not_both), len(not_both)+ len(dictio))

	for i in lst:
		boolean = True
		for j in i[1:]:
			# Reject the username if any of its sites is below threshold.
			if i[0] in dictio.keys() and dictio[i[0]][j] < threshold:
				boolean = False
		if boolean:
			#print (i)
			results += [i]
		else:
			# Below threshold on some site: forget this username entirely.
			del dictio[i[0]]
	for i in results:
		# A row is (username, site1, site2, ...): more than 3 fields
		# means the name appears on at least three sites, more than 4
		# on at least four.
		if (len (i) > 3):
			count += 1
		if (len (i) > 4):
			total += 1

	print ("At least two: %d" % (len(dictio)), "At least three: %d" % (count) , "At least four: %d" % (total))
	conn = psycopg2.connect(database="crimebb", user=db_username, password=db_password,  host="127.0.0.1", port="5432")
	print("Database Connected....")
	rows_processed = []
	
	status.create_numbar(100, len(dictio))

	for indi, i in enumerate(dictio.keys()):
		#print (multiprocessing.current_process(), "%0.2f %%" % ( indi * 100 / len(lst)))
		status.update_numbar(indi, len(dictio))
		for key in dictio[i].keys():
			# NOTE(review): a fresh cursor per site is opened and never
			# closed until conn.close(); consider cur.close() here.
			cur = conn.cursor()
			cur.execute("""SELECT "Post"."Content"
			from "Post" JOIN "Member" ON "Post"."Author" = "Member"."IdMember"
			WHERE ("Member"."Username" = %s) AND "Member"."Site" = %s;""", (i, int(key)))
			rows = [row[0] for row in cur.fetchall()]
			#print (rows[0])
			# Rank this user's terms on this site and keep the top 50
			# entries, flattened into one tuple.
			tfidf = tf_idf(rows)
			tfidf = sorted(tfidf, key=lambda x: x[-1], reverse=True)
			#print(i, key, tfidf[:3])
			tfidf = [i for j in tfidf[:50] for i in j]
			dictio[i][key] = tuple(tfidf)
			#print (i[0], j, count)
		# Provisional dump after 100 users so partial results survive a
		# crash on long runs.
		if indi == 100:
			k = list(dictio.keys())
			rows_processed = [(user, forum) + dictio[user][forum] for user in k[:100] for forum in dictio[user].keys()]
			gen_csv_from_tuples("tfidf_prov.csv", [""], rows_processed)

	status.end_numbar()
	rows_processed = [(user, forum) + dictio[user][forum] for user in dictio.keys() for forum in dictio[user].keys()]
	gen_csv_from_tuples("tfidf.csv", [""], rows_processed)
	conn.close()