def extract_user_to_link_csv():
    """Extract every http/https URL each author posted and write them to
    ``link_files/user_to_link3.csv``.

    The SQL builds an ``Author[Site]`` key, pulls links out of
    ``"Post"."Content"`` with ``regexp_matches``, lower-cases and groups
    them per author, and returns one comma-separated string of links per
    author.  Rows whose author id is the sentinel ``-1`` are dropped on
    the Python side.
    """
    query = """WITH "A" AS (SELECT CAST("Post"."Author" AS text) || '[' || CAST("Post"."Site" AS text) || ']' as "Author", regexp_matches( "Content", '(http[s]?://(?:[a-zA-Z]|[0-9]|[$-\)+-Z^-_@.&+]|[!\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+)', 'g') AS "link" FROM "Post" WHERE "Content" ~ '(http[s]?://(?:[a-zA-Z]|[0-9]|[$-\)+-Z^-_@.&+]|[!\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+)'), "B" AS (SELECT "Author", lower("link"[1]) as "link", count(*) as "repetitions" FROM "A" GROUP BY "Author", "link" ) SELECT "B"."Author", string_agg("B"."link", ', ') as "reps" FROM "B" GROUP BY "B"."Author";"""
    rows = make_query(query)
    # Expand the aggregated "link1, link2, ..." string into one tuple
    # element per link: (author, link1, link2, ...).  Drop sentinel -1.
    rows = [row[:1] + tuple(row[1].split(", "))
            for row in rows if row[0] != -1]
    print(len(rows))
    gen_csv_from_tuples("link_files/user_to_link3.csv",
                        ["IdAuthor", "link"], rows)
def extract_user_to_email_csv():
    """Extract every e-mail address each author posted and write them to
    ``email_files/user_to_email.csv``.

    Addresses that carry a leading ``***LINK***`` marker (added by an
    earlier processing stage, presumably — confirm upstream) have the
    marker stripped, and each author's address list is de-duplicated
    (order not preserved).
    """
    query = """WITH "A" AS (SELECT CAST("Post"."Author" AS text) || '[' || CAST("Post"."Site" AS text) || ']' as "Author", regexp_matches( "Content", '(?:(?![*]))([A-Za-z0-9\._%-\)\+]+@[A-Za-z0-9\.-]+[.][A-Za-z]+)', 'g') AS "email" FROM "Post" WHERE "Content" ~ '(?:(?![*]))([A-Za-z0-9\._%-\)\+]+@[A-Za-z0-9\.-]+[.][A-Za-z]+)'), "B" AS (SELECT "Author", lower("email"[1]) as "email", count(*) as "repetitions" FROM "A" GROUP BY "Author", "email" ) SELECT "B"."Author", string_agg("B"."email", ', ') as "reps" FROM "B" GROUP BY "B"."Author";"""
    rows = make_query(query)
    # One mutable list per author: [author, email1, email2, ...]; drop -1.
    rows = [list(row[:1] + tuple(row[1].split(", ")))
            for row in rows if row[0] != -1]
    marker = "***LINK***"
    mlen = len(marker)
    for i in range(len(rows)):
        for col in range(1, len(rows[i])):
            # Strip the marker only when something follows it (value must
            # be strictly longer than the marker), as in the original check.
            if len(rows[i][col]) > mlen and rows[i][col][:mlen] == marker:
                rows[i][col] = rows[i][col][mlen:]
        # De-duplicate this author's addresses.
        rows[i] = (rows[i][0],) + tuple(set(rows[i][1:]))
    print(len(rows))
    gen_csv_from_tuples("email_files/user_to_email.csv",
                        ["IdAuthor", "email"], rows)
def extract_user_to_ip_csv():
    """Extract every IPv4 address each author posted and write them to
    ``ip_files/user_to_ip.csv``.

    NOTE(review): the original assigned ``query`` twice; the first
    version (which also aggregated a repetition count into the string)
    was dead code, immediately overwritten.  Only the live query is kept.
    """
    query = """WITH "A" AS (SELECT CAST("Post"."Author" AS text) || '[' || CAST("Post"."Site" AS text) || ']' as "Author", regexp_matches( "Content", '(25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.(25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.(25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.(25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)', 'g') AS "ip" FROM "Post" WHERE "Content" ~ '(25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.(25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.(25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.(25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)'), "B" AS (SELECT "Author", "ip", count(*) as "repetitions" FROM "A" GROUP BY "Author", "ip" ) SELECT "B"."Author", string_agg("B"."ip"[1] || '.' ||"B"."ip"[2] || '.' ||"B"."ip"[3]|| '.' ||"B"."ip"[4], ', ') as "reps" FROM "B" GROUP BY "B"."Author";"""
    rows = make_query(query)
    print(len(rows))  # total rows returned by the query
    # Expand "ip1, ip2, ..." into one tuple element per IP; drop -1.
    rows = [row[:1] + tuple(row[1].split(", "))
            for row in rows if row[0] != -1]
    print(len(rows))  # rows remaining after the sentinel filter
    gen_csv_from_tuples("ip_files/user_to_ip.csv", ["IdAuthor", "IP"], rows)
def store_normalized_results(self, users, normalized_matrix, filename):
    """Write each user pair alongside its normalized feature values.

    One CSV row per pair: (IdAuthor1, IdAuthor2, feature values...),
    with one feature column per entry in ``self.list_files``.
    """
    header = ["IdAuthor1", "IdAuthor2"]
    header += [prefix for _, prefix in self.list_files]
    records = []
    for pair, values in zip(users, normalized_matrix):
        records.append((pair[0], pair[1]) + tuple(values))
    gen_csv_from_tuples(filename, header, records)
def analyze_connected_components(self):
    """For each connected component of the user graph with more than 7
    nodes, write a CSV of its users sorted by degree (descending).

    Uses ``G.subgraph(c) for c in nx.connected_components(G)`` — the
    documented replacement for ``nx.connected_component_subgraphs``,
    which was deprecated in NetworkX 2.1 and removed in 2.4.
    """
    G = self.generate_graph()
    components = (G.subgraph(c) for c in nx.connected_components(G))
    for i, graph in enumerate(components):
        num_nodes = graph.number_of_nodes()
        print("[-] Going for %d with %d" % (i, num_nodes))
        if num_nodes > 7:
            # (user, degree) for every node, highest-degree first.
            graph_lst = [(user, graph.degree(user)) for user in graph.nodes()]
            graph_lst.sort(key=lambda x: x[1], reverse=True)
            gen_csv_from_tuples("graphs_info/%d-%d.csv" % (num_nodes, i),
                                ["User", "#"], graph_lst)
def extract_ip_to_usage():
    """Count occurrences of each IPv4 address across all post contents
    and write the totals to ``ip_files/ip_count.csv``.

    The counting is done entirely in SQL; each result row is already
    ``(ip, repetitions)``.
    """
    query = """WITH "A" AS (SELECT regexp_matches( "Content", '(25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.(25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.(25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.(25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)', 'g') AS "ip" FROM "Post" WHERE "Content" ~ '(25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.(25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.(25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.(25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)'), "B" AS (SELECT "ip", count(*) as "repetitions" FROM "A" GROUP BY "ip" ) SELECT "B"."ip"[1] || '.' ||"B"."ip"[2] || '.' ||"B"."ip"[3]|| '.' ||"B"."ip"[4] as "ip", "B"."repetitions" FROM "B";"""
    rows = make_query(query)
    # Printed twice in the original, bracketing a now-disabled filter
    # step; kept to preserve identical console output.
    print(len(rows))
    print(len(rows))
    gen_csv_from_tuples("ip_files/ip_count.csv", ["IP", "Reps"], rows)
def get_multfs(self, join_dict, matrix):
    """Score every user pair with four MultFS variants and dump the
    high-scoring pairs to ``multfs.csv``.

    ``matrix`` is a 2-D array with one row per pair in ``join_dict``
    (iteration order of ``join_dict.keys()`` must match the row order —
    assumed, confirm against the caller).  Each variant weights the raw
    counts by a different IDF flavour, sums per row, then min-max
    normalizes to [0, 100].  Pairs whose smooth-IDF-1 or pure-TFIDF
    score exceeds 40 are kept, sorted by smooth-IDF-1 descending.

    Returns the sorted list of
    (user1, user2, multfs1, multfs2, multfs3, multfs4) tuples.

    NOTE(review): the original computed an unused ``tfidf`` matrix and
    unused ``num_pairs``/``num_features`` locals, and assigned lambdas
    (PEP 8 E731); those are cleaned up here with identical numerics.
    """
    def tf(x):
        # Row-wise term frequency: each row divided by its own sum.
        return (x.T / x.sum(axis=1)).T

    def idf(x):
        # Inverse document frequency over columns (features).
        return np.log(x.shape[0] / (x != 0).sum(axis=0))

    def idf_smooth(x):
        # Smoothed IDF variant 1: +1 in the denominator and on the log.
        return np.log(x.shape[0] / ((x != 0).sum(axis=0) + 1)) + 1

    def idf_smooth2(x):
        # Smoothed IDF variant 2: +1 inside the log.
        return np.log(1 + (x.shape[0] / ((x != 0).sum(axis=0))))

    def normalize(values):
        # Min-max normalization along axis 0.
        return (values - values.min(axis=0)) / (values.max(axis=0) -
                                                values.min(axis=0))

    multfs1 = normalize((tf(matrix) * idf(matrix)).sum(axis=1)) * 100
    multfs2 = normalize((matrix * idf(matrix)).sum(axis=1)) * 100
    multfs3 = normalize((matrix * idf_smooth(matrix)).sum(axis=1)) * 100
    multfs4 = normalize((matrix * idf_smooth2(matrix)).sum(axis=1)) * 100

    lst_res = []
    for i, (u1, u2) in enumerate(join_dict.keys()):
        # Keep pairs scoring above 40 on smooth-IDF-1 or pure TFIDF.
        if multfs3[i] > 40.0 or multfs1[i] > 40.0:
            lst_res.append(
                (u1, u2, multfs1[i], multfs2[i], multfs3[i], multfs4[i]))
    # Sort by the smooth-IDF-1 score (index 4), highest first.
    lst_res = sorted(lst_res, key=lambda x: x[4], reverse=True)
    gen_csv_from_tuples("multfs.csv", [
        "User 1", "User 2", "MultFS Pure TFIDF", "MultFS IDF",
        "MultFS Smooth IDF 1", "MultFS Smooth IDF 2"
    ], lst_res)
    return lst_res
def get_information_from_matrix(self, user_ind, sparse_matrix_dot):
    """For every user, find the other user(s) with the maximum relation
    value in ``sparse_matrix_dot`` and collect (user, best_match, value)
    triples; pickle and CSV-dump the sorted result.

    ``user_ind`` maps user keys to matrix row/column indices;
    ``sparse_matrix_dot`` is a square scipy sparse matrix of pairwise
    relation values (presumably a dot-product similarity — confirm with
    the caller).  Ties for the maximum are all reported.
    Returns the triples sorted by relation value, descending.
    """
    tic = time.time()
    lst_res = []
    # Invert the mapping so matrix indices can be turned back into users.
    inv_user_ind = {v: k for k, v in user_ind.items()}
    num_users = len(user_ind)
    #self.pprint("Transforming Matrix A", end='\r')
    #sparse_matrix_dot = sparse_matrix_dot.tocoo()
    #row, col, data = sparse_matrix_dot.row, sparse_matrix_dot.col, sparse_matrix_dot.data
    #self.pprint("[END] Transforming Matrix A")
    lst_res = []
    tx = sparse_matrix_dot.shape[0]
    print(sparse_matrix_dot.shape)
    for uind in range(tx):
        # Progress report every 100 users (carriage return keeps one line).
        if uind % 100 == 0:
            self.pprint("Info Extraction", "[%d Users Processed]" % (uind),
                        "[%d List Length]" % (len(lst_res)),
                        "[%0.3f Percentage]" % ((uind / tx) * 100),
                        get_ram(), get_elapsed_time(tic), end='\r')
        # Densify this user's row of relation values.
        row = np.array(sparse_matrix_dot[uind].toarray())
        row = row.flatten()
        row[uind] = 0  # We do not consider the comparison with itself
        rmax = row.max()
        if rmax > 0:
            # Number of users tied at the maximum relation value.
            n = (row == rmax).sum()
            #max_inds = row.argsort()[-n:][::-1]
            max_uinds = np.argpartition(
                row, -n
            )[-n:]  # it orders "-n" elements of the row, and then, it extracts the last n.
            for i in max_uinds:
                lst_res.append((inv_user_ind[uind], inv_user_ind[i], rmax))
    # Highest relation values first.
    lst_res = sorted(lst_res, key=lambda x: x[2], reverse=True)
    pickle_object(lst_res, self.dir + "results.pkl")
    self.pprint("[END] Info Extraction", "[%d Users Processed]" % (uind),
                "[%d List Length]" % (len(lst_res)),
                "[%0.3f Percentage]" % ((uind / tx) * 100), get_ram(),
                get_elapsed_time(tic))
    gen_csv_from_tuples(self.dir + "results.csv",
                        ["User1", "User2", "Relation Value"], lst_res)
    return lst_res