def louvain_community():
    """ Detect communities using the Louvain algorithm. """

    # Neo4j connection
    neo4j = neo4j_connect()

    # create in-memory graph
    with neo4j.session() as session:
        # check if graph already in memory
        graph_in_memory = session.run(
            "CALL gds.graph.exists('communityGraphSeeded') YIELD exists")

        # otherwise create it
        if not graph_in_memory.single()["exists"]:
            session.run(
                "CALL gds.graph.create.cypher('communityGraph', 'MATCH (n) RETURN id(n) AS id', 'MATCH (n)-[r:CO_OCCURRENCE]->(m) RETURN id(n) AS source, id(m) AS target')"
            )

            # set community IDs
            session.run(
                "CALL gds.louvain.write('communityGraph', { writeProperty: 'communityId'})"
            )

            # create seeded graph for future queries
            session.run(
                "CALL gds.graph.create.cypher('communityGraphSeeded', 'MATCH (n) RETURN id(n) AS id, n.communityId AS communityId', 'MATCH (n)-[r:CO_OCCURRENCE]->(m) RETURN id(n) AS source, id(m) AS target')"
            )

            # drop original graph
            session.run("CALL gds.graph.drop('communityGraph')")

        # set community IDs, seeding Louvain with the previous assignment
        # so community labels stay stable across runs
        session.run(
            "CALL gds.louvain.write('communityGraphSeeded', { writeProperty: 'communityId', seedProperty: 'communityId'})"
        )
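
# A minimal sketch of how the community assignment written above might be
# inspected. `get_largest_communities` is a hypothetical helper, not part of
# the original module; it only assumes the `communityId` node property that
# `louvain_community` writes.
def get_largest_communities(limit=10):
    """ Return (communityId, size) pairs for the largest communities. """
    neo4j = neo4j_connect()
    with neo4j.session() as session:
        result = session.run(
            "MATCH (n) WHERE n.communityId IS NOT NULL "
            "RETURN n.communityId AS communityId, COUNT(n) AS size "
            "ORDER BY size DESC LIMIT $limit",
            limit=limit,
        )
        return [(r["communityId"], r["size"]) for r in result]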
def betweenness_centrality():
    """ Calculate betweenness centrality for every node in the network. """

    # Neo4j connection
    neo4j = neo4j_connect()

    with neo4j.session() as session:
        # create in-memory graph
        session.run(
            "CALL gds.graph.create.cypher('betweennessGraph', 'MATCH (n) RETURN id(n) AS id', 'MATCH (n)-[]->(m) RETURN id(n) AS source, id(m) AS target')"
        )

        # calculate betweenness centrality and write it to each node
        session.run(
            "CALL gds.betweenness.write('betweennessGraph', { writeProperty: 'betweennessCentrality'}) YIELD minimumScore, maximumScore, createMillis, computeMillis, writeMillis"
        )

        # drop graph
        session.run("CALL gds.graph.drop('betweennessGraph')")
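
# A minimal sketch of how the scores written above might be queried.
# `get_most_central_nodes` is a hypothetical helper, not part of the original
# module; it only assumes the `betweennessCentrality` property that
# `betweenness_centrality` writes and the `nodeId` property used elsewhere
# in this module.
def get_most_central_nodes(limit=10):
    """ Return (nodeId, score) pairs for the most central nodes. """
    neo4j = neo4j_connect()
    with neo4j.session() as session:
        result = session.run(
            "MATCH (n) WHERE n.betweennessCentrality IS NOT NULL "
            "RETURN n.nodeId AS nodeId, n.betweennessCentrality AS score "
            "ORDER BY score DESC LIMIT $limit",
            limit=limit,
        )
        return [(r["nodeId"], r["score"]) for r in result]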
    world_map,
    named_entity_recognition,
    calculate_centrality,
) = parser()

if __name__ == "__main__":
    # logging
    logging.basicConfig(
        filename=os.path.join(config("LOG_DIR"), "analytics.log"),
        format=config("LOG_FORMAT"),
        level=config("LOG_LEVEL"),
        datefmt=config("LOG_DATEFMT"),
    )

    # Neo4j connection
    neo4j = neo4j_connect()

    ###############################################################################################
    # Statistics

    if statistics:
        with neo4j.session() as session:
            statistics = []

            # get statistics
            result = session.run(
                "MATCH (n) RETURN COUNT(DISTINCT n) AS nodes_count")
            statistics.append(
                ["Nodes", next(iter(result))["nodes_count"]])

            result = session.run(
def evaluate_information_gain():
    """ Evaluate gain in information. """

    # Neo4j connection
    neo4j = neo4j_connect()

    # get data
    with neo4j.session() as session:
        # get usernames connecting accounts
        connecting_usernames = session.run(
            "MATCH (k:UserAccount)-[:INCLUSION]-(l:Username)-[:INCLUSION]-(m:UserAccount) WHERE k.platform='vk' AND m.platform='twitter' RETURN l.username AS username"
        )
        connecting_usernames = [x["username"] for x in connecting_usernames]

        # save statistics
        node_types = [
            "Username", "Location", "Person", "Organization", "Phone", "Domain"
        ]
        connected_atoms_vk_total = {t: [] for t in node_types}
        connected_atoms_twitter_total = {t: [] for t in node_types}
        sigma = {t: [] for t in node_types + ["total"]}

        for u in tqdm(connecting_usernames,
                      desc="usernames",
                      total=len(connecting_usernames),
                      unit="records"):
            # get vk data (the username literal is masked in the source;
            # assume the query filters on the current username u)
            query = (
                "MATCH (k)-[:INCLUSION]-(l:UserAccount)-[:INCLUSION]-(m:Username) WHERE l.platform='vk' AND m.username=$username RETURN k.nodeId AS nodeId, LABELS(k)[0] AS label"
            )
            connecting_atoms_vk = session.run(query, username=u)
            data_vk = {t: [] for t in node_types}
            for x in connecting_atoms_vk:
                data_vk[x["label"]].append(x["nodeId"])

            # get twitter data
            query = (
                "MATCH (k)-[:INCLUSION]-(l:UserAccount)-[:INCLUSION]-(m:Username) WHERE l.platform='twitter' AND m.username=$username RETURN k.nodeId AS nodeId, LABELS(k)[0] AS label"
            )
            connecting_atoms_twitter = session.run(query, username=u)
            data_twitter = {t: [] for t in node_types}
            for x in connecting_atoms_twitter:
                data_twitter[x["label"]].append(x["nodeId"])

            # store statistics
            for k, v in data_vk.items():
                connected_atoms_vk_total[k].append(len(v))
            for k, v in data_twitter.items():
                connected_atoms_twitter_total[k].append(len(v))

            vk_atom_set_total = set()
            twitter_atom_set_total = set()
            for node_type in connected_atoms_vk_total.keys():
                # intersection/union of atom sets
                vk_set = set(data_vk[node_type])
                twitter_set = set(data_twitter[node_type])
                intersection = vk_set.intersection(twitter_set)
                union = vk_set.union(twitter_set)

                # store
                vk_atom_set_total = vk_atom_set_total.union(vk_set)
                twitter_atom_set_total = twitter_atom_set_total.union(
                    twitter_set)

                # node type specific sigma
                if len(union) > 0:
                    sigma[node_type].append(len(intersection) / len(union))

            intersection_total = vk_atom_set_total.intersection(
                twitter_atom_set_total)
            union_total = vk_atom_set_total.union(twitter_atom_set_total)
            # guard against empty unions to avoid division by zero
            if len(union_total) > 0:
                sigma["total"].append(
                    len(intersection_total) / len(union_total))

        # store results
        results = []
        for node_type in connected_atoms_vk_total.keys():
            vk_mean = (np.mean(connected_atoms_vk_total[node_type])
                       if len(connected_atoms_vk_total[node_type]) > 0 else 0)
            twitter_mean = (
                np.mean(connected_atoms_twitter_total[node_type])
                if len(connected_atoms_twitter_total[node_type]) > 0 else 0)
            results.append([
                node_type,
                np.round(vk_mean, 2),
                np.round(twitter_mean, 2),
                np.round(np.mean(sigma[node_type]), 2),
            ])
        results.append(
            ["sigma total", np.round(np.mean(sigma["total"]), 2), "", ""])

        table_plotter(
            data=results,
            filename="information_overlap_vk_twitter_user_accounts.png",
            title="Information overlap of VK and Twitter user accounts",
            param_plot={"figsize": (9, 12)},
            param_ax={
                "colLabels":
                ["", "$\\bf{VK}$", "$\\bf{Twitter}$", "$\\sigma_e$"],
                "cellLoc": "left",
                "colLoc": "left",
                "loc": "upper center",
                "edges": "horizontal",
                "colWidths": [0.25, 0.25, 0.25, 0.25],
            },
        )