def cluster_test(file_path):
    '''
    community_multilevel and community_infomap cannot produce two clusters;
    only community_leading_eigenvector and community_fastgreedy can.
    Label propagation seeded with the eigenvector membership has higher
    modularity than community_fastgreedy or community_leading_eigenvector alone.
    Rand Index: 0.884, 0.974, 0.929 for the communication network;
    0.807, 0.915, 0.864 for the retweet network.
    This method is discarded.
    '''
    g = gt.Graph.Read_GraphML(file_path)
    gt.summary(g)
    # g = g.as_undirected(combine_edges=dict(weight="sum"))
    # g = gt.giant_component(g)
    # ---------treated as directed network
    separations = []
    # modularity = []
    sizes = []
    for i in xrange(100):
        # eigen = g.community_leading_eigenvector(clusters=2, weights='weight')
        # label_pro = g.community_label_propagation(weights='weight', initial=eigen.membership)
        print i
        com = g.community_infomap(edge_weights='weight', vertex_weights='weight')
        separations.append(com.membership)
        # modularity.append(com.modularity)
        print len(com)
        sizes.append(len(com))
    print '%.3f, %.3f, %.3f, %.3f' % (min(sizes), max(sizes), np.mean(sizes), np.std(sizes))
    aRI = []
    for i in xrange(100):
        for j in xrange(i + 1, 100):
            aRI.append(metrics.adjusted_rand_score(separations[i], separations[j]))
    print len(aRI)
    # print '%.3f, %.3f, %.3f, %.3f' % (min(modularity), max(modularity), np.mean(modularity), np.std(modularity))
    print '%.3f, %.3f, %.3f, %.3f' % (min(aRI), max(aRI), np.mean(aRI), np.std(aRI))

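# A minimal, self-contained sketch (not part of the original pipeline) of the
# pairwise adjusted-Rand-index stability check used in cluster_test() above.
# The toy memberships are invented for illustration; `metrics` is
# sklearn.metrics as used elsewhere in this module.
def _ari_stability_demo():
    runs = [[0, 0, 1, 1], [0, 0, 1, 1], [0, 1, 1, 1]]
    scores = [metrics.adjusted_rand_score(runs[i], runs[j])
              for i in xrange(len(runs))
              for j in xrange(i + 1, len(runs))]
    # 1.0 for identical partitions; lower values signal unstable clustering
    print '%.3f, %.3f' % (min(scores), max(scores))
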
def test_significant(file_path):
    # Randomly rewire the edges (preserving degrees and the weight multiset)
    # and test the segregation of the network
    g = gt.Graph.Read_GraphML(file_path)
    gt.summary(g)
    g = g.as_undirected(combine_edges=dict(weight="sum"))
    g = gt.giant_component(g)
    gt.summary(g)
    # print g.es['weight']
    fast = g.community_fastgreedy(weights='weight')
    fast_com = fast.as_clustering(n=2)
    orig_mod = fast_com.modularity
    mod_list = []
    for i in xrange(1000):
        weights = g.es["weight"]
        g.rewire()
        g.es["weight"] = weights
        # gt.net_stat(g)
        # print g.es['weight']
        fast = g.community_fastgreedy(weights='weight')
        fast_com = fast.as_clustering()
        mod_list.append(fast_com.modularity)
    amean, astd = np.mean(mod_list), np.std(mod_list)
    print 'simulated values: %.3f +- (%.3f)' % (amean, astd)
    # absobserved = abs(raw_assort)
    # pval = (np.sum(ass_list >= absobserved) +
    #         np.sum(ass_list <= -absobserved))/float(len(ass_list))
    zscore = (orig_mod - amean) / astd
    print 'z-score: %.3f' % zscore

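# Hedged sketch of the null-model logic in test_significant(): igraph's
# rewire() shuffles edges while preserving degrees but drops edge attributes,
# so the weights are saved and reattached. Assumes `g` is an undirected
# weighted igraph Graph; the helper name is ours, not the project's.
def _rewire_zscore_demo(g, n=100):
    observed = g.community_fastgreedy(
        weights='weight').as_clustering(n=2).modularity
    null = []
    for _ in xrange(n):
        weights = g.es['weight']  # save weights; rewire() discards them
        g.rewire()                # degree-preserving topology shuffle
        g.es['weight'] = weights  # reattach (arbitrarily) to the new edges
        null.append(g.community_fastgreedy(
            weights='weight').as_clustering().modularity)
    return (observed - np.mean(null)) / np.std(null)
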
def network_pro_hashtags():
    # Extract interaction networks from pro-ED and pro-recovery hashtagged tweets.
    # Select only recovery users who have hashtags from ED hashtag topics.
    # rec_tag_users = set(iot.get_values_one_field('fed', 'tag_com', 'id', {'rec_tageted': True}))
    # ped_tag_users = set(iot.get_values_one_field('fed', 'tag_com', 'id', {'ped_tageted': True}))
    # rec_tag_users = set(iot.get_values_one_field('fed', 'prorec_tag', 'user.id'))
    # ped_tag_users = set(iot.get_values_one_field('fed', 'proed_tag', 'user.id'))
    # fedusers = iot.get_values_one_field('fed', 'com', 'id')
    fedusers = pickle.load(open('fed-user-id-str.pick', 'r'))
    print len(fedusers)
    users = [int(uid) for uid in fedusers]
    # only_ped = ped_tag_users - rec_tag_users
    # only_rec = rec_tag_users - ped_tag_users
    # all_users = list(rec_tag_users.union(ped_tag_users))
    for btype in ['communication']:
        # gb = gt.load_beh_network('fed', 'bnet_ed_tag', btype)
        gb = gt.load_beh_network_subset(users, 'fed', 'bnet_ed_tag', btype)
        # for v in gb.vs:
        #     if int(v['name']) in only_ped:
        #         v['set'] = -1
        #     elif int(v['name']) in only_rec:
        #         v['set'] = 1
        #     else:
        #         v['set'] = 0
        gt.summary(gb)
        gb.write_graphml(btype + '-only-fed.graphml')

def network_change(dbname, comname, netname):
    # filter = {'liwc_anal.result.i': {'$exists': True}, 'new_liwc_anal.result.i': {'$exists': True}}
    # users = iot.get_values_one_field(dbname, comname, 'id', filter)
    # g1 = gt.load_network_subset(users, dbname, netname, {'scraped_times': 2})
    # g2 = gt.load_network_subset(users, dbname, netname, {'scraped_times': 131})
    # pickle.dump(g1, open('data/g1.pick', 'w'))
    # pickle.dump(g2, open('data/g2.pick', 'w'))
    g1 = pickle.load(open('data/g1.pick', 'r'))
    g2 = pickle.load(open('data/g2.pick', 'r'))
    # g1 = gt.load_network_subset(dbname, 'net', {'scraped_times': 2})
    # g2 = gt.load_network_subset(dbname, 'net', {'scraped_times': 131})
    gt.summary(g1)
    gt.summary(g2)
    gt.net_stat(g1)
    gt.net_stat(g2)
    # pt.pdf_plot_one_data(g1.indegree(), 'indegree', linear_bins=False, fit_start=1, fit_end=100)
    pt.plot_pdf_mul_data(
        [np.array(g1.indegree()) + 1, np.array(g2.indegree()) + 1],
        'indegree', ['b', 'r'], ['o', '^'], ['G1', 'G2'],
        linear_bins=False, central=False, fit=True, savefile='indegree.pdf')

def test_user_cluster_assign_stable():
    # Test how stable the final user clustering assignments (k=2) are
    core = gt.Graph.Read_GraphML('alled_tag_undir_filter.graphml')
    communication = gt.Graph.Read_GraphML(
        'communication-only-fed-filter.graphml')
    gt.summary(communication)
    communication = gt.giant_component(communication)
    gt.summary(communication)
    users = [(v['name']) for v in communication.vs]
    print len(users)
    # user_hashtag_vector('fed', 'ed_tag', users)
    separations = []
    for i in xrange(100):
        print i
        user_hashtag_profile(core, users)
        # data += user_cluster_hashtag()
        clusters, ids = user_cluster_hashtag()
        separations.append(clusters)
    aRI = []
    for i in xrange(100):
        for j in xrange(i + 1, 100):
            aRI.append(
                metrics.adjusted_rand_score(separations[i], separations[j]))
    print len(aRI)
    print '%.3f, %.3f, %.3f, %.3f' % (min(aRI), max(aRI), np.mean(aRI),
                                      np.std(aRI))

def pmi(g, filename=None):
    '''
    Calculate the PMI weight for edges
    :param g: co-occurrence graph with vertex and edge 'weight' attributes
    :param filename: basename for the output GraphML file
    :return: subgraph keeping only edges with positive PMI
    '''
    # print g.is_loop()
    vw_sum = sum(g.vs["weight"])
    for edge in g.es:
        source_vertex = g.vs[edge.source]
        target_vertex = g.vs[edge.target]
        ew = edge['weight']
        edge['pmi'] = np.log2(
            float(ew * vw_sum) /
            (source_vertex['weight'] * target_vertex['weight']))
    # pickle.dump(g, open('data/'+filename+'_pmi_tag.pick', 'w'))
    # g = pickle.load(open('data/'+filename+'_pmi_tag.pick', 'r'))
    # pdf(g.es['weight'])
    # plot_graph(g, 'ed-hashtag')
    gt.summary(g)
    g = g.subgraph_edges(g.es.select(pmi_gt=0))
    gt.summary(g)
    g.write_graphml(filename + '_pmi.graphml')
    # g.es['weight'] = g.es['pmi']
    return g

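# A numeric sketch of the PMI edge weight computed in pmi() above:
# pmi(u, v) = log2(w(u, v) * W / (w(u) * w(v))), where W is the sum of all
# vertex weights. The counts below are invented for illustration.
def _pmi_demo():
    ew, wu, wv, vw_sum = 8.0, 20.0, 40.0, 400.0
    print np.log2(ew * vw_sum / (wu * wv))  # 2.0: co-occurs 4x more than chance
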
def z_scores(filename):
    # Test the significance of the link weight between two nodes against a
    # degree-preserving null model
    g = gt.Graph.Read_GraphML(filename + '.graphml')
    gt.summary(g)
    ds = g.vs["weight"]
    dsum = sum(ds)
    if dsum % 2:  # Degree_Sequence needs an even degree sum
        g.vs[0]['weight'] += 1
        ds = g.vs["weight"]
    # distribution = {}
    # for i in xrange(1000):
    #     print i
    #     rg = gt.Graph.Degree_Sequence(ds)
    #     rg.es['weight'] = 1
    #     rg.vs['name'] = rg.degree()
    #     rg.simplify(combine_edges=sum)
    #     for edge in rg.es:
    #         source_vertex_name = rg.vs[edge.source]['name']
    #         target_vertex_name = rg.vs[edge.target]['name']
    #         ew = edge['weight']
    #         if source_vertex_name < target_vertex_name:
    #             key = (source_vertex_name, target_vertex_name)
    #         else:
    #             key = (target_vertex_name, source_vertex_name)
    #         dis = distribution.get(key, [])
    #         dis.append(ew)
    #         distribution[key] = dis
    # pickle.dump(distribution, open('dis-all.pick', 'w'))
    distribution = pickle.load(open('dis-all.pick', 'r'))
    for edge in g.es:
        source_vertex_id = edge.source
        target_vertex_id = edge.target
        source_vertex_name = g.vs[source_vertex_id]['weight']
        target_vertex_name = g.vs[target_vertex_id]['weight']
        if source_vertex_name < target_vertex_name:
            key = (source_vertex_name, target_vertex_name)
        else:
            key = (target_vertex_name, source_vertex_name)
        dis = distribution.get(key)
        dm = np.mean(dis)
        dst = np.std(dis)
        var = edge['weight'] - dm
        if dst == 0 and var == 0:
            zscore = 0
        else:
            zscore = var / dst
        edge['rWeight'] = zscore
        if zscore < 0 or zscore > 1.96:
            print g.vs[source_vertex_id]['name'], \
                g.vs[target_vertex_id]['name'], \
                key, edge['weight'], dm, dst, zscore
    g.write_graphml(filename + '_zscore.graphml')

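# Hedged sketch of the null model sampled in the commented block of z_scores():
# draw a random multigraph with a fixed (even-sum) degree sequence, then
# collapse parallel edges into integer weights. The toy sequence is invented.
def _degree_sequence_null_demo():
    rg = gt.Graph.Degree_Sequence([3, 3, 2, 2, 1, 1])
    rg.es['weight'] = 1
    rg.simplify(combine_edges=sum)  # parallel edges become weights
    print rg.ecount(), rg.es['weight']
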
def recover_proed_interaction():
    # Interaction network of pro-recovery and pro-ED users
    prorec = edrelatedcom.rec_user('fed', 'scom')
    proed = edrelatedcom.proed_users('fed', 'scom')
    btype_dic = {'retweet': [1], 'reply': [2], 'mention': [3],
                 'communication': [2, 3]}
    for btype in ['retweet', 'reply', 'mention']:
        cols = dbt.db_connect_col('fed', 'sbnet')
        name_map, edges = {}, {}
        for row in cols.find({'type': {'$in': btype_dic[btype]}},
                             no_cursor_timeout=True):
            n1 = str(row['id0'])
            n2 = str(row['id1'])
            if (n1 in prorec or n1 in proed) and n1 != n2:
                n1id = name_map.get(n1, len(name_map))
                name_map[n1] = n1id
                n2id = name_map.get(n2, len(name_map))
                name_map[n2] = n2id
                wt = edges.get((n1id, n2id), 0)
                edges[(n1id, n2id)] = wt + 1
        g = Graph(len(name_map), directed=True)
        g.vs["name"] = list(sorted(name_map, key=name_map.get))
        g.add_edges(edges.keys())
        g.es["weight"] = edges.values()
        g.vs["set"] = 0
        for v in g.vs:
            if v['name'] in prorec:
                v['set'] = 1
            elif v['name'] in proed:
                v['set'] = -1
        gt.summary(g)
        edges = g.es.select(weight_gt=3)
        edge_nodes = []
        for edge in edges:
            edge_nodes.append(g.vs[edge.source]['name'])
            edge_nodes.append(g.vs[edge.target]['name'])
        nodes = []
        for v in g.vs:
            if v['set'] == 1 or v['set'] == -1:
                nodes.append(v)
            elif v['name'] in edge_nodes:
                nodes.append(v)
        print 'Filtered nodes: %d' % len(nodes)
        g = g.subgraph(nodes)
        gt.summary(g)
        g.write_graphml('rec-proed-' + btype + '.graphml')

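# The graph-construction pattern used above, isolated as a toy sketch: map
# node names to contiguous ids, accumulate edge weights in a dict, then build
# a weighted directed igraph Graph in one shot. All data here is invented.
def _edge_dict_graph_demo():
    name_map, edges = {}, {}
    for n1, n2 in [('a', 'b'), ('a', 'b'), ('b', 'c')]:
        n1id = name_map.get(n1, len(name_map))
        name_map[n1] = n1id
        n2id = name_map.get(n2, len(name_map))
        name_map[n2] = n2id
        edges[(n1id, n2id)] = edges.get((n1id, n2id), 0) + 1
    g = Graph(len(name_map), directed=True)
    g.vs['name'] = list(sorted(name_map, key=name_map.get))  # names by id
    g.add_edges(edges.keys())
    g.es['weight'] = edges.values()  # same iteration order as keys()
    return g  # 3 vertices; a->b with weight 2, b->c with weight 1
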
def test_stable_infomap_kmean():
    # Test the stability of the whole process, from infomap clustering of
    # hashtags to k-means clustering of users
    import tag_network
    core = gt.Graph.Read_GraphML('alled_tag_undir_filter.graphml')
    communication = gt.Graph.Read_GraphML('communication-only-fed-filter.graphml')
    gt.summary(communication)
    communication = gt.giant_component(communication)
    gt.summary(communication)
    users = [(v['name']) for v in communication.vs]
    print len(users)
    tag_network.user_hashtag_profile(core, users)

def diversity_db(dbname, comname, behavior, netname):
    userlist = iot.get_values_one_field(dbname, comname, 'id',
                                        # {'timeline_count': {'$gt': 0}}
                                        )
    g = gt.load_beh_network_subset(userlist, dbname, netname, behavior)
    gt.summary(g)
    # g = bahavior_net(dbname, comname, netname, behavior)
    # pickle.dump(g, open('data/'+dbname+'_'+behavior+'.pick', 'w'))
    print dbname, behavior
    # g = pickle.load(open('data/' + dbname + '_' + behavior + '.pick', 'r'))
    return netstatis(dbname, behavior, g, [str(i) for i in userlist], comname)

def output_net_user_data(dbname, comname, netname):
    '''
    Output the social network (two-group) and users' ED states into local files
    '''
    g = gt.load_network(dbname, netname)
    gt.summary(g)
    com = dbt.db_connect_col(dbname, comname)
    for v in g.vs:
        user = com.find_one({'id': int(v['name'])})
        v['l'] = user['level']
        v['ed'] = profiles_check.check_ed(user)
    g.write_graphml(dbname + '-' + netname + '.graphml')

def fed_all_tag_topic(filepath='data/fed_tag_undir.graphml'):
    # Get topics of all hashtags posted by fed users.
    # Earlier results were obtained using hashtags with more than 3 tweets
    # and 3 users, then taking the giant component.
    g = gt.Graph.Read_GraphML(filepath)
    gt.summary(g)
    vs = g.vs(weight_gt=10, user_gt=10)
    g = g.subgraph(vs)
    gt.summary(g)
    # g = gt.giant_component(g)
    com = g.community_infomap(edge_weights='weight', vertex_weights='weight')
    comclus = com.subgraphs()
    print len(comclus)
    pickle.dump(comclus, open('data/fed_tag_undir.communities', 'w'))

def users_with_collected_friends(dbname, comname, netname):
    # Get network from random and younger datasets
    users = iot.get_values_one_field(dbname, comname, 'id', {'level': 1})
    # net = gt.load_network_subset(dbname, netname, {
    #     'user': {'$in': users}, 'follower': {'$in': users}
    # })
    # net.write_graphml(dbname+'-net.graphml')
    g = gt.Graph.Read_GraphML(dbname + '-net.graphml')
    gt.summary(g)
    g.vs['outk'] = g.indegree()
    nodes = g.vs.select(outk_gt=0)
    print len(nodes)
    user_ids = [int(v['name']) for v in nodes]
    print len(set(users).intersection(set(user_ids)))

def profile_cluster(filepath):
    # Cluster users based on word2vec embeddings of their profiles
    g = gt.Graph.Read_GraphML(filepath)
    gt.summary(g)
    g = g.as_undirected(combine_edges=dict(weight="sum"))
    components = g.clusters()
    g = components.giant()
    gt.summary(g)
    com = dbt.db_connect_col('fed', 'com')
    data = {}
    for uid in g.vs['name']:
        user = com.find_one({'id': int(uid)}, ['description'])
        profile = user['description']
        if profile:
            tokens = pc.tokenizer_stoprm(profile)
            data[uid] = tokens
    import gensim
    # dictionary = gensim.corpora.Dictionary(data.values())
    # dictionary.save('lda.dict')
    # corpus = [dictionary.doc2bow(text) for text in data.values()]
    # lda = gensim.LdaModel(corpus, num_topics=100, id2word=dictionary)
    word2vec = gensim.models.Word2Vec(data.values(), size=300, sg=1)
    X, y = [], []
    for node in g.vs:
        k = node['name']
        v = data.get(k, [])  # some users have empty profiles
        vect = np.zeros(300)
        count = 0
        for word in v:
            if word in word2vec:
                vect += word2vec[word]
                count += 1
        X.append(vect / max(count, 1))  # avoid dividing by zero
        y.append(k)
    X = np.asarray(X)
    print X.shape
    print X
    matrix = g.get_adjacency()
    # clustering = AgglomerativeClustering(connectivity=matrix._get_data())
    clustering = AgglomerativeClustering()
    clustering.fit(X)
    members = clustering.labels_
    comm = gt.VertexClustering(g, membership=members)
    layout = g.layout("fr")
    gt.plot(comm, layout=layout, vertex_size=5)

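# Hedged sketch of the profile embedding built in profile_cluster(): average
# the word2vec vectors of the tokens found in the vocabulary. The helper name
# and the max(count, 1) guard are ours; `word2vec` is a trained gensim model.
def _mean_word_vector_demo(word2vec, tokens, dim=300):
    vect, count = np.zeros(dim), 0
    for word in tokens:
        if word in word2vec:
            vect += word2vec[word]
            count += 1
    return vect / max(count, 1)  # zero vector if no token is in the vocabulary
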
def recover_proed_community():
    # Pro-recovery and pro-ED users, and the communities they link out to
    prorec = edrelatedcom.rec_user('fed', 'scom')
    proed = edrelatedcom.proed_users('fed', 'scom')
    cols = dbt.db_connect_col('fed', 'follownet')
    name_map, edges = {}, set()
    for row in cols.find({}, no_cursor_timeout=True):
        n1 = str(row['follower'])
        if n1 in prorec or n1 in proed:
            n2 = str(row['user'])
            n1id = name_map.get(n1, len(name_map))
            name_map[n1] = n1id
            n2id = name_map.get(n2, len(name_map))
            name_map[n2] = n2id
            edges.add((n1id, n2id))
    g = Graph(len(name_map), directed=True)
    g.vs["name"] = list(sorted(name_map, key=name_map.get))  # keys ordered by value
    g.add_edges(list(edges))
    g.es["weight"] = 1
    g.vs["set"] = 0
    for v in g.vs:
        if v['name'] in prorec:
            v['set'] = 1
        elif v['name'] in proed:
            v['set'] = -1
    gt.summary(g)
    g.vs['deg'] = g.indegree()
    nodes = []
    for v in g.vs:
        if v['set'] == 1 or v['set'] == -1:
            nodes.append(v)
        elif v['deg'] > 3:
            nodes.append(v)
    print 'Filtered nodes: %d' % len(nodes)
    g = g.subgraph(nodes)
    gt.summary(g)
    g.write_graphml('rec-proed-follow.graphml')

    # sbnet has extended all interactions posted by ED users
    edusers = set(g.vs['name'])
    for btype in ['retweet', 'reply', 'mention']:
        gb = gt.load_beh_network('fed', 'sbnet', btype)
        gt.summary(gb)
        nodes = [v for v in gb.vs if v['name'] in edusers]
        gb = gb.subgraph(nodes)
        for v in gb.vs:
            v['set'] = g.vs.find(name=v['name'])['set']
        gt.summary(gb)
        gb.write_graphml('rec-proed-' + btype + '.graphml')

def friendship_community(dbname, colname, label):
    # fg = gt.load_network(dbname, colname)
    # gt.summary(fg)
    # pickle.dump(fg, open('data/'+label+'-fg.pick', 'w'))
    fg = pickle.load(open('data/' + label + '-fg.pick', 'r'))
    # fgc = gt.giant_component(fg, 'WEAK')
    # gt.summary(fgc)
    # pickle.dump(fgc, open('data/'+label+'-fgc.pick', 'w'))
    # fcoms = gt.fast_community(fg)
    # pickle.dump(fcoms, open('data/'+label+'-fcom.pick', 'w'))
    fcoms = pickle.load(open('data/' + label + '-fcom.pick', 'r'))
    # gt.plot(fcoms, 'friend_comms_den.pdf', bbox=(1200, 900))
    fclus = fcoms.as_clustering()
    gt.summary(fclus)
    print fclus.recalculate_modularity()
    community_topic(fg, fclus, dbname, 'scom', 'stimeline')

def test_user_cluster_stable():
    # Test the stability of infomap and find the best k for k-means
    core = gt.Graph.Read_GraphML('data/alled_tag_undir_filter.graphml')
    communication = gt.Graph.Read_GraphML(
        'data/communication-only-fed-filter.graphml')
    gt.summary(communication)
    communication = gt.giant_component(communication)
    gt.summary(communication)
    users = [(v['name']) for v in communication.vs]
    print len(users)
    # user_hashtag_vector('fed', 'ed_tag', users)
    data = []
    for i in xrange(100):
        user_hashtag_profile(core, users, i)  ###### Run by python
        data += user_cluster_hashtag()
    df = pd.DataFrame(data, columns=['cluster', 'silhouette_avg'])
    df.to_csv('user-kmeans-hashtag.csv')

def communtiy_feature(dbname, typename):
    fg = ntt.loadnet(dbname, typename)
    fcoms = gt.fast_community(fg)
    pickle.dump(fcoms, open('data/' + dbname + typename + 'com.pick', 'w'))
    fcoms = pickle.load(open('data/' + dbname + typename + 'com.pick', 'r'))
    fclus = fcoms.as_clustering()
    gt.summary(fclus)
    """Compare differences of features across communities"""
    features = [
        'liwc_anal.result.i', 'liwc_anal.result.we',
        'liwc_anal.result.bio', 'liwc_anal.result.body',
        'liwc_anal.result.health', 'liwc_anal.result.posemo',
        'liwc_anal.result.negemo', 'liwc_anal.result.ingest',
        'liwc_anal.result.anx', 'liwc_anal.result.anger',
        'liwc_anal.result.sad'
    ]
    therh = 0.1 * fg.vcount()
    for feature in features:
        data = []
        for clu in fclus:
            if len(clu) > therh:
                ulist = set()
                for v in clu:
                    ulist.add(int(fg.vs[v]['name']))
                ulist = list(ulist)
                clu_values = iot.get_values_one_field(
                    dbname, 'com', feature, {'id': {'$in': ulist}})
                data.append(clu_values)
        plot.plot_config()
        for i in xrange(len(data)):
            sns.distplot(data[i], hist=False,
                         label=str(i) + ':' + str(len(data[i])))
        plt.xlabel(feature)
        plt.ylabel('PDF')
        # plt.show()
        plt.savefig(feature + typename + '_com.pdf')
        plt.clf()

def cluseter_nodes(btype='communication'):
    # Cluster users in networks
    g = gt.Graph.Read_GraphML('communication-only-fed-filter.graphml')
    g = gt.giant_component(g)
    gt.summary(g)
    clusters, ids = tn.user_cluster_hashtag('ed-' + btype + '.data')
    # ids = []
    # with open('ed-'+btype+'.data', 'r') as fo:
    #     for line in fo.readlines():
    #         ids.append(line.split(' ')[0])
    g.vs['cluster'] = -1
    for i in xrange(len(clusters)):
        v = g.vs.find(name=ids[i])
        v['cluster'] = clusters[i]
    g.write_graphml('communication-only-fed-filter-hashtag-cluster.graphml')

def count_existing_user(btype=''):
    # Count how many fed users appear in the network
    g = gt.Graph.Read_GraphML('pro-' + btype + '-hashtag.graphml')
    gt.summary(g)
    # users = iot.get_values_one_field('fed', 'com', 'id_str')
    # pickle.dump(users, open('fed-user-id.pick', 'w'))
    users = set(pickle.load(open('fed-user-id.pick', 'r')))
    print len(users)
    # nodes = g.vs.select(name_in=users)
    nodes = []
    count = 0
    for v in g.vs:
        if v['name'] in users:
            count += 1
            nodes.append(v)
    print float(count) / len(g.vs)
    g = g.subgraph(nodes)
    gt.summary(g)
    g.write_graphml('pro-' + btype + '-hashtag-fed.graphml')

def two_community(file_path):
    # Get two communities from the network
    g = gt.Graph.Read_GraphML(file_path)
    gt.summary(g)
    # g = g.as_undirected(combine_edges=dict(weight="sum"))
    g = gt.giant_component(g)
    gt.summary(g)
    # ml = g.community_multilevel(weights='weight', return_levels=True)
    # fast = g.community_fastgreedy(weights='weight')
    # fast_com = fast.as_clustering(n=2)
    # walk = g.community_walktrap(weights='weight')
    # walk_com = walk.as_clustering(n=2)
    infor = g.community_infomap(edge_weights='weight', vertex_weights=None,
                                trials=2)
    # eigen = g.community_leading_eigenvector(clusters=2, weights='weight')
    # label_pro = g.community_label_propagation(weights='weight', initial=eigen.membership)
    # betweet = g.community_edge_betweenness(weights='weight')
    # bet_com = betweet.as_clustering(n=2)
    g.vs['community'] = infor.membership
    g.write_graphml('com-' + file_path)
    return infor.subgraphs()

def tags_two_user_moduls():
    # Load network from Gephi output
    g = gt.Graph.Read_GraphML('communication-3-moduls.graphml')
    cluster0, cluster1, cluster2 = set(), set(), set()
    for v in g.vs:
        if v['Modularity Class'] == 0:
            cluster0.add(int(v['name']))
        elif v['Modularity Class'] == 1:
            cluster1.add(int(v['name']))
        elif v['Modularity Class'] == 2:
            cluster2.add(int(v['name']))
    g = gt.load_hashtag_coocurrent_network_undir('fed', 'ed_tag', list(cluster0))
    gt.summary(g)
    filename = 'communication_fed_cluster0'
    g.write_graphml(filename + '_tag_undir.graphml')
    g = gt.load_hashtag_coocurrent_network_undir('fed', 'ed_tag', list(cluster1))
    gt.summary(g)
    filename = 'communication_fed_cluster1'
    g.write_graphml(filename + '_tag_undir.graphml')
    g = gt.load_hashtag_coocurrent_network_undir('fed', 'ed_tag', list(cluster2))
    gt.summary(g)
    filename = 'communication_fed_cluster2'
    g.write_graphml(filename + '_tag_undir.graphml')

def ed_follow_net():
    # Construct the network of ED users and their followees
    g = gt.load_network('fed', 'follownet')
    g.vs['deg'] = g.indegree()
    users = set(iot.get_values_one_field('fed', 'scom', 'id'))
    nodes = []
    for v in g.vs:
        if int(v['name']) in users:
            nodes.append(v)
        elif v['deg'] > 5:
            nodes.append(v)
    print 'Filtered nodes: %d' % len(nodes)
    g = g.subgraph(nodes)
    gt.summary(g)
    g.write_graphml('ed-friend' + '.graphml')

    # sbnet has extended all interactions posted by ED users
    edusers = set(g.vs['name'])
    for btype in ['retweet', 'reply', 'mention']:
        gb = gt.load_beh_network('fed', 'sbnet', btype)
        gt.summary(gb)
        nodes = [v for v in gb.vs if v['name'] in edusers]
        gb = gb.subgraph(nodes)
        gt.summary(gb)
        gb.write_graphml('ed-' + btype + '-follow.graphml')

def user_statis():
    groups = [
        ('ED', 'fed', 'com',
         {'liwc_anal.result.WC': {'$exists': True}, 'level': 1}),
        ('RD', 'random', 'scom',
         {'liwc_anal.result.WC': {'$exists': True}, 'level': 1}),
        ('YG', 'younger', 'scom',
         {'liwc_anal.result.WC': {'$exists': True}, 'level': 1})
    ]
    data = []
    for tag, dbname, comname, filter_values in groups:
        com = dbt.db_connect_col(dbname, comname)
        network1 = gt.Graph.Read_GraphML(tag.lower() + '-net.graphml')
        gt.summary(network1)
        network1_gc = gt.giant_component(network1)
        gt.summary(network1_gc)
        users_time = iot.get_values_one_field(dbname, comname, 'id_str',
                                              filter_values)
        for uid in users_time:  # loop restored; the source was truncated here
            exist = True
            try:
                v = network1.vs.find(name=str(uid))
            except ValueError:
                exist = False
            if exist:
                friends = set(network1.successors(str(uid)))
                # the original function breaks off at this point

def tags_user_cluster(graph_file_path, filename):
    # Put tweets of the two clusters into two sets
    g = gt.Graph.Read_GraphML(graph_file_path)
    # g_mention = gt.Graph.Read_GraphML('ed-communication'+'-hashtag-only-fed-cluster.graphml')
    gt.summary(g)
    # gt.summary(g_mention)
    # for i in range(2):
    #     g = [g_retweet, g_mention][i]
    cluster0, cluster1, cluster2 = set(), set(), set()
    for v in g.vs:
        if v['cluster'] == 0:
            cluster0.add(int(v['name']))
        elif v['cluster'] == 1:
            cluster1.add(int(v['name']))
        elif v['cluster'] == -1:
            cluster2.add(int(v['name']))
    print 'cluster size:', len(cluster0)
    g = gt.load_hashtag_coocurrent_network_undir('fed', 'ed_tag', list(cluster0))
    gt.summary(g)
    # filename = ['ed_retweet', 'ed_communication'][i] + '_fed_cluster0'
    vs = g.vs(weight_gt=3, user_gt=3)
    g = g.subgraph(vs)
    gt.summary(g)
    g.write_graphml(filename + 'tag_undir_cluster0.graphml')
    print 'cluster size:', len(cluster1)
    g = gt.load_hashtag_coocurrent_network_undir('fed', 'ed_tag', list(cluster1))
    gt.summary(g)
    # filename = ['ed_retweet', 'ed_communication'][i] + '_fed_cluster1'
    vs = g.vs(weight_gt=3, user_gt=3)
    g = g.subgraph(vs)
    gt.summary(g)
    g.write_graphml(filename + 'tag_undir_cluster1.graphml')

def behavior_community(dbname, colname, label):
    # targed_list = set()
    # db = dbt.db_connect_no_auth('fed')
    # poi = db['com']
    # for user in poi.find({}, ['id']):
    #     targed_list.add(user['id'])
    # bg = gt.load_beh_network(dbname, colname)
    # gt.summary(bg)
    # pickle.dump(bg, open('data/'+label+'-bg.pick', 'w'))
    bg = pickle.load(open('data/' + label + '-bg.pick', 'r'))
    # bgc = gt.giant_component(bg, 'WEAK')
    # gt.summary(bgc)
    # pickle.dump(bgc, open('data/'+label+'-bgc.pick', 'w'))
    # bcoms = gt.fast_community(bg)
    # pickle.dump(bcoms, open('data/'+label+'-bcom.pick', 'w'))
    bcoms = pickle.load(open('data/' + label + '-bcom.pick', 'r'))
    # gt.plot(bcoms, 'commu_comms_den.pdf', bbox=(1200, 900))
    bclus = bcoms.as_clustering()
    gt.summary(bclus)
    print bclus.recalculate_modularity()
    community_topic(bg, bclus, dbname, 'scom', 'stimeline')

def ed_follow_community(file_path):
    # Inspect keywords of user profiles in different communities
    g = gt.Graph.Read_GraphML(file_path)
    gt.summary(g)
    g = g.as_undirected(combine_edges=dict(weight="sum"))
    components = g.clusters()
    g = components.giant()
    gt.summary(g)
    com = dbt.db_connect_col('fed', 'com')
    ml = g.community_fastgreedy(weights='weight').as_clustering()
    # ml = g.community_multilevel(weights='weight')
    common_words = []
    fdist_all = FreqDist()
    for cluster in ml:
        print len(cluster)
        fdist = FreqDist()
        for uid in cluster:
            user = com.find_one({'id': int(g.vs[uid]['name'])}, ['description'])
            profile = user['description']
            if profile:
                # text = ' '.join(pc.tokenizer_stoprm(profile))
                tokens = pc.tokenizer_stoprm(profile)
                for word in tokens:
                    fdist[word] += 1
                    fdist_all[word] += 1
        common_words.append(fdist)
    for fd in common_words:
        w_tfidf = []
        # print fd.most_common(20)
        for (word, freq) in fd.most_common(20):
            allfreq = fdist_all[word]
            # print word, freq, allfreq
            w_tfidf.append((word, float(freq) / allfreq))
        sortedlist = sorted(w_tfidf, key=lambda x: x[1], reverse=True)
        print sortedlist

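# A toy sketch of the keyword scoring in ed_follow_community(): a word's
# frequency inside one community divided by its frequency over all
# communities, so community-specific words score near 1.0. Data is invented.
def _freqdist_tfidf_demo():
    from nltk import FreqDist  # assumed import; the module uses FreqDist above
    fdist = FreqDist(['ana', 'ana', 'recovery'])
    fdist_all = FreqDist(['ana', 'ana', 'recovery', 'recovery', 'recovery'])
    for word, freq in fdist.most_common(2):
        print word, float(freq) / fdist_all[word]  # ana 1.0, recovery 0.33
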
def community(g=None):
    '''
    Detect communities in the co-occurrence network of hashtags.
    Uses infomap (multilevel is commented out) and only reports
    communities whose sizes exceed a threshold.
    :param g:
    :return: hash_com: {hashtag: community_index}
             com_size: {community_index: community_size}
    '''
    gt.summary(g)
    vs = g.vs(weight_gt=100, user_gt=10)
    g = g.subgraph(vs)
    g = g.subgraph_edges(g.es.select(rWeight_gt=0, rWeight_lt=float('Inf')))
    gt.summary(g)
    gc = gt.giant_component(g)
    gt.summary(gc)
    # g.write_graphml('fed_tag_undir_over3.graphml')
    # com = g.community_multilevel(weights='rWeight', return_levels=False)
    com = g.community_infomap(edge_weights='rWeight', vertex_weights=None)
    # com = louvain.find_partition(gc, method='Significance', weight=None)
    comclus = com.subgraphs()
    print 'Community stats: #communities, modularity', len(comclus), com.modularity
    index = 0
    nonsingle = 0
    hash_com = {}
    com_size = {}
    for comclu in comclus:
        print '---------- Community ', index, '-----------------'
        if comclu.vcount() > 1:
            nonsingle += 1
        tag_weight = {}
        for v in comclu.vs:
            if v['weight'] > 5:
                hash_com[v['name']] = index
                tag_weight[v['name']] = v['weight']
                count = com_size.get(index, 0)
                com_size[index] = v['weight'] + count
        sort_list = list(sorted(tag_weight, key=tag_weight.get, reverse=True))
        for key in sort_list:
            print key, tag_weight[key]
        print '-------------Community size: ', com_size.get(index, 0), '---------------------'
        print
        index += 1
    # print len(hash_com)
    # print len(set(hash_com.values()))
    # print set(hash_com.values())
    print '------------------all size:', sum(com_size.values()), '---------------------'
    print '------------------non single clusters:', nonsingle, '---------------------'
    return hash_com, com_size

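# Hedged usage sketch for community() above: hash_com maps each kept hashtag
# to its community index and com_size holds the summed vertex weights per
# community. `g` must carry the 'weight', 'user' and 'rWeight' attributes
# produced upstream (e.g. by z_scores()).
def _community_output_demo(g):
    hash_com, com_size = community(g)
    for tag in sorted(hash_com, key=hash_com.get):
        print tag, hash_com[tag], com_size[hash_com[tag]]
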
def tfidf_tag_cluster(btype='retweet'):
    # Calculate the TF-IDF-like scores of tags in the two clusters
    cluster0 = gt.Graph.Read_GraphML('ed_' + btype + '_fed_cluster0_tag_undir.graphml')
    cluster1 = gt.Graph.Read_GraphML('ed_' + btype + '_fed_cluster1_tag_undir.graphml')
    gt.summary(cluster0)
    vs = cluster0.vs(weight_gt=3, user_gt=3)
    cluster0 = cluster0.subgraph(vs)
    cluster0 = gt.giant_component(cluster0)
    gt.summary(cluster0)
    gt.summary(cluster1)
    vs = cluster1.vs(weight_gt=3, user_gt=3)
    cluster1 = cluster1.subgraph(vs)
    cluster1 = gt.giant_component(cluster1)
    gt.summary(cluster1)
    for v in cluster0.vs:
        exist = True
        count_ov = 0.0
        try:
            ov = cluster1.vs.find(name=v['name'])
        except ValueError:
            exist = False
        if exist:
            count_ov = ov['weight']
        v['tfidf'] = float(v['weight']) / (v['weight'] + count_ov)
    for v in cluster1.vs:
        exist = True
        count_ov = 0.0
        try:
            ov = cluster0.vs.find(name=v['name'])
        except ValueError:
            exist = False
        if exist:
            count_ov = ov['weight']
        v['tfidf'] = float(v['weight']) / (v['weight'] + count_ov)
    cluster0.write_graphml('ed_' + btype + '_fed_cluster0_tfidf_tag_undir.graphml')
    cluster1.write_graphml('ed_' + btype + '_fed_cluster1_tfidf_tag_undir.graphml')

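# The score in tfidf_tag_cluster() in one line of arithmetic (toy numbers):
# a tag used 30 times in this cluster and 10 times in the other scores
# 30 / (30 + 10) = 0.75 here, and 10 / (10 + 30) = 0.25 there.
def _cluster_tfidf_demo():
    print float(30) / (30 + 10), float(10) / (10 + 30)
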
def remove_spam(btype):
    # Remove nodes that have high outdegree but little indegree
    g = gt.Graph.Read_GraphML('ed-' + btype + '-hashtag.graphml')
    gt.summary(g)
    # float to avoid integer division under Python 2
    g.vs['ratio'] = (np.array(g.outdegree()) + 1.0) / (np.array(g.indegree()) + 1)
    gt.summary(g)
    maxv = np.percentile(g.vs['ratio'], 97.5)
    print maxv
    nodes = g.vs.select(ratio_lt=maxv)
    g = g.subgraph(nodes)
    gt.summary(g)
    g.write_graphml('ed-' + btype + '-hashtag-rmspam.graphml')

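# Hedged sketch of the spam cutoff in remove_spam(): nodes whose
# (out+1)/(in+1) degree ratio lies above the 97.5th percentile are treated
# as broadcasters and dropped. The ratios below are invented.
def _percentile_cutoff_demo():
    ratios = np.array([0.5, 1.0, 1.2, 0.9, 25.0])
    maxv = np.percentile(ratios, 97.5)
    print maxv, ratios[ratios < maxv]  # only the extreme broadcaster is removed
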
def friend_dis(dbname, comname, netname, targets):
    # The list returned by Graph.neighborhood always includes the input
    # vertex, while those from predecessors and successors don't, so the
    # list from neighborhood is always larger by one.
    db = dbt.db_connect_no_auth(dbname)
    com = db[comname]
    g = gt.load_network(dbname, netname)
    gt.add_attributes(g, ['followers_count', 'friends_count'],
                      dbname, comname, ['followers_count', 'friends_count'])
    gt.summary(g)
    for user in com.find({}, ['id', 'net_anal']):
        uid = user['id']
        values = user.get('net_anal', {'mined': True})
        node_exist = True
        try:
            v = g.vs.find(name=str(uid))
        except ValueError:
            node_exist = False
        if node_exist:
            # followers = g.neighborhood(str(uid), mode='out')
            # followings = g.neighborhood(str(uid), mode='in')
            followers = g.successors(str(uid))
            followings = g.predecessors(str(uid))
            follower_set = set(int(name) for name in g.vs[followers]['name'])
            following_set = set(int(name) for name in g.vs[followings]['name'])
            ed_follower = len(targets & follower_set)
            ed_following = len(targets & following_set)
            # friend_set = follower_set | following_set
            follower = v['followers_count']
            if follower == 0:
                follower = 1
            following = v['friends_count']
            if following == 0:
                following = 1
            # friend = len(friend_set)
            # if friend == 0:
            #     friend = 1
            # ed_friend = len(targets & friend_set)
            ed_follower_p = float(ed_follower) / follower
            ed_following_p = float(ed_following) / following
            # ed_friend_p = float(ed_friend)/friend
            net_sta = {}
            # net_sta['follower_no'] = follower
            # net_sta['following_no'] = following
            # net_sta['friend_no'] = friend
            net_sta['ed_follower_no'] = ed_follower
            net_sta['ed_following_no'] = ed_following
            # net_sta['ed_friend_no'] = ed_friend
            net_sta['ed_follower_p'] = ed_follower_p
            net_sta['ed_following_p'] = ed_following_p
            # net_sta['ed_friend_p'] = ed_friend_p
            net_sta['non_ed_follower_p'] = 1 - ed_follower_p
            net_sta['non_ed_following_p'] = 1 - ed_following_p
            # net_sta['non_ed_friend_p'] = 1 - ed_friend_p
            values['ed_proportion'] = net_sta
            com.update_one({'id': uid}, {'$set': {'net_anal': values}},
                           upsert=True)

def variable_change(dbname, comname, oldtimename, newtimename):
    db = dbt.db_connect_no_auth(dbname)
    com = db[comname]
    oldtime = db[oldtimename]
    newtime = db[newtimename]
    oldfollower, newfollower, oldfollowee, newfollowee, users, liwcs, olddate, newdate, \
        oldcw, newcw, oldgw, newgw, oldage, newage, newcbmi, oldcbmi, newgbmi, oldgbmi = \
        [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], []
    # filter = {'liwc_anal.result.i': {'$exists': True}, 'new_liwc_anal.result.i': {'$exists': True}}
    filter = {'$or': [{'liwc_anal.result.i': {'$exists': True}},
                      {'new_liwc_anal.result.i': {'$exists': True}}]}
    # full analysis variables:
    # meta_keys = ['WC', 'WPS', 'Sixltr', 'Dic']
    # category_keys = ['funct', 'pronoun', 'ppron', 'i', 'we', 'you', 'shehe',
    #     'they', 'ipron', 'article', 'verb', 'auxverb', 'past', 'present', 'future',
    #     'adverb', 'preps', 'conj', 'negate', 'quant', 'number', 'swear', 'social',
    #     'family', 'friend', 'humans', 'affect', 'posemo', 'negemo', 'anx', 'anger',
    #     'sad', 'cogmech', 'insight', 'cause', 'discrep', 'tentat', 'certain',
    #     'inhib', 'incl', 'excl', 'percept', 'see', 'hear', 'feel', 'bio', 'body',
    #     'health', 'sexual', 'ingest', 'relativ', 'motion', 'space', 'time', 'work',
    #     'achieve', 'leisure', 'home', 'money', 'relig', 'death', 'assent', 'nonfl',
    #     'filler']
    # puncuation_keys = ['Period', 'Comma', 'Colon', 'SemiC', 'QMark', 'Exclam',
    #     'Dash', 'Quote', 'Apostro', 'Parenth', 'OtherP', 'AllPct']
    # allcates = meta_keys + category_keys + puncuation_keys
    allcates = ['posemo', 'negemo', 'anx', 'anger', 'sad']
    for user in com.find(filter):
        users.append(user['id'])
        # print user['id']
        """LIWC variables"""
        oldliwc = user['liwc_anal']['result']
        newliwc = user['new_liwc_anal']['result']
        if newliwc is None:
            newliwc = {}
        if oldliwc is None:
            oldliwc = {}
        ols = [oldliwc.get(key, None) for key in allcates]
        nls = [newliwc.get(key, None) for key in allcates]
        liwcs.append(ols + nls)
        '''Follower and followee variables'''
        # oldtweet = time.find({'user.id': user['id']}, no_cursor_timeout=True).sort([('id', 1)]).limit(1)[0]
        oldtweets = oldtime.find({'user.id': user['id']},
                                 no_cursor_timeout=True).sort([('id', -1)]).limit(1)
        if oldtweets.count() == 0:
            oldtweets = newtime.find({'user.id': user['id']},
                                     no_cursor_timeout=True).sort([('id', 1)]).limit(1)
        oldtweet = oldtweets[0]
        oldprofile = oldtweet['user']
        newtweets = newtime.find({'user.id': user['id']},
                                 no_cursor_timeout=True).sort([('id', -1)]).limit(1)
        if newtweets.count() == 0:
            newtweet = oldtweet
            newprofile = oldprofile
        else:
            newtweet = newtweets[0]
            newprofile = newtweet['user']
        olddate.append(oldtweet['created_at'])
        newdate.append(newtweet['created_at'])
        newbio = des_miner.process_text(newprofile['description'])
        oldbio = des_miner.process_text(oldprofile['description'])
        oldcw.append(oldbio.get('cw', {}).get('value', None))
        newcw.append(newbio.get('cw', {}).get('value', None))
        oldgw.append(oldbio.get('gw', {}).get('value', None))
        newgw.append(newbio.get('gw', {}).get('value', None))
        oldage.append(oldbio.get('a', {}).get('value', None))
        newage.append(newbio.get('a', {}).get('value', None))
        oldcbmi.append(oldbio.get('cbmi', {}).get('value', None))
        newcbmi.append(newbio.get('cbmi', {}).get('value', None))
        oldgbmi.append(oldbio.get('gbmi', {}).get('value', None))
        newgbmi.append(newbio.get('gbmi', {}).get('value', None))
        oldfollower.append(oldprofile['followers_count'])
        newfollower.append(newprofile['followers_count'])
        oldfollowee.append(oldprofile['friends_count'])
        newfollowee.append(newprofile['friends_count'])

    """Output profile variables"""
    print len(liwcs)
    oldliwccol = ['Old' + key for key in allcates]
    newliwccol = ['New' + key for key in allcates]
    df = pd.DataFrame(data=liwcs, columns=oldliwccol + newliwccol)
    df['UserID'] = users
    df['OldFollower'] = oldfollower
    df['NewFollower'] = newfollower
    df['OldFollowee'] = oldfollowee
    df['NewFollowee'] = newfollowee
    df['OldDate'] = olddate
    df['NewDate'] = newdate
    df['OldCW'] = oldcw
    df['NewCW'] = newcw
    df['OldGW'] = oldgw
    df['NewGW'] = newgw
    df['OldAge'] = oldage
    df['NewAge'] = newage
    df['OldCBMI'] = oldcbmi
    df['NewCBMI'] = newcbmi
    df['OldGBMI'] = oldgbmi
    df['NewGBMI'] = newgbmi
    g1 = gt.load_network_subset(dbname, 'net', {'scraped_times': 2})
    g2 = gt.load_network_subset(dbname, 'net', {'scraped_times': 130})
    gt.summary(g1)
    gt.summary(g2)
    oldindegree_map = dict(zip(g1.vs['name'], g1.indegree()))
    oldoutdegree_map = dict(zip(g1.vs['name'], g1.outdegree()))
    oldpagerank_map = dict(zip(g1.vs['name'], g1.pagerank()))
    oldbetweenness_map = dict(zip(g1.vs['name'], g1.betweenness()))
    newindegree_map = dict(zip(g2.vs['name'], g2.indegree()))
    newoutdegree_map = dict(zip(g2.vs['name'], g2.outdegree()))
    newpagerank_map = dict(zip(g2.vs['name'], g2.pagerank()))
    newbetweenness_map = dict(zip(g2.vs['name'], g2.betweenness()))
    df['OldIndegree'] = [oldindegree_map.get(str(uid), 0) for uid in users]
    df['NewIndegree'] = [newindegree_map.get(str(uid), 0) for uid in users]
    df['OldOutdegree'] = [oldoutdegree_map.get(str(uid), 0) for uid in users]
    df['NewOutdegree'] = [newoutdegree_map.get(str(uid), 0) for uid in users]
    df['OldPagerank'] = [oldpagerank_map.get(str(uid), 0.0) for uid in users]
    df['NewPagerank'] = [newpagerank_map.get(str(uid), 0.0) for uid in users]
    df['OldBetweenness'] = [oldbetweenness_map.get(str(uid), 0.0) for uid in users]
    df['NewBetweenness'] = [newbetweenness_map.get(str(uid), 0.0) for uid in users]
    df.to_csv(dbname + '.csv')