/
group2vec2.py
239 lines (205 loc) · 10 KB
/
group2vec2.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
from __future__ import division
from collections import defaultdict, Counter
import pynauty
import numpy as np
import networkx as nx
import random
from tqdm import tqdm
from multiprocessing import Pool
from gensim.models import Word2Vec
import cPickle as pickle
from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer
import argparse
from sklearn import preprocessing
from utils import load_network, load_groups, normalize
def parse_args():
    """Parse command-line arguments for the group embedding pipeline.

    Returns an ``argparse.Namespace``. Options suffixed ``_trans`` configure
    the DeepWalk stage that builds the transition matrices; the unsuffixed
    counterparts configure the group-corpus / group-embedding stage.
    """
    parser = argparse.ArgumentParser(description="Run CPNE.")
    # NOTE(review): accepted format values are defined by utils.load_network —
    # confirm against that helper.
    parser.add_argument('--network_format', default='adj_list', help='Input network file format.')
    parser.add_argument('--network', default='adj.txt', help='Input adjacency list path.')
    parser.add_argument('--groups', default='group.txt', help='Input group members path.')
    parser.add_argument('--group_embs', default='group_embs', help='Output group embeddings path.')
    parser.add_argument('--num_workers', type=int, default=10, help='Number of workers.')
    parser.add_argument('--num_walks', type=int, default=1000, help='Number of walks for generating the group corpus.')
    parser.add_argument('--num_walks_trans', type=int, default=100,
                        help='Number of walks for generating the transition matrices with DeepWalk.')
    parser.add_argument('--walk_length', type=int, default=100, help='Length of walks for generating the group corpus.')
    parser.add_argument('--walk_length_trans', type=int, default=100,
                        help='Length of walks for generating the transition matrices with DeepWalk.')
    parser.add_argument('--neg_cnt', type=int, default=5,
                        help='Number of negative instances for generating the group corpus.')
    parser.add_argument('--neg_cnt_trans', type=int, default=5,
                        help='Number of negative instances for generating the transition matrices with DeepWalk.')
    parser.add_argument('--dimension', type=int, default=128, help='Number of dimensions for group embedding.')
    parser.add_argument('--dimension_trans', type=int, default=128,
                        help='Number of dimensions for node embedding used to generating the transition matrices.')
    parser.add_argument('--window_size', type=int, default=3, help='Window size for group embedding.')
    parser.add_argument('--window_size_trans', type=int, default=3,
                        help='Window size for node embedding used to generating the transition matrices with DeepWalk.')
    parser.add_argument('--K', type=int, default=100, help='Top K nodes to save in A for each node.')
    parser.add_argument('--motif_size', type=int, default=8, help='Motif size.')
    parser.add_argument('--threshold', type=float, default=0.6, help='Threshold lambda to generating group corpus.')
    parser.add_argument('--learning_rate', type=float, default=0.1, help='Learning rate to learn group embedding.')
    parser.add_argument('--weighted', dest='weighted', action='store_true',
                        help='Boolean specifying (un)weighted for generating the transition matrices with DeepWalk. Default is weighted.')
    parser.add_argument('--unweighted', dest='unweighted', action='store_false')
    parser.set_defaults(weighted=True)
    parser.add_argument('--deepwalk', dest='deepwalk', action='store_true',
                        help='Generate transition matrix A with DeepWalk. Default is true.')
    parser.set_defaults(deepwalk=True)
    return parser.parse_args()
def generate_motif(motif_size):
    """Load the pickled canonical map for ``motif_size`` and index its motifs.

    Returns:
        cert2idx: nauty certificate -> contiguous 0-based motif index.
        idx2size: motif index -> number of nodes ('n') of that motif.

    Entries with stored idx 0 or 1 are skipped (degenerate cases), so every
    remaining index is shifted down by 2 to stay contiguous from 0.
    """
    cert2idx = {}
    idx2size = {}
    # 'with' guarantees the file handle is closed (the original leaked it).
    with open("canonical_maps/canonical_map_n%s.p" % motif_size, "rb") as fh:
        canonical_map = pickle.load(fh)
    # .items() behaves like .iteritems() here and is portable.
    for canonical, values in canonical_map.items():
        if values['idx'] > 1:  # rm the case idx == 0 and idx == 1
            cert2idx[canonical] = values['idx'] - 2
            idx2size[values['idx'] - 2] = values['n']
    return cert2idx, idx2size
def generate_transition_matrices():
    """Build all transition tables used by the mixed motif/group walker.

    Returns six dicts: motif->motif neighbours/weights (learned from motif
    walks), then motif->group and group->motif neighbours/weights (from
    TF-IDF over the per-group motif documents).
    """
    motif_corpus, group_docs = sample_motifs()
    nbr_mm, w_mm = generate_A(motif_corpus)
    nbr_mg, w_mg, nbr_gm, w_gm = generate_B(group_docs)
    return nbr_mm, w_mm, nbr_mg, w_mg, nbr_gm, w_gm
def sample_motifs():
    """Run parallel motif walks over every group's adjacency matrix.

    Returns:
        corpus: shuffled list of motif-index-string sequences, one per walk.
        group_docs: one space-joined document per group (ordered by group
            index) containing every motif seen in that group's walks.

    NOTE(review): reads module-level ``args``, ``num_motifs`` and
    ``group_ams`` set in the __main__ section.
    """
    # Group ids are offset by num_motifs so motifs and groups share one id space.
    jobs = [{'am': am, 'gidx': num_motifs + offset}
            for offset, am in enumerate(group_ams)]
    pool = Pool(args.num_workers)
    walks = []
    for _ in tqdm(range(args.num_walks_trans)):
        walks.extend(pool.map(motif_sampler, jobs))
    pool.close()
    pool.join()
    corpus = []
    per_group = defaultdict(list)
    for gidx, seq in walks:
        corpus.append(seq)
        per_group[gidx].extend(seq)
    group_docs = [' '.join(per_group[num_motifs + i]) for i in range(num_groups)]
    random.shuffle(corpus)
    return corpus, group_docs
def motif_sampler(group):
    """Sample one walk of motif indices from a single group.

    ``group`` is a dict with 'am' (dense adjacency matrix) and 'gidx'
    (group id). Starting from a random node subset of size motif_size, each
    step swaps one node for a fresh one and records the canonical motif
    index of the induced subgraph.

    Returns (gidx, list of walk_length_trans motif-index strings).
    """
    am = group['am']
    gidx = group['gidx']
    size = args.motif_size
    nodes = np.random.permutation(range(len(am)))[:size]
    seq = []
    while True:
        sub_am = am[np.ix_(nodes, nodes)]
        seq.append(str(cert2idx[get_motif(sub_am, size)]))
        if len(seq) >= args.walk_length_trans:
            break
        nodes = get_next_motif_nodes(am, list(nodes))
    return (gidx, seq)
def get_next_motif_nodes(am, nodes):
    """Return ``nodes`` with one member replaced by a node not yet in it.

    Picks a uniformly random node of ``am`` outside ``nodes`` (rejection
    sampling), removes a uniformly random current member, and appends the
    new node. Mutates and returns the same list.

    If every node of the graph is already in ``nodes`` there is nothing to
    swap in; return unchanged instead of spinning forever (the original
    looped infinitely for groups smaller than the motif size).
    """
    total = len(am)
    if len(nodes) >= total:
        return nodes
    new_node = random.choice(range(total))
    while new_node in nodes:
        new_node = random.choice(range(total))
    node = random.choice(nodes)
    nodes.remove(node)
    nodes.append(new_node)
    return nodes
def get_motif(motif_am, size):
    """Return the nauty canonical certificate of the motif whose dense
    adjacency matrix is ``motif_am`` (``size`` vertices, undirected).

    Diagonal entries (self-loops) are dropped from the adjacency dict.
    """
    adjacency = {}
    for vertex, row in enumerate(motif_am):
        adjacency[vertex] = [nbr for nbr in list(np.where(row)[0]) if nbr != vertex]
    graph = pynauty.Graph(number_of_vertices=size, directed=False,
                          adjacency_dict=adjacency)
    return pynauty.certificate(graph)
def generate_A(corpus):
    """Learn motif embeddings from the walk corpus and derive, per motif,
    its top-K most similar motifs.

    Only neighbours with positive cosine similarity are kept; their
    similarities are normalised into transition probabilities.

    Returns (neighbors_motif_motif, weights_motif_motif) keyed by int motif id.
    """
    vocab_counter = Counter()
    for walk in corpus:
        vocab_counter.update(walk)
    model = Word2Vec(corpus, size=args.dimension_trans,
                     window=args.window_size_trans, min_count=0,
                     workers=args.num_workers)
    nbrs = {}
    probs = {}
    for token in vocab_counter:
        midx = int(token)
        candidates, sims = zip(*model.wv.most_similar(positive=str(midx), topn=args.K))
        candidates = np.asarray(candidates)
        sims = np.asarray(sims)
        keep = np.ix_(np.where(sims > 0)[0])  # drop non-positive similarities
        nbrs[midx] = [int(c) for c in candidates[keep]]
        probs[midx] = normalize(sims[keep])
    return nbrs, probs
def generate_B(groups):
    """Build the motif<->group transition tables from TF-IDF weights.

    ``groups`` is a list of space-separated motif-id documents, one per
    group. Group nodes are identified as ``num_motifs + group_index`` so
    they share one id space with motifs (reads module-level ``num_groups``
    and ``num_motifs``).

    Returns four dicts: per-motif neighbouring groups and normalised
    weights, and per-group neighbouring motifs and normalised weights.
    """
    neighbors_motif_group = {}
    weights_motif_group = {}
    neighbors_group_motif = {}
    weights_group_motif = {}
    # \b\w+\b keeps single-character tokens (sklearn's default pattern drops
    # them), which matters because motif ids like "3" are one character.
    vectorizer = CountVectorizer(token_pattern=r'(?u)\b\w+\b')
    transformer = TfidfTransformer()
    tfidf = transformer.fit_transform(vectorizer.fit_transform(groups))
    tfidf = tfidf.toarray()  # dense (num_groups, num_distinct_motifs)
    # word[j] is the motif-id string for tf-idf column j
    word = np.array(vectorizer.get_feature_names())
    for i in range(num_groups):
        # columns where group i has non-zero tf-idf: its motif neighbours
        idxs = np.ix_(np.where(tfidf[i] > 0)[0])
        weights_group_motif[i + num_motifs] = normalize(tfidf[i][idxs])
        neighbors_group_motif[i + num_motifs] = map(int, word[idxs])
    for i in range(len(word)):
        motif = int(word[i])
        # rows (groups) where motif i occurs with non-zero weight
        idxs = np.ix_(np.where(tfidf[:, i] > 0)[0])
        # idxs is a 1-tuple of row indices; shift them into group-id space
        neighbors_motif_group[motif] = list(idxs[0] + num_motifs)
        weights_motif_group[motif] = normalize(tfidf[idxs, i][0])
    return neighbors_motif_group, weights_motif_group, neighbors_group_motif, weights_group_motif
def generate_group_corpus():
    """Run mixed motif/group random walks in parallel (the group corpus).

    Every motif id and every group id is used as a start node; the walks
    are shuffled and returned as lists of string tokens.

    NOTE(review): ``.keys() + .keys()`` relies on Python 2 list-returning
    dict views, consistent with the rest of the file.
    """
    start_nodes = neighbors_motif_group.keys() + neighbors_group_motif.keys()
    pool = Pool(args.num_workers)
    walks = []
    for _ in tqdm(range(args.num_walks)):
        walks.extend(pool.map(walker, start_nodes))
    pool.close()
    pool.join()
    random.shuffle(walks)
    return [[str(token) for token in walk] for walk in walks]
def walker(start_node):
    """One random walk from ``start_node`` over the motif/group graph.

    A group node (id >= num_motifs) steps to one of its motifs. A motif
    node steps to a similar motif when random() > threshold, otherwise to
    one of its groups. A dead end ends the walk early. Only the group
    nodes visited are returned, as a numpy array.
    """
    walk = [start_node]
    while len(walk) < args.walk_length:
        cur = walk[-1]
        is_group = cur >= num_motifs
        # Branch order (and the position of the random() call) is kept
        # exactly as in the original to preserve the RNG stream.
        if is_group and len(neighbors_group_motif[cur]) > 0:
            step = np.random.choice(neighbors_group_motif[cur], p=weights_group_motif[cur])
        elif not is_group and random.random() > args.threshold and len(neighbors_motif_motif[cur]) > 0:
            step = np.random.choice(neighbors_motif_motif[cur], p=weights_motif_motif[cur])
        elif not is_group and len(neighbors_motif_group[cur]) > 0:
            step = np.random.choice(neighbors_motif_group[cur], p=weights_motif_group[cur])
        else:
            break
        walk.append(step)
    visited = np.asarray(walk)
    return visited[visited >= num_motifs]  # keep only group nodes
def compute_embs(corpus):
    """Train Word2Vec on the group corpus and return the L2-normalised
    embedding of every group, ordered by group index (reads module-level
    ``args``, ``num_motifs`` and ``num_groups``)."""
    model = Word2Vec(corpus, size=args.dimension, window=args.window_size,
                     min_count=0, workers=args.num_workers)
    # Group gid's token in the corpus is str(num_motifs + gid).
    vectors = [model[str(num_motifs + gid)] for gid in range(num_groups)]
    return np.array(preprocessing.normalize(vectors, norm='l2'))
def output_embs():
    # Persist the learned group embeddings (module-level ``embs``) to
    # args.group_embs; np.save appends '.npy' when the path lacks a suffix.
    np.save(args.group_embs, embs)
if __name__ == '__main__':
    args = parse_args()
    # generating motifs: map nauty certificates -> contiguous motif indices
    cert2idx, idx2size = generate_motif(args.motif_size)
    num_motifs = len(cert2idx)
    # loading the data; group ids start at num_motifs to share one id space
    G = load_network(args.network_format, args.network)
    groups = load_groups(args.groups)
    num_groups = len(groups)
    group_ams = []
    for group in groups:
        # dense adjacency matrix of the subgraph induced by each group
        group_ams.append(nx.adjacency_matrix(G.subgraph(group)).toarray())
    # generating the transition matrices
    neighbors_motif_motif, weights_motif_motif, neighbors_motif_group, weights_motif_group, neighbors_group_motif, weights_group_motif = generate_transition_matrices()
    # generating group corpus
    corpus = generate_group_corpus()
    # embedding
    embs = compute_embs(corpus)
    output_embs()