def __init__(self, params, ancestors):
    super(FastHGNE, self).__init__()
    self.ancestors = ancestors
    self.params = ct.obj_dic(params)
    self.layer_embeddings = nn.ModuleList(
        [nn.Embedding(self.params.num_layer_nodes[i], self.params.dims[i])
         for i in range(self.params.num_layers)])
    self.total_dim = np.sum(self.params.dims)
    self.init_var()
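# A minimal, self-contained sketch (not part of the project) of the layered
# embedding layout used in __init__ above: one nn.Embedding table per
# hierarchy layer, with per-layer lookups concatenated into a vector of size
# sum(dims). All sizes and ids below are invented toy values.
import torch
import torch.nn as nn

num_layer_nodes = [4, 16]   # hypothetical node counts per layer
dims = [3, 5]               # hypothetical embedding size per layer
tables = nn.ModuleList(
    [nn.Embedding(n, d) for n, d in zip(num_layer_nodes, dims)])

# look up one id per layer (e.g. a node and its layer-0 ancestor)
ids = [torch.tensor([2]), torch.tensor([7])]
vec = torch.cat([tables[i](ids[i]) for i in range(len(tables))], dim=-1)
assert vec.shape[-1] == sum(dims)   # corresponds to total_dim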
def optimize(params, info, pre_res, **kwargs):
    res = params_handler(params, info, pre_res)
    p = ct.obj_dic(params)

    # read top-k
    with io.open(os.path.join(p.res_path, "topk_info.pkl"), "rb") as f:
        topk_params = pickle.load(f)
    top_set = set(v for k, v in topk_params["map"].items())

    tmp_num = [0]

    def deal_subgraph(idx):
        print("[+] Start deal with the %d-th subgraph" % (idx + 1))
        p.log.info("[+] Start deal with the %d-th subgraph" % (idx + 1))
        G = gh.load_unweighted_digraph(
            os.path.join(p.res_path, "%d_edges" % idx), True)
        with io.open(os.path.join(p.res_path, "%d_info.pkl" % idx), "rb") as f:
            sub_params = pickle.load(f)
        #print sub_params
        rmapp = {v: k for k, v in sub_params["map"].items()}
        for k, v in topk_params["map"].items():
            rmapp[v] = k + len(sub_params["map"])
        #print rmapp
        #print topk_params["embeddings"].shape, sub_params["embeddings"].shape
        #print np.concatenate((sub_params["embeddings"], topk_params["embeddings"]))
        params["size_subgraph"] = len(rmapp)
        params["num_edges_subgraph"] = G.number_of_edges()
        tmp_num[0] += params["num_edges_subgraph"]

        model_handler = __import__("model." + p.model, fromlist=["model"])
        model = model_handler.NodeEmbedding(
            params,
            np.concatenate(
                (sub_params["embeddings"], topk_params["embeddings"])),
            np.concatenate((sub_params["weights"], topk_params["weights"])))
        bs = __import__("batch_strategy." + p.batch_strategy,
                        fromlist=["batch_strategy"])
        get_batch = bs.batch_strategy(G, sub_params, topk_params, rmapp, p, info)
        embeddings, weights = model.train(get_batch)

        topk_params["embeddings"] = embeddings[len(rmapp) - p.num_top:]
        sub_params["embeddings"] = embeddings[:len(rmapp) - p.num_top]
        topk_params["weights"] = weights[len(rmapp) - p.num_top:]
        sub_params["weights"] = weights[:len(rmapp) - p.num_top]
        with io.open(os.path.join(p.res_path, "%d_info.pkl" % idx), "wb") as f:
            pickle.dump(sub_params, f)

    for i in xrange(p.num_community):
        deal_subgraph(i)

    print "real: " + str(tmp_num[0]) + ", estimate: " + str(params["num_remain_edges"])

    with io.open(os.path.join(p.res_path, "topk_info.pkl"), "wb") as f:
        pickle.dump(topk_params, f)

    return res
def __init__(self, params, w=None, c=None):
    p = ct.obj_dic(params)
    self.dim = p.dim
    self.lr = p.learn_rate
    self.k = p.num_sampled
    self.optimizer = p.optimizer
    self.epoch_num = p.epoch_num
    self.show_num = p.show_num
    self.size_subgraph = p.size_subgraph
    self.num_nodes = p.num_nodes
    self.num_edges = p.num_edges
    self.batch_size = p.batch_size
    self.logger = p.log

    self.tensor_graph = tf.Graph()
    with self.tensor_graph.as_default():
        tf.set_random_seed(random.randint(0, 1e9))
        self.w_id = tf.placeholder(tf.int32, shape=[None])
        self.c_pos_id = tf.placeholder(tf.int32, shape=[None])
        self.c_neg_id = tf.placeholder(tf.int32, shape=[None, self.k])
        self.neg_weight = tf.placeholder(tf.float32, shape=[None, self.k])
        self.pos_weight = tf.placeholder(tf.float32, shape=[None])

        if w is None:
            self.w = tf.Variable(tf.random_uniform(
                [self.size_subgraph, self.dim],
                -1.0 / self.size_subgraph,
                1.0 / self.size_subgraph), dtype=tf.float32)
        else:
            self.w = tf.Variable(w, dtype=tf.float32)
        if c is None:
            self.c = tf.Variable(tf.truncated_normal(
                [self.size_subgraph, self.dim], -1.0, 1.0), dtype=tf.float32)
        else:
            self.c = tf.Variable(c, dtype=tf.float32)

        self.embed = tf.nn.embedding_lookup(self.w, self.w_id)
        self.c_pos = tf.nn.embedding_lookup(self.c, self.c_pos_id)
        self.c_neg = tf.nn.embedding_lookup(self.c, self.c_neg_id)

        self.pos_dot = tf.reduce_sum(tf.multiply(self.embed, self.c_pos), axis=1)
        embed_3d = tf.reshape(self.embed, [-1, 1, self.dim])
        # dim: batch_size * 1 * k
        self.neg_dot_pre = tf.matmul(embed_3d, self.c_neg, transpose_b=True)
        # dim: batch_size * k
        self.neg_dot = tf.squeeze(self.neg_dot_pre)

        #self.loss = -tf.reduce_sum(tf.log_sigmoid(self.pos_dot)) - \
        #            tf.reduce_sum(tf.log_sigmoid(-self.neg_dot))
        self.loss = -tf.reduce_mean(tf.multiply(tf.log_sigmoid(self.pos_dot), self.pos_weight)) / self.num_edges - \
                    tf.reduce_mean(tf.multiply(tf.log_sigmoid(-self.neg_dot), self.neg_weight)) / self.num_nodes / self.num_nodes

        self.train_step = getattr(tf.train, self.optimizer)(
            self.lr).minimize(self.loss)
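# A standalone toy (assumed TF 1.x, like the class above) showing how
# placeholders shaped like the ones defined in __init__ could be fed for one
# optimizer step. The train() method itself is not shown in this file, so
# this is only a sketch of the feed/run pattern; all sizes, batch values and
# the plain SGD optimizer are invented, and the loss omits the 1/num_edges
# and 1/num_nodes^2 scaling used by the class.
import numpy as np
import tensorflow as tf

batch, k, dim, n = 4, 5, 8, 20          # toy sizes
g = tf.Graph()
with g.as_default():
    w_id = tf.placeholder(tf.int32, [None])
    c_pos_id = tf.placeholder(tf.int32, [None])
    c_neg_id = tf.placeholder(tf.int32, [None, k])
    pos_weight = tf.placeholder(tf.float32, [None])
    neg_weight = tf.placeholder(tf.float32, [None, k])
    w = tf.Variable(tf.random_uniform([n, dim], -0.05, 0.05))
    c = tf.Variable(tf.random_uniform([n, dim], -0.05, 0.05))
    embed = tf.nn.embedding_lookup(w, w_id)
    c_pos = tf.nn.embedding_lookup(c, c_pos_id)
    c_neg = tf.nn.embedding_lookup(c, c_neg_id)
    pos_dot = tf.reduce_sum(embed * c_pos, axis=1)
    neg_dot = tf.squeeze(tf.matmul(tf.reshape(embed, [-1, 1, dim]),
                                   c_neg, transpose_b=True))
    loss = (-tf.reduce_mean(tf.log_sigmoid(pos_dot) * pos_weight)
            - tf.reduce_mean(tf.log_sigmoid(-neg_dot) * neg_weight))
    step = tf.train.GradientDescentOptimizer(0.1).minimize(loss)

with tf.Session(graph=g) as sess:
    sess.run(tf.global_variables_initializer())
    feed = {w_id: np.random.randint(n, size=batch),
            c_pos_id: np.random.randint(n, size=batch),
            c_neg_id: np.random.randint(n, size=(batch, k)),
            pos_weight: np.ones(batch, dtype=np.float32),
            neg_weight: np.ones((batch, k), dtype=np.float32)}
    _, l = sess.run([step, loss], feed_dict=feed)
    print("toy loss: %.4f" % l)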
def metric(params, info, pre_res, **kwargs):
    res = params_handler(params, info, pre_res)
    p = ct.obj_dic(params)

    # load embeddings
    with io.open(p.embedding_path, "rb") as f:
        X = pickle.load(f)["embeddings"]

    if params["is_multilabel"]:
        metric_res = multilabel_classification(X, params)
    else:
        metric_res = classification(X, params)

    for k, v in metric_res.items():
        res[k] = v
    return res
def __init__(self, params, w=None, c=None):
    p = ct.obj_dic(params)
    self.dim = p.dim
    self.lr = p.learn_rate
    self.k = p.num_sampled
    self.optimizer = p.optimizer
    self.epoch_num = p.epoch_num
    self.show_num = p.show_num
    self.size_subgraph = p.size_subgraph
    self.num_nodes = p.num_nodes
    if "num_remain_edges" in params:
        self.num_edges = p.num_remain_edges
    else:
        self.num_edges = p.num_edges
    self.num_edges_subgraph = p.num_edges_subgraph
    self.batch_size = p.batch_size
    self.logger = p.log

    self.tensor_graph = tf.Graph()
    with self.tensor_graph.as_default():
        tf.set_random_seed(random.randint(0, 1e9))
        self.w_pos_id = tf.placeholder(tf.int32, shape=[None])
        self.w_neg_id = tf.placeholder(tf.int32, shape=[None])
        self.c_pos_id = tf.placeholder(tf.int32, shape=[None])
        self.c_neg_id = tf.placeholder(tf.int32, shape=[None])

        if w is None:
            self.w = tf.Variable(tf.random_uniform(
                [self.size_subgraph, self.dim],
                -1.0 / self.size_subgraph,
                1.0 / self.size_subgraph), dtype=tf.float32)
        else:
            self.w = tf.Variable(w, dtype=tf.float32)
        if c is None:
            self.c = tf.Variable(tf.truncated_normal(
                [self.size_subgraph, self.dim], -1.0, 1.0), dtype=tf.float32)
        else:
            self.c = tf.Variable(c, dtype=tf.float32)

        self.embed_pos = tf.nn.embedding_lookup(self.w, self.w_pos_id)
        self.embed_neg = tf.nn.embedding_lookup(self.w, self.w_neg_id)
        self.c_pos = tf.nn.embedding_lookup(self.c, self.c_pos_id)
        self.c_neg = tf.nn.embedding_lookup(self.c, self.c_neg_id)

        self.pos_dot = tf.reduce_sum(tf.multiply(self.embed_pos, self.c_pos), axis=1)
        self.neg_dot = tf.reduce_sum(tf.multiply(self.embed_neg, self.c_neg), axis=1)
        self.loss = -tf.reduce_mean(tf.log_sigmoid(self.pos_dot)) - \
                    float(self.size_subgraph ** 2) / float(self.num_edges_subgraph) * \
                    tf.reduce_mean(tf.log_sigmoid(-self.neg_dot))
        self.train_step = getattr(tf.train, self.optimizer)(self.lr).minimize(self.loss)
def merge_embedding(params, info, pre_res, **kwargs):
    res = params_handler(params, info, pre_res)
    p = ct.obj_dic(params)

    # load embeddings
    # TODO use redis
    embeddings = np.empty((p.num_nodes, p.dim), dtype=np.float32)
    weights = np.empty((p.num_nodes, p.dim), dtype=np.float32)

    def read_embeddings(path):
        with io.open(os.path.join(info["res_home"], path), "rb") as f:
            sub_params = pickle.load(f)
        for k, v in sub_params["map"].items():
            embeddings[v, :] = sub_params["embeddings"][k, :]
            weights[v, :] = sub_params["weights"][k, :]

    read_embeddings("topk_info.pkl")
    for i in xrange(p.num_community):
        read_embeddings("%d_info.pkl" % i)

    with io.open(p.save_path, "wb") as f:
        pickle.dump({"embeddings": embeddings, "weights": weights}, f)
    #print embeddings
    #print weights
    return res
def graph_split(params, info, pre_res, **kwargs):
    res = params_handler(params, info, pre_res)
    for k, v in info["network_folder"].items():
        if k == "name":
            continue
        shutil.copy(os.path.join(params["folder_path"], v), res["train_path"])
        shutil.copy(os.path.join(params["folder_path"], v), res["test_path"])
    #folder_info = {k: os.path.join(info["home_path"], v) for k, v in info["network_folder"].items()}
    folder_info = ct.obj_dic(info["network_folder"])

    node_list = []
    with open(os.path.join(params["folder_path"], folder_info.entity), "r", encoding="gb2312") as f:
        for line in f:
            items = line_init(line, 2)
            if items is None:
                continue
            node_list.append((int(items[1]), items[0]))
    n = len(node_list)
    n_test = int(float(n) * params["test_ratio"])
    n_train = n - n_test
    random.shuffle(node_list)
    node_dic = {it[0]: [idx, it[1]] for idx, it in enumerate(node_list)}

    # write entity file
    f_test = open(os.path.join(res["test_path"], folder_info.entity), "w", encoding="gb2312")
    f_train = open(os.path.join(res["train_path"], folder_info.entity), "w", encoding="gb2312")
    for k, v in node_dic.items():
        f_test.write("%s %d\n" % (v[1], v[0]))
        if v[0] < n_train:
            f_train.write("%s %d\n" % (v[1], v[0]))
    f_test.close()
    f_train.close()

    # write edge file
    f_test = open(os.path.join(res["test_path"], folder_info.edge), "w")
    f_train = open(os.path.join(res["train_path"], folder_info.edge), "w")
    with open(os.path.join(params["folder_path"], folder_info.edge), "r") as f:
        for line in f:
            items = line_init(line, 2)
            if items is None:
                continue
            it = [node_dic[int(i)][0] for i in items]
            f_test.write("%d %d\n" % (it[0], it[1]))
            if it[0] < n_train and it[1] < n_train:
                f_train.write("%d %d\n" % (it[0], it[1]))
    for i in range(n_train):
        f_test.write("%d %d\n" % (i, i))
        f_train.write("%d %d\n" % (i, i))
    for i in range(n_train, n):
        f_test.write("%d %d\n" % (i, i))
    f_test.close()
    f_train.close()

    # write mix_edge file
    f_test = open(os.path.join(res["test_path"], folder_info.mix_edge), "w")
    f_train = open(os.path.join(res["train_path"], folder_info.mix_edge), "w")
    with open(os.path.join(params["folder_path"], folder_info.mix_edge), "r") as f:
        for line in f:
            items = line_init(line, 2)
            if items is None:
                continue
            items[0] = node_dic[int(items[0])][0]
            f_test.write("%d %s\n" % (items[0], items[1]))
            if items[0] < n_train:
                f_train.write("%d %s\n" % (items[0], items[1]))
    f_test.close()
    f_train.close()

    # write label
    f_test = open(os.path.join(res["test_path"], folder_info.label), "w", encoding="gb2312")
    f_train = open(os.path.join(res["train_path"], folder_info.label), "w", encoding="gb2312")
    label_list = []
    with open(os.path.join(params["folder_path"], folder_info.label), "r", encoding="gb2312") as f:
        for line in f:
            items = line_init(line, 1)
            if items is None:
                continue
            label_list.append(items[0])
    for i in range(n_train):
        f_test.write("%s\n" % label_list[node_list[i][0]])
        f_train.write("%s\n" % label_list[node_list[i][0]])
    for i in range(n_train, n):
        f_test.write("%s\n" % label_list[node_list[i][0]])
    f_test.close()
    f_train.close()

    # write entity_features
    with open(os.path.join(params["folder_path"], folder_info.entity_features), "rb") as f:
        features = pickle.load(f)
    idx_list, _ = zip(*node_list)
    with open(os.path.join(res["test_path"], folder_info.entity_features), "wb") as f:
        pickle.dump(features[idx_list, :], f)
    with open(os.path.join(res["train_path"], folder_info.entity_features), "wb") as f:
        pickle.dump(features[idx_list[:n_train], :], f)

    return res
def init(params, info, **kwargs):
    res = params_handler(params, info)
    p = ct.obj_dic(params)

    '''
    # top-k nodes
    q = pq()
    for idx, u in enumerate(G):
        if idx < p.num_top:
            q.put_nowait((G.node[u]["in_degree"], u))
        else:
            tmp = q.get_nowait()
            if tmp[0] <= G.node[u]["in_degree"]:
                q.put_nowait((G.node[u]["in_degree"], u))
            else:
                q.put_nowait(tmp)
    top_lst = []
    top_set = set()
    while not q.empty():
        top_lst.append(q.get_nowait()[1])
        top_set.add(top_lst[-1])
    print top_lst
    '''

    # count in/out degrees in redis sorted sets instead of loading the graph
    top_out_lst = []
    top_in_lst = []
    r = redis.Redis(host='localhost', port=6379)
    r.config_set('maxmemory', p.redis_maxmemory)
    with open(p.network_path, "r") as f:
        r.zremrangebyrank("out1", 0, -1)
        r.zremrangebyrank("in1", 0, -1)
        r.set("degree", 0)
        for line in f:
            if len(line) == 0:
                continue
            items = line.split()
            if len(items) != 2:
                continue
            if not p.is_directed:
                # undirected graph: also count the reverse direction
                r.zincrby("in1", 1, items[0])
                r.zincrby("out1", 1, items[1])
                r.incr("degree")
            r.zincrby("out1", 1, items[0])
            r.zincrby("in1", 1, items[1])
            r.incr("degree")

    r.zrevrange("out1", 0, -1, withscores=False)
    r.zrevrange("in1", 0, -1, withscores=False)
    #used_memory = r.info()['used_memory']
    #print "used_memory(Byte): %s" % used_memory
    top_out_lst = r.zrange("out1", 0, p.num_top - 1, desc=True)
    top_in_lst = r.zrange("in1", 0, p.num_top - 1, desc=True)
    top_in_lst = map(int, top_in_lst)    # redis returns strings; convert to int
    top_out_lst = map(int, top_out_lst)
    top_set = set()
    for i in top_in_lst:
        top_set.add(i)

    remain_size = r.zcard("out1") - p.num_top
    num_community = remain_size // p.community_size
    if remain_size % p.community_size != 0:
        num_community += 1

    topk_params = {"embeddings": pi.initialize_embeddings(p.num_top, p.dim),
                   "weights": pi.initialize_weights(p.num_top, p.dim),
                   #"in_degree": [G.node[i]["in_degree"] for i in top_in_lst],
                   #"out_degree": [G.node[i]["out_degree"] for i in top_out_lst],
                   "in_degree": [r.zscore("in1", i) for i in top_in_lst],
                   "out_degree": [r.zscore("out1", i) for i in top_out_lst],
                   "map": {i: top_in_lst[i] for i in xrange(len(top_in_lst))}}
    print topk_params
    with io.open(os.path.join(p.res_path, "topk_info.pkl"), "wb") as f:
        pickle.dump(topk_params, f)

    def deal_subgraph(idx, st, ed):
        sub_params = {"embeddings": pi.initialize_embeddings(ed - st, p.dim),
                      "weights": pi.initialize_weights(ed - st, p.dim),
                      #"map": {i: node_lst[st + i] for i in xrange(ed - st)},
                      "map": {i: int(r.zrange("out1", st + i, st + i)[0]) for i in xrange(ed - st)}}
        print sub_params
        with io.open(os.path.join(p.res_path, "%d_info.pkl" % idx), "wb") as f:
            pickle.dump(sub_params, f)

    for i in xrange(num_community):
        deal_subgraph(i, i * p.community_size,
                      min((i + 1) * p.community_size, remain_size))

    # calculate prob
    def cal_q1():
        K = float(num_community)
        na = float(p.community_size)
        n = p.num_nodes - p.num_top
        nr = float(n % p.community_size)
        n = float(n)
        return (K - 1) * na / n * (na - 1) / (n - 1) + nr * (nr - 1) / n / (n - 1)

    info["q"] = [cal_q1(), 1.0, float(num_community)]
    tmp = p.num_nodes - p.num_top
    info["Z"] = [0.0, info["q"][0] * tmp * tmp + \
                 tmp * p.num_top + info["q"][2] * p.num_top * p.num_top]

    with open(p.network_path, "r") as f:
        for line in f:
            if len(line) == 0:
                continue
            items = line.split()
            if len(items) != 2:
                continue
            if int(items[0]) in top_set and int(items[1]) in top_set:
                info["Z"][0] += info["q"][2]
            elif int(items[0]) in top_set or int(items[1]) in top_set:
                info["Z"][0] += 1
            else:
                info["Z"][0] += info["q"][0]
            if not p.is_directed:
                if int(items[1]) in top_set and int(items[0]) in top_set:
                    info["Z"][0] += info["q"][2]
                elif int(items[1]) in top_set or int(items[0]) in top_set:
                    info["Z"][0] += 1
                else:
                    info["Z"][0] += info["q"][0]

    '''
    for e in G.edges():
        if e[0] in top_set and e[1] in top_set:
            info["Z"][0] += info["q"][2]
        elif e[0] in top_set or e[1] in top_set:
            info["Z"][0] += 1
        else:
            info["Z"][0] += info["q"][0]
    '''

    info["total_degree"] = int(r.get("degree"))
    info["num_community"] = num_community
    res["data_path"] = p.res_path
    print info
    return res
def split_graph(params, info, pre_res, **kwargs):
    res = params_handler(params, info, pre_res)
    p = ct.obj_dic(params)

    # read top-k
    with io.open(os.path.join(p.data_path, "topk_info.pkl"), "rb") as f:
        topk_params = pickle.load(f)
    top_set = set(v for k, v in topk_params["map"].items())

    # get node lst
    G = gh.load_unweighted_digraph(p.network_path, p.is_directed)
    node_lst = []
    for u in G:
        if u not in top_set:
            node_lst.append(u)
    random.shuffle(node_lst)
    #print node_lst

    #group = {u : idx / p.community_size for idx, u in enumerate(node_lst)}
    tmp = p.community_size_small * p.num_community_small
    group = {}
    for idx, u in enumerate(node_lst):
        if idx < tmp:
            group[u] = idx // p.community_size_small
        else:
            group[u] = p.num_community_small + (idx - tmp) // p.community_size_large
    #print group

    tmp_files = [FileOutstream(os.path.join(p.tmp_path, "%d" % i))
                 for i in xrange(p.num_community)]
    for i in xrange(p.num_community):
        with io.open(os.path.join(p.data_path, "%d_info.pkl" % i), "rb") as f:
            sub_params = pickle.load(f)
        for j in sub_params["map"]:
            s = json.dumps((sub_params["embeddings"][j].tolist(),
                            sub_params["weights"][j].tolist(),
                            sub_params["map"][j],
                            sub_params["in_degree"][j],
                            sub_params["out_degree"][j]))
            #print s
            u = sub_params["map"][j]
            tmp_files[group[u]].writeline(s)
    del tmp_files
    gc.collect()

    num_ignore = 0
    edge_files = [FileOutstream(os.path.join(p.res_path, "%d_edges" % i))
                  for i in xrange(p.num_community)]
    topk_edge_file = FileOutstream(os.path.join(p.res_path, "topk_edges"))
    for e in G.edges():
        if e[0] in top_set and e[1] in top_set:
            topk_edge_file.write("%d\t%d\n" % e)
        elif e[0] in top_set:
            edge_files[group[e[1]]].write("%d\t%d\n" % e)
        elif e[1] in top_set or group[e[0]] == group[e[1]]:
            edge_files[group[e[0]]].write("%d\t%d\n" % e)
        else:
            num_ignore += 1
    print "Number of ignored edges: " + str(num_ignore)
    print "Number of edges: " + str(len(G.edges()))
    del edge_files
    del topk_edge_file
    gc.collect()

    for i in xrange(p.num_community):
        embeddings = []
        weights = []
        mapp = {}
        inds = []
        outds = []
        with io.open(os.path.join(p.tmp_path, "%d" % i), "rb") as f:
            for idx, line in enumerate(f):
                line = line.strip()
                if len(line) == 0:
                    continue
                embed, weight, u, ind, outd = json.loads(line)
                embeddings.append(embed)
                weights.append(weight)
                mapp[idx] = u
                outds.append(outd)
                inds.append(ind)
        sub_params = {"embeddings": np.array(embeddings),
                      "weights": np.array(weights),
                      "map": mapp,
                      "in_degree": inds,
                      "out_degree": outds}
        #print sub_params
        with io.open(os.path.join(p.res_path, "%d_info.pkl" % i), "wb") as f:
            pickle.dump(sub_params, f)

    #res["data_path"] = p.res_path
    res["num_ignore"] = num_ignore
    return res
def init(params, info, **kwargs):
    res = params_handler(params, info)
    p = ct.obj_dic(params)

    G = gh.load_unweighted_digraph(p.network_path, p.is_directed)
    info["num_edges"] = len(G.edges())

    # top-k nodes
    q = pq()
    for idx, u in enumerate(G):
        if idx < p.num_top:
            q.put_nowait((G.node[u]["in_degree"], u))
        else:
            tmp = q.get_nowait()
            if tmp[0] <= G.node[u]["in_degree"]:
                q.put_nowait((G.node[u]["in_degree"], u))
            else:
                q.put_nowait(tmp)
    top_lst = []
    top_set = set()
    while not q.empty():
        top_lst.append(q.get_nowait()[1])
        top_set.add(top_lst[-1])
    print "top_lst: " + str(top_lst)

    node_lst = []
    for u in G:
        if u not in top_set:
            node_lst.append(u)

    remain_size = len(node_lst)
    num_community = (remain_size + p.community_bound - 1) // p.community_bound
    num_community_large = remain_size % num_community
    num_community_small = num_community - num_community_large
    community_size_small = remain_size // num_community
    community_size_large = community_size_small + 1
    #print remain_size, num_community, num_community_small, num_community_large, community_size_small, community_size_large

    topk_params = {
        "embeddings": pi.initialize_embeddings(p.num_top, p.dim),
        "weights": pi.initialize_weights(p.num_top, p.dim),
        "in_degree": [G.node[i]["in_degree"] for i in top_lst],
        "out_degree": [G.node[i]["out_degree"] for i in top_lst],
        "map": {i: top_lst[i] for i in xrange(len(top_lst))}
    }
    #print topk_params
    with io.open(os.path.join(p.res_path, "topk_info.pkl"), "wb") as f:
        pickle.dump(topk_params, f)

    def deal_subgraph(idx, st, ed):
        sub_params = {
            "embeddings": pi.initialize_embeddings(ed - st, p.dim),
            "weights": pi.initialize_weights(ed - st, p.dim),
            "in_degree": [G.node[node_lst[st + i]]["in_degree"] for i in xrange(ed - st)],
            "out_degree": [G.node[node_lst[st + i]]["out_degree"] for i in xrange(ed - st)],
            "map": {i: node_lst[st + i] for i in xrange(ed - st)}
        }
        #print sub_params
        with io.open(os.path.join(p.res_path, "%d_info.pkl" % idx), "wb") as f:
            pickle.dump(sub_params, f)

    for i in xrange(num_community_small):
        deal_subgraph(i, i * community_size_small, (i + 1) * community_size_small)
    tmp = num_community_small * community_size_small
    for i in xrange(num_community_small, num_community):
        deal_subgraph(
            i, tmp + (i - num_community_small) * community_size_large,
            tmp + (i - num_community_small + 1) * community_size_large)

    info["num_community"] = num_community
    info["num_community_small"] = num_community_small
    info["num_community_large"] = num_community_large
    info["community_size_small"] = community_size_small
    info["community_size_large"] = community_size_large
    #print info

    # calculate prob
    def cal_q1():
        K = float(num_community)
        nl = float(community_size_small)
        nr = nl + 1
        n = float(p.num_nodes - p.num_top)
        nh = float(community_size_large)
        Kl = float(num_community_small)
        Kh = float(num_community_large)
        return Kl * nl / n * (nl - 1) / (n - 1) + Kh * nh / n * (nh - 1) / (n - 1)

    info["q"] = [cal_q1(), 1.0, float(num_community) if p.q2 is None else p.q2]
    tmp = p.num_nodes - p.num_top
    info["Z"] = [0.0, info["q"][0] * tmp * tmp + \
                 2.0 * tmp * p.num_top + info["q"][2] * p.num_top * p.num_top]
    info["num_topk_edges"] = 0
    for e in G.edges():
        if e[0] in top_set and e[1] in top_set:
            info["Z"][0] += info["q"][2]
            info["num_topk_edges"] += 1
        elif e[0] in top_set or e[1] in top_set:
            info["Z"][0] += 1
        else:
            info["Z"][0] += info["q"][0]

    info["total_degree"] = G.graph["degree"]
    info["num_community"] = num_community
    res["data_path"] = p.res_path
    print "Info: ", info["q"], info["Z"]
    #print "End!!"
    return res
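# A toy numeric check (invented sizes, not project data) of the quantities
# computed at the end of init() above. Under the formulas used there, q[0]
# appears to be the probability that two distinct random non-top nodes fall
# in the same community, q[2] weights top/top pairs, and Z[1] is the
# corresponding normalizer over all node pairs; this sketch only re-evaluates
# those expressions with small numbers.
num_nodes, num_top = 103, 3
remain = num_nodes - num_top                                         # 100 non-top nodes
community_bound = 30
num_community = (remain + community_bound - 1) // community_bound    # 4
num_community_large = remain % num_community                         # 0
num_community_small = num_community - num_community_large
size_small = remain // num_community                                  # 25
size_large = size_small + 1

n = float(remain)
q1 = (num_community_small * size_small / n * (size_small - 1) / (n - 1)
      + num_community_large * size_large / n * (size_large - 1) / (n - 1))
q = [q1, 1.0, float(num_community)]
Z1 = q[0] * remain * remain + 2.0 * remain * num_top + q[2] * num_top * num_top
print(q, Z1)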
def split_graph(params, info, pre_res, **kwargs):
    r = redis.Redis(host='localhost', port=6379)
    res = params_handler(params, info, pre_res)
    p = ct.obj_dic(params)

    # read top-k
    with io.open(os.path.join(p.data_path, "topk_info.pkl"), "rb") as f:
        topk_params = pickle.load(f)
    top_set = set(v for k, v in topk_params["map"].items())

    # get node lst
    shufflenode_len = r.llen("shuffle_node")
    for i in range(0, shufflenode_len):
        r.lpop("shuffle_node")
    # the graph is treated as undirected, so reuse the "in1" sorted set as a temporary pool
    for i in range(0, r.zcard("in1") - p.num_top):
        range_index = random.randint(0, int(r.zcard("in1") - p.num_top - 1))
        r.rpush("shuffle_node", int(r.zrange("in1", range_index, range_index)[0]))
        r.zremrangebyrank("in1", range_index, range_index)
    print map(int, r.lrange("shuffle_node", 0, -1))
    #used_memory = r.info()['used_memory']
    #print "used_memory(Byte): %s" % used_memory

    #group = {u : idx / p.community_size for idx, u in enumerate(node_lst)}
    group = {u: idx / p.community_size
             for idx, u in enumerate(map(int, r.lrange("shuffle_node", 0, -1)))}
    print group

    tmp_files = [FileOutstream(os.path.join(p.tmp_path, "%d" % i))
                 for i in xrange(p.num_community)]
    for i in xrange(p.num_community):
        with io.open(os.path.join(p.data_path, "%d_info.pkl" % i), "rb") as f:
            sub_params = pickle.load(f)
        for j in sub_params["map"]:
            s = json.dumps((sub_params["embeddings"][j].tolist(),
                            sub_params["weights"][j].tolist(),
                            sub_params["map"][j]))
            print s
            u = sub_params["map"][j]
            tmp_files[group[u]].writeline(s)
    del tmp_files

    num_ignore = 0
    edge_files = [FileOutstream(os.path.join(p.res_path, "%d_edges" % i))
                  for i in xrange(p.num_community)]
    '''
    for e in G.edges():
        if e[0] in top_set and e[1] in top_set:
            for idx, f in enumerate(edge_files):
                edge_files[idx].write("%d\t%d\n" % e)
        elif e[0] in top_set:
            edge_files[group[e[1]]].write("%d\t%d\n" % e)
        elif e[1] in top_set or group[e[0]] == group[e[1]]:
            edge_files[group[e[0]]].write("%d\t%d\n" % e)
        else:
            num_ignore += 1
    '''
    with open(p.network_path, "r") as f:
        for line in f:
            if len(line) == 0:
                continue
            items = line.split()
            if len(items) != 2:
                continue
            if int(items[0]) in top_set and int(items[1]) in top_set:
                for ef in edge_files:
                    ef.write("%d\t%d\n" % (int(items[0]), int(items[1])))
            elif int(items[0]) in top_set:
                edge_files[group[int(items[1])]].write("%d\t%d\n" % (int(items[0]), int(items[1])))
            elif int(items[1]) in top_set or group[int(items[0])] == group[int(items[1])]:
                edge_files[group[int(items[0])]].write("%d\t%d\n" % (int(items[0]), int(items[1])))
            else:
                num_ignore += 1
            if not p.is_directed:
                if int(items[1]) in top_set and int(items[0]) in top_set:
                    for ef in edge_files:
                        ef.write("%d\t%d\n" % (int(items[1]), int(items[0])))
                elif int(items[1]) in top_set:
                    edge_files[group[int(items[0])]].write("%d\t%d\n" % (int(items[1]), int(items[0])))
                elif int(items[0]) in top_set or group[int(items[1])] == group[int(items[0])]:
                    edge_files[group[int(items[1])]].write("%d\t%d\n" % (int(items[1]), int(items[0])))
                else:
                    num_ignore += 1
    print num_ignore
    del edge_files

    for i in xrange(p.num_community):
        embeddings = []
        weights = []
        mapp = {}
        with io.open(os.path.join(p.tmp_path, "%d" % i), "rb") as f:
            for idx, line in enumerate(f):
                line = line.strip()
                if len(line) == 0:
                    continue
                embed, weight, u = json.loads(line)
                embeddings.append(embed)
                weights.append(weight)
                mapp[idx] = u
        sub_params = {"embeddings": np.array(embeddings),
                      "weights": np.array(weights),
                      "map": mapp}
        print sub_params
        with io.open(os.path.join(p.res_path, "%d_info.pkl" % i), "wb") as f:
            pickle.dump(sub_params, f)

    #res["data_path"] = p.res_path
    return res