Example #1
    def __init__(self, params, ancestors):
        super(FastHGNE, self).__init__()
        self.ancestors = ancestors
        self.params = ct.obj_dic(params)
        # one embedding table per layer of the hierarchy
        self.layer_embeddings = nn.ModuleList([
            nn.Embedding(self.params.num_layer_nodes[i], self.params.dims[i])
            for i in range(self.params.num_layers)
        ])
        self.total_dim = np.sum(self.params.dims)
        self.init_var()
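
All of these examples funnel their params dict through ct.obj_dic before reading fields as attributes. common_tools itself is not shown here, so the following is only a minimal sketch, assuming obj_dic does nothing more than expose dict keys as attributes:

class _ObjDic(object):
    def __init__(self, d):
        self.__dict__.update(d)

def obj_dic(d):
    # assumed behavior: obj_dic({"dim": 64}).dim == 64
    return _ObjDic(d)

p = obj_dic({"num_layers": 2, "dims": [64, 32], "num_layer_nodes": [1000, 100]})
print(p.num_layers)  # 2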
Example #2
def optimize(params, info, pre_res, **kwargs):
    res = params_handler(params, info, pre_res)
    p = ct.obj_dic(params)

    # read top-k
    with io.open(os.path.join(p.res_path, "topk_info.pkl"), "rb") as f:
        topk_params = pickle.load(f)
    top_set = set(v for k, v in topk_params["map"].items())

    tmp_num = [0]  # list so the nested function below can mutate the count

    def deal_subgraph(idx):
        print("[+] Start deal with the %d-th subgraph" % (idx + 1))
        p.log.info("[+] Start deal with the %d-th subgraph" % (idx + 1))
        G = gh.load_unweighted_digraph(
            os.path.join(p.res_path, "%d_edges" % idx), True)
        with io.open(os.path.join(p.res_path, "%d_info.pkl" % idx), "rb") as f:
            sub_params = pickle.load(f)
        #print sub_params
        # reverse map: original node id -> local index, with the top-k
        # nodes appended after the subgraph's own nodes
        rmapp = {v: k for k, v in sub_params["map"].items()}
        for k, v in topk_params["map"].items():
            rmapp[v] = k + len(sub_params["map"])

        #print rmapp
        #print topk_params["embeddings"].shape, sub_params["embeddings"].shape
        #print np.concatenate((sub_params["embeddings"], topk_params["embeddings"]))
        params["size_subgraph"] = len(rmapp)
        params["num_edges_subgraph"] = G.number_of_edges()
        tmp_num[0] += params["num_edges_subgraph"]

        model_handler = __import__("model." + p.model, fromlist=["model"])
        model = model_handler.NodeEmbedding(
            params,
            np.concatenate(
                (sub_params["embeddings"], topk_params["embeddings"])),
            np.concatenate((sub_params["weights"], topk_params["weights"])))

        bs = __import__("batch_strategy." + p.batch_strategy,
                        fromlist=["batch_strategy"])
        get_batch = bs.batch_strategy(G, sub_params, topk_params, rmapp, p,
                                      info)

        embeddings, weights = model.train(get_batch)
        # the last num_top rows of the trained matrices belong to the
        # shared top-k nodes, the rest to this subgraph
        topk_params["embeddings"] = embeddings[len(rmapp) - p.num_top:]
        sub_params["embeddings"] = embeddings[:len(rmapp) - p.num_top]
        topk_params["weights"] = weights[len(rmapp) - p.num_top:]
        sub_params["weights"] = weights[:len(rmapp) - p.num_top]
        with io.open(os.path.join(p.res_path, "%d_info.pkl" % idx), "wb") as f:
            pickle.dump(sub_params, f)

    for i in xrange(p.num_community):
        deal_subgraph(i)
    print "real: " + str(tmp_num[0]) + ", estimate: " + str(
        params["num_remain_edges"])
    with io.open(os.path.join(p.res_path, "topk_info.pkl"), "wb") as f:
        pickle.dump(topk_params, f)
    return res
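
Example #2 resolves its model and batch strategy by name at run time; __import__(name, fromlist=[...]) is the old idiom for getting the submodule back instead of the top-level package. A sketch of the equivalent with importlib (the commented plugin names are placeholders, not confirmed module names from this repo):

import importlib

def load_plugin(package, name):
    # same effect as __import__(package + "." + name, fromlist=[package])
    return importlib.import_module("%s.%s" % (package, name))

# model_handler = load_plugin("model", p.model)
# bs = load_plugin("batch_strategy", p.batch_strategy)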
Example #3
    def __init__(self, params, w=None, c=None):
        p = ct.obj_dic(params)
        self.dim = p.dim
        self.lr = p.learn_rate
        self.k = p.num_sampled
        self.optimizer = p.optimizer
        self.epoch_num = p.epoch_num
        self.show_num = p.show_num
        self.size_subgraph = p.size_subgraph
        self.num_nodes = p.num_nodes
        self.num_edges = p.num_edges
        self.batch_size = p.batch_size
        self.logger = p.log

        self.tensor_graph = tf.Graph()
        with self.tensor_graph.as_default():
            tf.set_random_seed(random.randint(0, 1e9))
            self.w_id = tf.placeholder(tf.int32, shape=[None])
            self.c_pos_id = tf.placeholder(tf.int32, shape=[None])
            self.c_neg_id = tf.placeholder(tf.int32, shape=[None, self.k])
            self.neg_weight = tf.placeholder(tf.float32, shape=[None, self.k])
            self.pos_weight = tf.placeholder(tf.float32, shape=[None])

            if w is None:
                self.w = tf.Variable(tf.random_uniform(
                    [self.size_subgraph, self.dim], -1.0 / self.size_subgraph,
                    1.0 / self.size_subgraph),
                                     dtype=tf.float32)
            else:
                self.w = tf.Variable(w, dtype=tf.float32)
            if c is None:
                # the original referenced undefined names (size_subgragh,
                # embedding_size); by symmetry with self.w the intended
                # shape is presumably [size_subgraph, dim]
                self.c = tf.Variable(tf.truncated_normal(
                    [self.size_subgraph, self.dim], -1.0, 1.0),
                                     dtype=tf.float32)
            else:
                self.c = tf.Variable(c, dtype=tf.float32)

            self.embed = tf.nn.embedding_lookup(self.w, self.w_id)
            self.c_pos = tf.nn.embedding_lookup(self.c, self.c_pos_id)
            self.c_neg = tf.nn.embedding_lookup(self.c, self.c_neg_id)

            self.pos_dot = tf.reduce_sum(tf.multiply(self.embed, self.c_pos),
                                         axis=1)
            embed_3d = tf.reshape(self.embed, [-1, 1, self.dim])
            # dim: batch_size * 1 * k
            self.neg_dot_pre = tf.matmul(embed_3d,
                                         self.c_neg,
                                         transpose_b=True)
            # dim: batch_size * k
            self.neg_dot = tf.squeeze(self.neg_dot_pre)
            #self.loss = -tf.reduce_sum(tf.log_sigmoid(self.pos_dot)) - \
            #        tf.reduce_sum(tf.log_sigmoid(-self.neg_dot))
            self.loss = -tf.reduce_mean(tf.multiply(tf.log_sigmoid(self.pos_dot), self.pos_weight)) / self.num_edges - \
                    tf.reduce_mean(tf.multiply(tf.log_sigmoid(-self.neg_dot), self.neg_weight)) / self.num_nodes / self.num_nodes
            self.train_step = getattr(tf.train, self.optimizer)(
                self.lr).minimize(self.loss)
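
For readers untangling the weighted loss at the end of Example #3, here is the same objective restated in plain NumPy under the shapes implied above (an illustration, not repo code):

import numpy as np

def sg_loss(embed, c_pos, c_neg, pos_w, neg_w, num_edges, num_nodes):
    # embed, c_pos: (batch, dim); c_neg: (batch, k, dim);
    # pos_w: (batch,); neg_w: (batch, k)
    def log_sig(x):
        return -np.logaddexp(0.0, -x)  # numerically stable log sigmoid
    pos = np.mean(log_sig(np.sum(embed * c_pos, axis=1)) * pos_w)
    neg_dot = np.einsum("bd,bkd->bk", embed, c_neg)  # dot with each negative
    neg = np.mean(log_sig(-neg_dot) * neg_w)
    return -pos / num_edges - neg / (num_nodes * num_nodes)

rng = np.random.RandomState(0)
b, d, k = 3, 5, 2
print(sg_loss(rng.randn(b, d), rng.randn(b, d), rng.randn(b, k, d),
              np.ones(b), np.ones((b, k)), num_edges=10, num_nodes=20))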
Example #4
def metric(params, info, pre_res, **kwargs):
    res = params_handler(params, info, pre_res)
    p = ct.obj_dic(params)
    # load embeddings
    with io.open(p.embedding_path, "rb") as f:
        X = pickle.load(f)["embeddings"]

    if params["is_multilabel"]:
        metric_res = multilabel_classification(X, params)
    else:
        metric_res = classification(X, params)
    res.update(metric_res)
    return res
Example #5
    def __init__(self, params, w=None, c=None):
        p = ct.obj_dic(params)
        self.dim = p.dim
        self.lr = p.learn_rate
        self.k = p.num_sampled
        self.optimizer = p.optimizer
        self.epoch_num = p.epoch_num
        self.show_num = p.show_num
        self.size_subgraph = p.size_subgraph
        self.num_nodes = p.num_nodes
        if "num_remain_edges" in params:
            self.num_edges = p.num_remain_edges
        else:
            self.num_edges = p.num_edges
        self.num_edges_subgraph = p.num_edges_subgraph
        self.batch_size = p.batch_size
        self.logger = p.log

        self.tensor_graph = tf.Graph()
        with self.tensor_graph.as_default():
            tf.set_random_seed(random.randint(0, 1e9))
            self.w_pos_id = tf.placeholder(tf.int32, shape=[None])
            self.w_neg_id = tf.placeholder(tf.int32, shape=[None])
            self.c_pos_id = tf.placeholder(tf.int32, shape=[None])
            self.c_neg_id = tf.placeholder(tf.int32, shape=[None])

            if w is None:
                self.w = tf.Variable(tf.random_uniform(
                    [self.size_subgraph, self.dim],
                    -1.0 / self.size_subgraph, 1.0 / self.size_subgraph),
                                     dtype=tf.float32)
            else:
                self.w = tf.Variable(w, dtype=tf.float32)
            if c is None:
                # as in Example #3, the undefined size_subgragh /
                # embedding_size are presumably size_subgraph and dim
                self.c = tf.Variable(tf.truncated_normal(
                    [self.size_subgraph, self.dim], -1.0, 1.0),
                                     dtype=tf.float32)
            else:
                self.c = tf.Variable(c, dtype=tf.float32)

            self.embed_pos = tf.nn.embedding_lookup(self.w, self.w_pos_id)
            self.embed_neg = tf.nn.embedding_lookup(self.w, self.w_neg_id)
            self.c_pos = tf.nn.embedding_lookup(self.c, self.c_pos_id)
            self.c_neg = tf.nn.embedding_lookup(self.c, self.c_neg_id)
            
            self.pos_dot = tf.reduce_sum(tf.multiply(self.embed_pos, self.c_pos), axis=1)
            self.neg_dot = tf.reduce_sum(tf.multiply(self.embed_neg, self.c_neg), axis=1)

            # the negative term is upweighted by (#node pairs)/(#edges)
            # in the subgraph
            self.loss = -tf.reduce_mean(tf.log_sigmoid(self.pos_dot)) - \
                    float(self.size_subgraph ** 2) / float(self.num_edges_subgraph) * \
                    tf.reduce_mean(tf.log_sigmoid(-self.neg_dot))
            self.train_step = getattr(tf.train, self.optimizer)(self.lr).minimize(self.loss)
Example #6
def merge_embedding(params, info, pre_res, **kwargs):
    res = params_handler(params, info, pre_res)
    p = ct.obj_dic(params)
    # load embeddings
    # TODO use redis
    embeddings = np.empty((p.num_nodes, p.dim), dtype=np.float32)
    weights = np.empty((p.num_nodes, p.dim), dtype=np.float32)

    def read_embeddings(path):
        # scatter each partition's rows back to their global node ids
        # via the partition's "map"
        with io.open(os.path.join(info["res_home"], path), "rb") as f:
            sub_params = pickle.load(f)
            for k, v in sub_params["map"].items():
                embeddings[v, :] = sub_params["embeddings"][k, :]
                weights[v, :] = sub_params["weights"][k, :]

    read_embeddings("topk_info.pkl")
    for i in xrange(p.num_community):
        read_embeddings("%d_info.pkl" % i)

    with io.open(p.save_path, "wb") as f:
        pickle.dump({"embeddings": embeddings, "weights": weights}, f)
    #print embeddings
    #print weights
    return res
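
A toy version of that scatter with made-up sizes, to make the role of "map" concrete (illustrative only):

import numpy as np

embeddings = np.empty((4, 2), dtype=np.float32)
sub = {"embeddings": np.arange(4, dtype=np.float32).reshape(2, 2),
       "map": {0: 3, 1: 1}}  # local row -> global node id
for k, v in sub["map"].items():
    embeddings[v, :] = sub["embeddings"][k, :]
# rows 3 and 1 of embeddings now hold the partition's two rows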
Example #7
def graph_split(params, info, pre_res, **kwargs):
    res = params_handler(params, info, pre_res)
    for k, v in info["network_folder"].items():
        if k == "name":
            continue
        shutil.copy(os.path.join(params["folder_path"], v), res["train_path"])
        shutil.copy(os.path.join(params["folder_path"], v), res["test_path"])
    #folder_info = {k: os.path.join(info["home_path"], v) for k, v in info["network_folder"].items()}
    folder_info = ct.obj_dic(info["network_folder"])
    node_list = []
    with open(os.path.join(params["folder_path"], folder_info.entity),
              "r",
              encoding="gb2312") as f:
        for line in f:
            items = line_init(line, 2)
            if items is None:
                continue
            node_list.append((int(items[1]), items[0]))

    n = len(node_list)
    n_test = int(float(n) * params["test_ratio"])
    n_train = n - n_test

    random.shuffle(node_list)
    # original id -> [new shuffled index, name]; nodes whose new index
    # falls below n_train form the training split
    node_dic = {it[0]: [idx, it[1]] for idx, it in enumerate(node_list)}
    # write entity file
    f_test = open(os.path.join(res["test_path"], folder_info.entity),
                  "w",
                  encoding="gb2312")
    f_train = open(os.path.join(res["train_path"], folder_info.entity),
                   "w",
                   encoding="gb2312")
    for k, v in node_dic.items():
        f_test.write("%s %d\n" % (v[1], v[0]))
        if v[0] < n_train:
            f_train.write("%s %d\n" % (v[1], v[0]))
    f_test.close()
    f_train.close()

    # write edge file
    f_test = open(os.path.join(res["test_path"], folder_info.edge), "w")
    f_train = open(os.path.join(res["train_path"], folder_info.edge), "w")
    with open(os.path.join(params["folder_path"], folder_info.edge), "r") as f:
        for line in f:
            items = line_init(line, 2)
            if items is None:
                continue
            it = [node_dic[int(i)][0] for i in items]
            f_test.write("%d %d\n" % (it[0], it[1]))
            if it[0] < n_train and it[1] < n_train:
                f_train.write("%d %d\n" % (it[0], it[1]))
    # append a self-loop for every node
    for i in range(n_train):
        f_test.write("%d %d\n" % (i, i))
        f_train.write("%d %d\n" % (i, i))
    for i in range(n_train, n):
        f_test.write("%d %d\n" % (i, i))

    f_test.close()
    f_train.close()

    # write mix_edge file
    f_test = open(os.path.join(res["test_path"], folder_info.mix_edge), "w")
    f_train = open(os.path.join(res["train_path"], folder_info.mix_edge), "w")
    with open(os.path.join(params["folder_path"], folder_info.mix_edge),
              "r") as f:
        for line in f:
            items = line_init(line, 2)
            if items is None:
                continue
            items[0] = node_dic[int(items[0])][0]
            f_test.write("%d %s\n" % (items[0], items[1]))
            if items[0] < n_train:
                f_train.write("%d %s\n" % (items[0], items[1]))
    f_test.close()
    f_train.close()

    # write label
    f_test = open(os.path.join(res["test_path"], folder_info.label),
                  "w",
                  encoding="gb2312")
    f_train = open(os.path.join(res["train_path"], folder_info.label),
                   "w",
                   encoding="gb2312")
    label_list = []
    with open(os.path.join(params["folder_path"], folder_info.label),
              "r",
              encoding="gb2312") as f:
        for line in f:
            items = line_init(line, 1)
            if items is None:
                continue
            label_list.append(items[0])
    for i in range(n_train):
        f_test.write("%s\n" % label_list[node_list[i][0]])
        f_train.write("%s\n" % label_list[node_list[i][0]])

    for i in range(n_train, n):
        f_test.write("%s\n" % label_list[node_list[i][0]])
    f_test.close()
    f_train.close()

    #write entity_features
    with open(os.path.join(params["folder_path"], folder_info.entity_features),
              "rb") as f:
        features = pickle.load(f)

    idx_list, _ = zip(*node_list)
    with open(os.path.join(res["test_path"], folder_info.entity_features),
              "wb") as f:
        pickle.dump(features[idx_list, :], f)
    with open(os.path.join(res["train_path"], folder_info.entity_features),
              "wb") as f:
        pickle.dump(features[idx_list[:n_train], :], f)

    return res
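
The split in Example #7 hinges on one renumbering trick: after the shuffle, a node's new index alone decides its split, so the entity, edge, and label files can all be filtered with a single integer comparison. A toy restatement with made-up sizes:

import random

nodes = list(range(10))  # original ids
random.shuffle(nodes)
new_id = {old: new for new, old in enumerate(nodes)}
n_train = 7
train = [u for u in nodes if new_id[u] < n_train]
# an edge (u, v) survives into the training split iff
# new_id[u] < n_train and new_id[v] < n_train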
Example #8
def init(params, info, **kwargs):
    res = params_handler(params, info)
    p = ct.obj_dic(params)

    '''
    # top-k nodes
    q = pq()
    for idx, u in enumerate(G):
        if idx < p.num_top:
            q.put_nowait((G.node[u]["in_degree"], u))
        else:
            tmp = q.get_nowait()
            if tmp[0] <= G.node[u]["in_degree"]:
                q.put_nowait((G.node[u]["in_degree"], u))
            else:
                q.put_nowait(tmp)
    top_lst = []
    top_set = set()
    while not q.empty():
        top_lst.append(q.get_nowait()[1])
        top_set.add(top_lst[-1])
    print top_lst
    '''
    top_out_lst = []
    top_in_lst = []

    r = redis.Redis(host='localhost', port=6379)
    r.config_set('maxmemory', p.redis_maxmemory)
    with open(p.network_path, "r") as f:
        # reset the degree sorted sets and the total-degree counter
        r.zremrangebyrank("out1", 0, -1)
        r.zremrangebyrank("in1", 0, -1)
        r.set("degree", 0)
        for line in f:
            if len(line) == 0:
                continue
            items = line.split()
            if len(items) != 2:
                continue
            if not p.is_directed:
                r.zincrby("in1", 1, items[0])
                r.zincrby("out1", 1, items[1])
                r.incr("degree")
            r.zincrby("out1", 1, items[0])
            r.zincrby("in1", 1, items[1])
            r.incr("degree")

    r.zrevrange("out1", 0, -1, withscores=False)
    r.zrevrange("in1", 0, -1, withscores=False)
    #used_memory = r.info()['used_memory']
    #print "used_memory(Byte): %s" % used_memory

    top_out_lst = r.zrange("out1", 0, p.num_top - 1, desc=True)
    top_in_lst = r.zrange("in1", 0, p.num_top - 1, desc=True)

    top_in_lst = map(int, top_in_lst)   # redis returns strings; convert to int
    top_out_lst = map(int, top_out_lst)
 
    top_set = set(top_in_lst)

    remain_size = r.zcard("out1") - p.num_top

    num_community = remain_size // p.community_size
    if remain_size % p.community_size != 0:
        num_community += 1
    topk_params = {"embeddings" : pi.initialize_embeddings(p.num_top, p.dim),
            "weights" : pi.initialize_weights(p.num_top, p.dim),
            
            #"in_degree": [G.node[i]["in_degree"] for i in top_in_lst],
            #"out_degree": [G.node[i]["out_degree"] for i in top_out_lst],

            "in_degree": [r.zscore("in1",i) for i in top_in_lst],
            "out_degree": [r.zscore("out1",i) for i in top_out_lst],

            "map" : {i : top_in_lst[i]  for i in xrange(len(top_in_lst))}}  
    print topk_params
    with io.open(os.path.join(p.res_path, "topk_info.pkl"), "wb") as f:
        pickle.dump(topk_params, f)

    def deal_subgraph(idx, st, ed):
        sub_params = {"embeddings": pi.initialize_embeddings(ed - st, p.dim),
                "weights": pi.initialize_weights(ed - st, p.dim),
                #"map" : {i : node_lst[st + i] for i in xrange(ed - st)}}
                "map": {i: int(r.zrange("out1", st + i, st + i)[0]) for i in xrange(ed - st)}}
        print(sub_params)
        with io.open(os.path.join(p.res_path, "%d_info.pkl" % idx), "wb") as f:
            pickle.dump(sub_params, f)

    for i in xrange(num_community):
        deal_subgraph(i, i * p.community_size, min((i + 1) * p.community_size, remain_size))

    # calculate prob
    def cal_q1():
        # probability that two distinct random non-top nodes land in the
        # same community, assuming K - 1 full communities of size na and
        # one remainder community of size nr
        K = float(num_community)
        na = float(p.community_size)
        n = p.num_nodes - p.num_top
        nr = float(n % p.community_size)
        n = float(n)
        return (K - 1) * na / n * (na - 1) / (n - 1) + nr * (nr - 1) / n / (n - 1)

    info["q"] = [cal_q1(), 1.0, float(num_community)]
    tmp = p.num_nodes - p.num_top
    info["Z"] = [0.0, info["q"][0] * tmp * tmp + \
            tmp * p.num_top + info["q"][2] * p.num_top * p.num_top]

    with open(p.network_path, "r") as f:
        for line in f:
            if len(line) == 0:
                continue
            items = line.split()
            if len(items) != 2:
                continue
            if int(items[0]) in top_set and int(items[1]) in top_set:
                info["Z"][0] += info["q"][2]
            elif int(items[0]) in top_set or int(items[1]) in top_set:
                info["Z"][0] += 1
            else:
                info["Z"][0] += info["q"][0]
            if not p.is_directed:
                if int(items[1]) in top_set and int(items[0]) in top_set:
                    info["Z"][0] += info["q"][2]
                elif int(items[1]) in top_set or int(items[0]) in top_set:
                    info["Z"][0] += 1
                else:
                    info["Z"][0] += info["q"][0]
    '''
    for e in G.edges():
        if e[0] in top_set and e[1] in top_set:
            info["Z"][0] += info["q"][2]
        elif e[0] in top_set or e[1] in top_set:
            info["Z"][0] += 1
        else:
            info["Z"][0] += info["q"][0]
    '''

    info["total_degree"] = r.get("degree")
    info["num_community"] = num_community
    res["data_path"] = p.res_path
    print(info)
    return res
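
The Redis part of Example #8 is degree counting with sorted sets. Restated with in-memory Counters (illustrative, the edge list is made up): zincrby acts as a per-member counter and zrange(..., desc=True) as a sort by count:

from collections import Counter

edges = [("0", "1"), ("0", "2"), ("1", "2")]
in_deg, out_deg = Counter(), Counter()
for u, v in edges:
    out_deg[u] += 1  # ~ r.zincrby("out1", 1, u)
    in_deg[v] += 1   # ~ r.zincrby("in1", 1, v)
num_top = 2
top_in_lst = [int(n) for n, _ in in_deg.most_common(num_top)]
print(top_in_lst)    # ~ r.zrange("in1", 0, num_top - 1, desc=True)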
Example #9
def split_graph(params, info, pre_res, **kwargs):
    res = params_handler(params, info, pre_res)
    p = ct.obj_dic(params)

    # read top-k
    with io.open(os.path.join(p.data_path, "topk_info.pkl"), "rb") as f:
        topk_params = pickle.load(f)
    top_set = set(v for k, v in topk_params["map"].items())

    #get node lst
    G = gh.load_unweighted_digraph(p.network_path, p.is_directed)
    node_lst = []
    for u in G:
        if u not in top_set:
            node_lst.append(u)
    random.shuffle(node_lst)
    #print node_lst
    #group = {u : idx / p.community_size for idx, u in enumerate(node_lst)}
    # fill the small communities first, then assign the rest to the large ones
    tmp = p.community_size_small * p.num_community_small
    group = {}
    for idx, u in enumerate(node_lst):
        if idx < tmp:
            group[u] = idx // p.community_size_small
        else:
            group[u] = p.num_community_small + (idx - tmp) // p.community_size_large

    #print group
    tmp_files = [FileOutstream(os.path.join(p.tmp_path, "%d" % i)) for i in xrange(p.num_community)]
    for i in xrange(p.num_community):
        with io.open(os.path.join(p.data_path, "%d_info.pkl" % i), "rb") as f:
            sub_params = pickle.load(f)
        for j in sub_params["map"]:
            s = json.dumps((sub_params["embeddings"][j].tolist(),
                sub_params["weights"][j].tolist(),
                sub_params["map"][j],
                sub_params["in_degree"][j],
                sub_params["out_degree"][j]))
            #print s
            u = sub_params["map"][j]
            tmp_files[group[u]].writeline(s)
    del tmp_files
    gc.collect()

    num_ignore = 0
    edge_files = [FileOutstream(os.path.join(p.res_path, "%d_edges" % i)) for i in xrange(p.num_community)]
    topk_edge_file = FileOutstream(os.path.join(p.res_path, "topk_edges"))
    for e in G.edges():
        if e[0] in top_set and e[1] in top_set:
            topk_edge_file.write("%d\t%d\n" % e)
        elif e[0] in top_set:
            edge_files[group[e[1]]].write("%d\t%d\n" % e)
        elif e[1] in top_set or group[e[0]] == group[e[1]]:
            edge_files[group[e[0]]].write("%d\t%d\n" % e)
        else:
            num_ignore += 1
    print "Number of ignored edges: " + str(num_ignore)
    print "Number of edges: " + str(len(G.edges()))
    del edge_files
    del topk_edge_file
    gc.collect()

    for i in xrange(p.num_community):
        embeddings = []
        weights = []
        mapp = {}
        inds = []
        outds = []
        with io.open(os.path.join(p.tmp_path, "%d" % i), "rb") as f:
            for idx, line in enumerate(f):
                line = line.strip()
                if len(line) == 0:
                    continue
                embed, weight, u, ind, outd = json.loads(line)
                embeddings.append(embed)
                weights.append(weight)
                mapp[idx] = u
                outds.append(outd)
                inds.append(ind)

        sub_params = {"embeddings": np.array(embeddings),
                "weights": np.array(weights),
                "map": mapp,
                "in_degree": inds,
                "out_degree": outds}
        #print sub_params
        with io.open(os.path.join(p.res_path, "%d_info.pkl" % i), "wb") as f:
            pickle.dump(sub_params, f)
            #print sub_params
    #res["data_path"] = p.res_path
    res["num_ignore"] = num_ignore
    return res
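
The edge loop in Example #9 encodes a single routing rule. A hedged restatement as a standalone function (not repo code): an edge between two top-k nodes goes to the shared top-k file, an edge from a top-k node follows its other endpoint's community, an edge into a top-k node or within one community stays with that community, and a cross-community edge between ordinary nodes is dropped:

def route_edge(u, v, top_set, group):
    if u in top_set and v in top_set:
        return "topk"
    if u in top_set:
        return group[v]
    if v in top_set or group[u] == group[v]:
        return group[u]
    return None  # cross-community edge: ignored

print(route_edge(1, 2, {1}, {2: 0}))  # 0: follows the non-top endpoint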
Example #10
def init(params, info, **kwargs):
    res = params_handler(params, info)
    p = ct.obj_dic(params)

    G = gh.load_unweighted_digraph(p.network_path, p.is_directed)
    info["num_edges"] = len(G.edges())
    # top-k nodes: keep a size-num_top min-heap keyed by in-degree
    q = pq()
    for idx, u in enumerate(G):
        if idx < p.num_top:
            q.put_nowait((G.node[u]["in_degree"], u))
        else:
            tmp = q.get_nowait()
            if tmp[0] <= G.node[u]["in_degree"]:
                q.put_nowait((G.node[u]["in_degree"], u))
            else:
                q.put_nowait(tmp)
    top_lst = []
    top_set = set()
    while not q.empty():
        top_lst.append(q.get_nowait()[1])
        top_set.add(top_lst[-1])
    print "top_lst: " + str(top_lst)

    node_lst = []
    for u in G:
        if u not in top_set:
            node_lst.append(u)

    remain_size = len(node_lst)
    # split the remaining nodes into num_community parts as evenly as
    # possible: each large community holds one node more than a small one
    num_community = (remain_size + p.community_bound - 1) // p.community_bound
    num_community_large = remain_size % num_community
    num_community_small = num_community - num_community_large
    community_size_small = remain_size // num_community
    community_size_large = community_size_small + 1

    #print remain_size, num_community, num_community_small, num_community_large, community_size_small, community_size_large

    topk_params = {
        "embeddings": pi.initialize_embeddings(p.num_top, p.dim),
        "weights": pi.initialize_weights(p.num_top, p.dim),
        "in_degree": [G.node[i]["in_degree"] for i in top_lst],
        "out_degree": [G.node[i]["out_degree"] for i in top_lst],
        "map": {i: top_lst[i]
                for i in xrange(len(top_lst))}
    }
    #print topk_params
    with io.open(os.path.join(p.res_path, "topk_info.pkl"), "wb") as f:
        pickle.dump(topk_params, f)

    def deal_subgraph(idx, st, ed):
        sub_params = {
            "embeddings":
            pi.initialize_embeddings(ed - st, p.dim),
            "weights":
            pi.initialize_weights(ed - st, p.dim),
            "in_degree":
            [G.node[node_lst[st + i]]["in_degree"] for i in xrange(ed - st)],
            "out_degree":
            [G.node[node_lst[st + i]]["out_degree"] for i in xrange(ed - st)],
            "map": {i: node_lst[st + i]
                    for i in xrange(ed - st)}
        }
        #print sub_params
        with io.open(os.path.join(p.res_path, "%d_info.pkl" % idx), "wb") as f:
            pickle.dump(sub_params, f)

    for i in xrange(num_community_small):
        deal_subgraph(i, i * community_size_small,
                      (i + 1) * community_size_small)

    tmp = num_community_small * community_size_small
    for i in xrange(num_community_small, num_community):
        deal_subgraph(
            i, tmp + (i - num_community_small) * community_size_large,
            tmp + (i - num_community_small + 1) * community_size_large)

    info["num_community"] = num_community
    info["num_community_small"] = num_community_small
    info["num_community_large"] = num_community_large
    info["community_size_small"] = community_size_small
    info["community_size_large"] = community_size_large

    #print info

    # calculate prob
    def cal_q1():
        # probability that two distinct random non-top nodes land in the
        # same community
        nl = float(community_size_small)
        nh = float(community_size_large)
        Kl = float(num_community_small)
        Kh = float(num_community_large)
        n = float(p.num_nodes - p.num_top)
        return Kl * nl / n * (nl - 1) / (n - 1) + \
                Kh * nh / n * (nh - 1) / (n - 1)

    info["q"] = [cal_q1(), 1.0, float(num_community) if p.q2 is None else p.q2]
    tmp = p.num_nodes - p.num_top
    info["Z"] = [0.0, info["q"][0] * tmp * tmp + \
            2.0 * tmp * p.num_top + info["q"][2] * p.num_top * p.num_top]
    info["num_topk_edges"] = 0
    for e in G.edges():
        if e[0] in top_set and e[1] in top_set:
            info["Z"][0] += info["q"][2]
            info["num_topk_edges"] += 1
        elif e[0] in top_set or e[1] in top_set:
            info["Z"][0] += 1
        else:
            info["Z"][0] += info["q"][0]

    info["total_degree"] = G.graph["degree"]
    info["num_community"] = num_community
    res["data_path"] = p.res_path
    print "Info: ", info["q"], info["Z"]
    #print "End!!"
    return res
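
A quick numeric check of cal_q1 (not repo code; the sizes are made up): enumerate all ordered node pairs under Kl small communities of size nl and Kh large ones of size nl + 1, and compare with the closed form:

import itertools

Kl, Kh, nl = 3, 2, 4
sizes = [nl] * Kl + [nl + 1] * Kh
n = float(sum(sizes))
q1 = Kl * nl / n * (nl - 1) / (n - 1) + Kh * (nl + 1) / n * nl / (n - 1)
labels = [c for c, s in enumerate(sizes) for _ in range(s)]
same = sum(1 for i, j in itertools.permutations(range(int(n)), 2)
           if labels[i] == labels[j])
print(q1, same / (n * (n - 1)))  # both ~0.1645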
Example #11
def split_graph(params, info, pre_res, **kwargs):
    r = redis.Redis(host='localhost', port=6379)
    res = params_handler(params, info, pre_res)
    p = ct.obj_dic(params)

    # read top-k
    with io.open(os.path.join(p.data_path, "topk_info.pkl"), "rb") as f:
        topk_params = pickle.load(f)
    top_set = set(v for k, v in topk_params["map"].items())

    #get node lst
    shufflenode_len = r.llen("shuffle_node")
    for i in range(0, shufflenode_len):
        r.lpop("shuffle_node")
    # the graph is treated as undirected here, so the "in1" sorted set
    # doubles as the temporary pool of remaining nodes
    for i in range(0, r.zcard("in1") - p.num_top):
        range_index = random.randint(0, int(r.zcard("in1") - p.num_top - 1))
        r.rpush("shuffle_node", int(r.zrange("in1", range_index, range_index)[0]))
        r.zremrangebyrank("in1", range_index, range_index)
    print(map(int, r.lrange("shuffle_node", 0, -1)))

    #used_memory = r.info()['used_memory']
    #print "used_memory(Byte): %s" % used_memory
    #group = {u : idx / p.community_size for idx, u in enumerate(node_lst)}
    group = {u: idx // p.community_size
             for idx, u in enumerate(map(int, r.lrange("shuffle_node", 0, -1)))}

    print(group)
    tmp_files = [FileOutstream(os.path.join(p.tmp_path, "%d" % i)) for i in xrange(p.num_community)]
    for i in xrange(p.num_community):
        with io.open(os.path.join(p.data_path, "%d_info.pkl" % i), "rb") as f:
            sub_params = pickle.load(f)
        for j in sub_params["map"]:
            s = json.dumps((sub_params["embeddings"][j].tolist(),
                sub_params["weights"][j].tolist(),
                sub_params["map"][j]))
            print(s)
            u = sub_params["map"][j]
            tmp_files[group[u]].writeline(s)
    del tmp_files

    num_ignore = 0
    edge_files = [FileOutstream(os.path.join(p.res_path, "%d_edges" % i)) for i in xrange(p.num_community)]
    '''
    for e in G.edges():
        if e[0] in top_set and e[1] in top_set:
            for idx, f in enumerate(edge_files):
                edge_files[idx].write("%d\t%d\n" % e)
        elif e[0] in top_set:
            edge_files[group[e[1]]].write("%d\t%d\n" % e)
        elif e[1] in top_set or group[e[0]] == group[e[1]]:
            edge_files[group[e[0]]].write("%d\t%d\n" % e)
        else:
            num_ignore += 1
    '''
    with open(p.network_path, "r") as f:
        for line in f:
            if len(line) == 0:
                continue
            items = line.split()
            if len(items) != 2:
                continue
            if int(items[0]) in top_set and int(items[1]) in top_set:
                for idx, f in enumerate(edge_files):
                    edge_files[idx].write("%d\t%d\n" % (int(items[0]), int(items[1])))
            elif int(items[0]) in top_set:
                edge_files[group[int(items[1])]].write("%d\t%d\n" % (int(items[0]), int(items[1])))
            elif int(items[1]) in top_set or group[int(items[0])] == group[int(items[1])]:
                edge_files[group[int(items[0])]].write("%d\t%d\n" % (int(items[0]), int(items[1])))
            else:
                num_ignore += 1
            if not p.is_directed:
                if int(items[1]) in top_set and int(items[0]) in top_set:
                    for idx, f in enumerate(edge_files):
                        edge_files[idx].write("%d\t%d\n" % (int(items[1]), int(items[0])))
                elif int(items[1]) in top_set:
                    edge_files[group[int(items[0])]].write("%d\t%d\n" % (int(items[1]), int(items[0])))
                elif int(items[0]) in top_set or group[int(items[1])] == group[int(items[0])]:
                    edge_files[group[int(items[1])]].write("%d\t%d\n" % (int(items[1]), int(items[0])))
                else:
                    num_ignore += 1
     
    print("Number of ignored edges: %d" % num_ignore)
    del edge_files

    for i in xrange(p.num_community):
        embeddings = []
        weights = []
        mapp = {}
        with io.open(os.path.join(p.tmp_path, "%d" % i), "rb") as f:
            for idx, line in enumerate(f):
                line = line.strip()
                if len(line) == 0:
                    continue
                embed, weight, u = json.loads(line)
                embeddings.append(embed)
                weights.append(weight)
                mapp[idx] = u

        sub_params = {"embeddings": np.array(embeddings),
                "weights": np.array(weights),
                "map": mapp}
        print(sub_params)
        with io.open(os.path.join(p.res_path, "%d_info.pkl" % i), "wb") as f:
            pickle.dump(sub_params, f)
    #res["data_path"] = p.res_path
    return res