Example #1
    def __call__(self):
        np.random.seed(os.getpid())
        if self.neg_sample_type == "outdegree":
            outdegree = self.graph.outdegree()
            distribution = 1. * outdegree / outdegree.sum()
            alias, events = alias_sample_build_table(distribution)
        max_len = int(self.batch_size * self.walk_len *
                      ((1 + self.win_size) - 0.3))
        for walks in self.walk_generator():
            src, pos = [], []
            for walk in walks:
                s, p = skip_gram_gen_pair(walk, self.win_size)
                src.extend(s)
                pos.extend(p)
            src = np.array(src, dtype=np.int64)
            pos = np.array(pos, dtype=np.int64)
            src, pos = np.reshape(src, [-1, 1, 1]), np.reshape(pos, [-1, 1, 1])

            if src.shape[0] == 0:
                continue
            neg_sample_size = [len(pos), self.neg_num, 1]
            if self.neg_sample_type == "average":
                negs = self.graph.sample_nodes(neg_sample_size)
            elif self.neg_sample_type == "outdegree":
                negs = alias_sample(neg_sample_size, alias, events)
            # [batch_size, 1, 1] [batch_size, neg_num+1, 1]
            dst = np.concatenate([pos, negs], 1)
            src_feat = np.concatenate([src, self.node_feat[src[:, :, 0]]], -1)
            dst_feat = np.concatenate([dst, self.node_feat[dst[:, :, 0]]], -1)
            src_feat, dst_feat = np.expand_dims(src_feat, -1), np.expand_dims(
                dst_feat, -1)
            yield src_feat[:max_len], dst_feat[:max_len]
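
The examples above and below all lean on the same pair of helpers: alias_sample_build_table turns a probability vector into two flat tables, and alias_sample then draws from them in constant time per sample. For orientation, here is a minimal pure-NumPy sketch of Walker's alias method; _build_alias_table and _draw are hypothetical stand-ins, not PGL's compiled helpers, and the layout of PGL's actual (alias, events) pair may differ from this convention.

import numpy as np

def _build_alias_table(probs):
    """Walker's alias method: O(n) build, O(1) per draw (sketch only)."""
    n = len(probs)
    accept = np.asarray(probs, dtype=np.float64) * n  # scale so the mean is 1
    alias = np.zeros(n, dtype=np.int64)
    small = [i for i in range(n) if accept[i] < 1.0]
    large = [i for i in range(n) if accept[i] >= 1.0]
    while small and large:
        s, l = small.pop(), large.pop()
        alias[s] = l                  # bucket s overflows into outcome l
        accept[l] -= 1.0 - accept[s]  # l donates mass to fill bucket s
        (small if accept[l] < 1.0 else large).append(l)
    return alias, accept

def _draw(size, alias, accept):
    """Pick a bucket uniformly, then keep it or take its alias."""
    idx = np.random.randint(0, len(accept), size=size)
    keep = np.random.uniform(size=idx.shape) < accept[idx]
    return np.where(keep, idx, alias[idx])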
Example #2
def dump_graph(args):
    if not os.path.exists(args.outpath):
        os.makedirs(args.outpath)
    neg_samples = []
    str2id = dict()
    term_file = io.open(os.path.join(args.outpath, "terms.txt"),
                        "w",
                        encoding=args.encoding)
    terms = []
    count = 0
    item_distribution = []

    with io.open(args.inpath, encoding=args.encoding) as f:
        edges = []
        for idx, line in enumerate(f):
            if idx % 100000 == 0:
                log.info("%s readed %s lines" % (args.inpath, idx))
            slots = []
            for col_idx, col in enumerate(line.strip("\n").split("\t")):
                s = col[:args.max_seqlen]
                if s not in str2id:
                    str2id[s] = count
                    count += 1
                    term_file.write(str(col_idx) + "\t" + col + "\n")
                    item_distribution.append(0)

                slots.append(str2id[s])

            src = slots[0]
            dst = slots[1]
            neg_samples.append(slots[2:])
            edges.append((src, dst))
            edges.append((dst, src))
            item_distribution[dst] += 1

        term_file.close()
        edges = np.array(edges, dtype="int64")
        num_nodes = len(str2id)
        str2id.clear()
    log.info("building graph...")
    graph = pgl.graph.Graph(num_nodes=num_nodes, edges=edges)
    # touch degrees once so the graph's degree indexes are built before dumping
    graph.indegree()
    graph.outdegree()
    graph.dump(args.outpath)

    # dump alias sample table
    item_distribution = np.array(item_distribution)
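    # square-root smoothing: flatten the raw popularity counts before normalizing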
    item_distribution = np.sqrt(item_distribution)
    distribution = 1. * item_distribution / item_distribution.sum()
    alias, events = alias_sample_build_table(distribution)
    np.save(os.path.join(args.outpath, "alias.npy"), alias)
    np.save(os.path.join(args.outpath, "events.npy"), events)
    np.save(os.path.join(args.outpath, "neg_samples.npy"),
            np.array(neg_samples))
    log.info("End Build Graph")
Example #3
    def test_result(self):
        """test_result
        """
        size = [450000]
        num = 10
        probs = np.arange(1, num).astype(np.float64)
        probs /= np.sum(probs)
        alias, events = alias_sample_build_table(probs)
        ret = alias_sample(size, alias, events)
        cnt = Counter(ret)
        sort_cnt_keys = [x[1] for x in sorted(zip(cnt.values(), cnt.keys()))]
        self.assertEqual(sort_cnt_keys, np.arange(0, num - 1).tolist())
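
The ranking assertion works because probs grows with the index, so with 450k draws the count ordering must match the index ordering. A more direct check under the same setup is to compare empirical frequencies against probs:

        freq = np.bincount(ret, minlength=num - 1) / float(len(ret))
        # 450k draws put the standard error well below this tolerance
        assert np.allclose(freq, probs, atol=5e-3)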
Example #4
def graph_alias_sample_table(graph, edge_weight_name):
    """Build alias sample table for weighted deepwalk.
    Args:
        graph: The input graph
        edge_weight_name: The name of edge weight in edge_feat.

    Return:
        Alias sample tables for each node.
    """
    edge_weight = graph.edge_feat[edge_weight_name]
    _, eids_array = graph.successor(return_eids=True)
    alias_array, events_array = [], []
    for eids in eids_array:
        probs = edge_weight[eids]
        probs /= np.sum(probs)
        alias, events = graph_kernel.alias_sample_build_table(probs)
        alias_array.append(alias)
        events_array.append(events)
    alias_array, events_array = np.array(alias_array), np.array(events_array)
    return alias_array, events_array
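
A typical consumer of these per-node tables is a weighted random walk: at each step, draw a slot into the current node's successor list through that node's alias table. A sketch, assuming successors is the first value of graph.successor(return_eids=True) (discarded as _ above) so its ordering matches eids_array:

def weighted_walk_step(node, successors, alias_array, events_array):
    """Move from `node` to a neighbor with probability proportional to edge weight."""
    # the sampled slot indexes node's successor list in the same order as its eids
    slot = alias_sample([1], alias_array[node], events_array[node])[0]
    return successors[node][slot]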
Example #5
    def test_speed(self):
        """test_speed
        """

        num = 1000
        size = [10240, 1, 5]
        probs = np.random.uniform(0.0, 1.0, [num])
        probs /= np.sum(probs)

        start = time.time()
        alias, events = alias_sample_build_table(probs)
        for i in range(100):
            alias_sample(size, alias, events)
        alias_sample_time = time.time() - start

        start = time.time()
        for i in range(100):
            np.random.choice(num, size, p=probs)
        np_sample_time = time.time() - start
        self.assertTrue(alias_sample_time < np_sample_time)
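
The gap this test asserts is structural: the alias tables cost O(num) to build once, after which each drawn value is O(1), whereas np.random.choice with an explicit p must process the full probability vector on every call.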
Example #6
def dump_graph(config):
    if not os.path.exists(config.graph_work_path):
        os.makedirs(config.graph_work_path)
    str2id = dict()
    term_file = io.open(os.path.join(config.graph_work_path, "terms.txt"),
                        "w",
                        encoding=config.encoding)
    terms = []
    item_distribution = []

    edges = load_graph(config, str2id, term_file, terms, item_distribution)
    #load_train_data(config, str2id, term_file, terms, item_distribution)
    if config.task == "link_predict":
        load_link_predict_train_data(config, str2id, term_file, terms,
                                     item_distribution)
    elif config.task == "node_classification":
        load_node_classification_train_data(config, str2id, term_file, terms,
                                            item_distribution)
    else:
        raise ValueError("unknown task: %s" % config.task)

    term_file.close()
    num_nodes = len(str2id)
    str2id.clear()

    log.info("building graph...")
    graph = pgl.graph.Graph(num_nodes=num_nodes, edges=edges)
    # touch degrees once so the graph's degree indexes are built before dumping
    graph.indegree()
    graph.outdegree()
    graph.dump(config.graph_work_path)

    # dump alias sample table
    item_distribution = np.array(item_distribution)
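    # square-root smoothing: flatten the raw popularity counts before normalizing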
    item_distribution = np.sqrt(item_distribution)
    distribution = 1. * item_distribution / item_distribution.sum()
    alias, events = alias_sample_build_table(distribution)
    np.save(os.path.join(config.graph_work_path, "alias.npy"), alias)
    np.save(os.path.join(config.graph_work_path, "events.npy"), events)
    log.info("End Build Graph")
Example #7
    def __call__(self):
        np.random.seed(os.getpid())
        if self.neg_sample_type == "outdegree":
            outdegree = self.graph.outdegree()
            distribution = 1. * outdegree / outdegree.sum()
            alias, events = alias_sample_build_table(distribution)
        max_len = int(self.batch_size * self.walk_len *
                      ((1 + self.win_size) - 0.3))
        for walks in self.walk_generator():
            try:
                src_list, pos_list = [], []
                for walk in walks:
                    s, p = skip_gram_gen_pair(walk, self.win_size)
                    src_list.append(s[:max_len])
                    pos_list.append(p[:max_len])
                src = [s for x in src_list for s in x]
                pos = [s for x in pos_list for s in x]
                src = np.array(src, dtype=np.int64)
                pos = np.array(pos, dtype=np.int64)
                src, pos = np.reshape(src, [-1, 1, 1]), np.reshape(pos, [-1, 1, 1])

                neg_sample_size = [len(pos), self.neg_num, 1]
                if src.shape[0] == 0:
                    continue
                if self.neg_sample_type == "average":
                    negs = np.random.randint(low=0,
                                             high=self.graph.num_nodes,
                                             size=neg_sample_size)
                elif self.neg_sample_type == "outdegree":
                    negs = alias_sample(neg_sample_size, alias, events)
                elif self.neg_sample_type == "inbatch":
                    pass
                dst = np.concatenate([pos, negs], 1)
                # [batch_size, 1, 1] [batch_size, neg_num+1, 1]
                yield src[:max_len], dst[:max_len]
            except Exception as e:
                log.exception(e)
Example #8
    def normlization_layer_weight(self):
        """
        Normlation the distance between nodes, weight[1, 2, ....N] = distance[1, 2, ......N] / sum(distance)
        """
        for sd_keys, layer_weight in self.distance.items():
            src, dist = sd_keys
            layers, weights = layer_weight.keys(), layer_weight.values()
            for layer, weight in zip(layers, weights):
                if layer not in self.layer_distance:
                    self.layer_distance[layer] = {}
                if layer not in self.layer_message:
                    self.layer_message[layer] = {}
                self.layer_distance[layer][src, dist] = weight

                if src not in self.layer_message[layer]:
                    self.layer_message[layer][src] = []
                if dist not in self.layer_message[layer]:
                    self.layer_message[layer][dist] = []
                self.layer_message[layer][src].append(dist)
                self.layer_message[layer][dist].append(src)

        # normalize the layer weights
        for i in range(0, self.depth):
            layer_weight = 0.0
            layer_count = 0
            if i not in self.layer_norm_distance:
                self.layer_norm_distance[i] = {}
            if i not in self.sample_alias:
                self.sample_alias[i] = {}
            if i not in self.sample_events:
                self.sample_events[i] = {}
            if i not in self.layer_message:
                continue
            for node in self.nodes:
                if node not in self.layer_message[i]:
                    continue
                nbhs = self.layer_message[i][node]
                weights = []
                sum_weight = 0.0
                for dist in nbhs:
                    if (node, dist) in self.layer_distance[i]:
                        weight = self.layer_distance[i][node, dist]
                    else:
                        weight = self.layer_distance[i][dist, node]
                    weight = np.exp(-float(weight))
                    weights.append(weight)
                # normalize the neighbor weights
                sum_weight = sum(weights)
                if sum_weight == 0.0:
                    sum_weight = 1.0
                weight_list = [weight / sum_weight for weight in weights]
                self.layer_norm_distance[i][node] = weight_list
                alias, events = alias_sample_build_table(np.array(weight_list))
                self.sample_alias[i][node] = alias
                self.sample_events[i][node] = events
                layer_weight += 1.0
                #layer_weight += sum(weight_list)
                layer_count += len(weights)
            layer_avg_weight = layer_weight / (1.0 * layer_count)

            self.layer_node_weight_count[i] = dict()
            for node in self.nodes:
                if node not in self.layer_norm_distance[i]:
                    continue
                weight_list = self.layer_norm_distance[i][node]
                node_cnt = 0
                for weight in weight_list:
                    if weight > layer_avg_weight:
                        node_cnt += 1
                self.layer_node_weight_count[i][node] = node_cnt
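
These per-layer, per-node tables are what a struc2vec-style walk consumes: to move within layer i, a node draws one of its same-layer neighbors according to the normalized weights. A sketch reusing the structures filled in above (alias_sample as in the other examples); the sampled slot aligns with self.layer_message[i][node] because the weights were accumulated in that order:

    def sample_layer_neighbor(self, layer, node):
        """Draw a same-layer neighbor of `node` using its alias table."""
        nbhs = self.layer_message[layer][node]
        slot = alias_sample([1], self.sample_alias[layer][node],
                            self.sample_events[layer][node])[0]
        return nbhs[slot]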