Example #1
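The snippet below omits its module header. A likely set of imports is sketched here; spmm is assumed to be a project-local sparse matrix-multiply helper, since its four-argument call does not match torch_sparse.spmm.

# Assumed imports for this example (a sketch, not part of the original code);
# "spmm" is taken to be a local helper with signature spmm(edge_index, values, N, dense_matrix).
import os
import scipy.sparse
import torch
import torch.nn.functional as F
from torch_scatter import scatter_max, scatter_add
from torch_geometric.nn import dense_diff_pool
from torch_geometric.utils import convert, dense_to_sparse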
    def gat_layer(self, input, adj, genPath=False, eluF=True):
        N = input.size()[0]
        edge = adj._indices()
        h = torch.mm(input, self.W)
        h = h+self.bias                # h: N x out

        # Self-attention on the nodes - Shared attention mechanism
        edge_h = torch.cat((h[edge[0, :], :], h[edge[1, :], :]), dim=1).t()     # edge_h: 2*D x E
        edge_att = self.a.mm(edge_h).squeeze()
        edge_e_a = self.leakyrelu(edge_att)     # edge_e_a: E   attention score for each edge
        if genPath:
            with torch.no_grad():
                edge_weight = edge_e_a
                p_a_e = edge_weight - scatter_max(edge_weight, edge[0,:], dim=0, dim_size=N)[0][edge[0,:]]
                p_a_e = p_a_e.exp()
                p_a_e = p_a_e / (scatter_add(p_a_e, edge[0,:], dim=0, dim_size=N)[edge[0,:]]\
                                    +torch.Tensor([9e-15]).cuda())
                
                scisp = convert.to_scipy_sparse_matrix(edge, p_a_e, N)
                scipy.sparse.save_npz(os.path.join(genPath, 'attmat_{:s}.npz'.format(self.layerN)), scisp)

        edge_e = torch.exp(edge_e_a - torch.max(edge_e_a))                  # edge_e: E
        e_rowsum = spmm(edge, edge_e, N, torch.ones(size=(N,1)).cuda())     # e_rowsum: N x 1
        edge_e = self.dropout(edge_e)       # adding dropout improves accuracy from 82.4 to 83.8
        # edge_e: E
        
        h_prime = spmm(edge, edge_e, N, h)
        h_prime = h_prime.div(e_rowsum+torch.Tensor([9e-15]).cuda())        # h_prime: N x out
        
        if self.concat and eluF:
            return F.elu(h_prime)
        else:
            return h_prime
    def forward(self, nodes, adjs):
        edge, _ = dense_to_sparse(adjs)
        x = self.sage1(nodes, edge)
        s = self.sage2(nodes, edge)
        s = torch.reshape(s, (1, nodes.size(0), 128))

        x = torch.reshape(x, (1, nodes.size(0), 128))

        adjs = torch.reshape(adjs, (1, nodes.size(0), nodes.size(0)))

        x, edge, link_loss1, ent_loss1 = dense_diff_pool(x, adjs, s)

        x = torch.reshape(x, (128, 128))

        edge = torch.reshape(edge, (128, 128))
        #for i in range(edge.size(0)):
        #    edge[i,:] = torch.where(edge[i,:] == torch.max(edge[i,:]),torch.ones(1,128).cuda(), torch.zeros(1,128).cuda())

        edge_out = edge
        edge, _ = dense_to_sparse(edge)
        #nodes_out = x
        x = self.sage3(x, edge)
        nodes_out = torch.tanh(x)

        #x = self.sage4(nodes_out, edge)

        edge = torch.Tensor(
            convert.to_scipy_sparse_matrix(edge).todense()).cuda()
        edge = torch.reshape(edge, (1, 128, 128))

        x = torch.reshape(x, (1, 128, 2))

        s = torch.ones(1, 128, 1).cuda()
        x, edge, link_loss2, ent_loss2 = dense_diff_pool(x, edge, s)

        x = x.reshape(-1)
        link_loss = link_loss1 + link_loss2
        ent_loss = ent_loss1 + ent_loss2
        """ x_out = torch.reshape(x, (128,2))
        edge = torch.reshape(edge, (128,128))
        for i in range(edge.size(0)):
            edge[i,:] = torch.where(edge[i,:] == torch.max(edge[i,:]),torch.ones(1,128).cuda(), torch.zeros(1,128).cuda())
        edge, _ = dense_to_sparse(edge)
        x = self.sage3(x_out, edge)
        x = torch.reshape(x, (128,)) """

        return x, link_loss, ent_loss, nodes_out, edge_out
Example #3
def load_data(args, datapath):
    if args.dataset in ['arxiv'] and args.task == 'lp':
        data = {}
        dataset = PygNodePropPredDataset(name='ogbn-{}'.format(args.dataset),
                                         root='/pasteur/u/jeffgu/hgcn/data')
        split_idx = dataset.get_idx_split()
        train_idx = split_idx["train"]
        valid_idx = split_idx["valid"]
        test_idx = split_idx["test"]
        induced_edges_train, _ = subgraph(train_idx, dataset[0].edge_index)
        induced_edges_valid, _ = subgraph(valid_idx, dataset[0].edge_index)
        induced_edges_test, _ = subgraph(test_idx, dataset[0].edge_index)
        neg_edges_train = negative_sampling(induced_edges_train)
        neg_edges_valid = negative_sampling(induced_edges_valid)
        neg_edges_test = negative_sampling(induced_edges_test)
        data['adj_train'] = to_scipy_sparse_matrix(
            dataset[0].edge_index).tocsr()
        data['features'] = dataset[0].x
        data['train_edges'], data['train_edges_false'] = induced_edges_train, neg_edges_train
        data['val_edges'], data['val_edges_false'] = induced_edges_valid, neg_edges_valid
        data['test_edges'], data['test_edges_false'] = induced_edges_test, neg_edges_test
    elif args.task == 'nc':
        data = load_data_nc(args.dataset, args.use_feats, datapath,
                            args.split_seed)
    else:
        data = load_data_lp(args.dataset, args.use_feats, datapath)
        adj = data['adj_train']
        if args.task == 'lp':
            adj_train, train_edges, train_edges_false, val_edges, val_edges_false, test_edges, test_edges_false = mask_edges(
                adj, args.val_prop, args.test_prop, args.split_seed)
            data['adj_train'] = adj_train
            data['train_edges'], data['train_edges_false'] = train_edges, train_edges_false
            data['val_edges'], data['val_edges_false'] = val_edges, val_edges_false
            data['test_edges'], data['test_edges_false'] = test_edges, test_edges_false
    data['adj_train_norm'], data['features'] = process(data['adj_train'],
                                                       data['features'],
                                                       args.normalize_adj,
                                                       args.normalize_feats)
    if args.dataset == 'airport':
        data['features'] = augment(data['adj_train'], data['features'])
    return data
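A minimal call sketch, assuming an argparse-style args namespace with the fields this loader reads; the dataset name, path, and split proportions are placeholders:

from argparse import Namespace

# Hypothetical configuration; field values are illustrative only.
args = Namespace(dataset='cora', task='lp', use_feats=1,
                 normalize_adj=1, normalize_feats=1,
                 val_prop=0.05, test_prop=0.10, split_seed=1234)
data = load_data(args, datapath='data/cora')
print(data['adj_train'].shape, data['features'].shape)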
Example #4
def load_data_nc(dataset, use_feats, data_path, split_seed):
    if dataset in ['cora', 'pubmed']:
        adj, features, labels, idx_train, idx_val, idx_test = load_citation_data(
            dataset, use_feats, data_path, split_seed)
    elif dataset == 'arxiv':
        dataset = PygNodePropPredDataset(name='ogbn-arxiv',
                                         root='/pasteur/u/jeffgu/hgcn/data')
        split_idx = dataset.get_idx_split()
        idx_train = split_idx["train"]
        idx_val = split_idx["valid"]
        idx_test = split_idx["test"]
        adj = to_scipy_sparse_matrix(dataset[0].edge_index).tocsr()
        features = dataset[0].x
        labels = dataset[0].y
    else:
        if dataset == 'disease_nc':
            adj, features, labels = load_synthetic_data(
                dataset, use_feats, data_path)
            val_prop, test_prop = 0.10, 0.60
        elif dataset == 'airport':
            adj, features, labels = load_data_airport(dataset,
                                                      data_path,
                                                      return_label=True)
            val_prop, test_prop = 0.15, 0.15
        else:
            raise FileNotFoundError(
                'Dataset {} is not supported.'.format(dataset))
        idx_val, idx_test, idx_train = split_data(labels,
                                                  val_prop,
                                                  test_prop,
                                                  seed=split_seed)

    labels = torch.LongTensor(labels)
    data = {
        'adj_train': adj,
        'features': features,
        'labels': labels,
        'idx_train': idx_train,
        'idx_val': idx_val,
        'idx_test': idx_test
    }
    return data
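A hedged usage sketch for the node-classification loader; the dataset name and data_path are placeholders:

# Illustrative call only; 'cora' and the data_path are placeholders.
data = load_data_nc('cora', use_feats=1, data_path='data/cora', split_seed=1234)
print(data['labels'].shape, len(data['idx_train']), len(data['idx_val']), len(data['idx_test']))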
Example #5
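The script below relies on several project-local helpers (graph_fusion, sim_coarse, mtx2graph, read_time, read_levels, construct_proj_laplacian, node2vec, refinement) that are not shown here. A plausible header for the library imports it uses:

# Assumed library imports (a sketch); the GraphZoom-specific helpers named
# above are expected to come from the project's own utility modules.
import os
import time
from argparse import ArgumentParser

import numpy as np
import torch
from ogb.nodeproppred import PygNodePropPredDataset
from scipy.io import mmwrite
from torch_geometric.utils import get_laplacian, to_scipy_sparse_matrix, to_undirected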
def main():
    parser = ArgumentParser(description="GraphZoom")
    parser.add_argument("-d", "--dataset", type=str, default="arxiv", \
            help="input dataset")
    parser.add_argument("-o", "--coarse", type=str, default="lamg", \
            help="choose either simple_coarse or lamg_coarse, [simple, lamg]")
    parser.add_argument("-c", "--mcr_dir", type=str, default="/opt/matlab/R2018A/", \
            help="directory of matlab compiler runtime (only required by lamg_coarsen)")
    parser.add_argument("-s", "--search_ratio", type=int, default=12, \
            help="control the search space in graph fusion process (only required by lamg_coarsen)")
    parser.add_argument("-r", "--reduce_ratio", type=int, default=2, \
            help="control graph coarsening levels (only required by lamg_coarsen)")
    parser.add_argument("-v", "--level", type=int, default=1, \
            help="number of coarsening levels (only required by simple_coarsen)")
    parser.add_argument("-n", "--num_neighs", type=int, default=2, \
            help="control k-nearest neighbors in graph fusion process")
    parser.add_argument("-l", "--lda", type=float, default=0.1, \
            help="control self loop in adjacency matrix")
    parser.add_argument("-e", "--embed_path", type=str, default="./embed_results/", \
            help="path of embedding result")
    parser.add_argument("-m", "--embed_method", type=str, default="node2vec", \
            help="graph embedding method")
    parser.add_argument("-f", "--fusion", default=True, action="store_false", \
            help="whether use graph fusion")
    parser.add_argument("-p", "--power", default=False, action="store_true", \
            help="Strong power of graph filter, set True to enhance filter power")
    parser.add_argument("-g", "--sage_model", type=str, default="mean", \
            help="aggregation function in graphsage")
    parser.add_argument("-w", "--sage_weighted", default=True, action="store_false", \
            help="whether consider weighted reduced graph")

    args = parser.parse_args()

    dataset = args.dataset
    feature_path = "dataset/{}/{}-feats.npy".format(dataset, dataset)
    fusion_input_path = "dataset/{}/{}.mtx".format(dataset, dataset)
    reduce_results = "./reduction_results/"
    mapping_path = "{}Mapping.mtx".format(reduce_results)

    d = PygNodePropPredDataset(name=f"ogbn-{dataset}")

    os.makedirs(reduce_results, exist_ok=True)
    os.makedirs(f"dataset/{dataset}", exist_ok=True)

    if args.fusion:
        coarsen_input_path = "dataset/{}/fused_{}.mtx".format(dataset, dataset)
    else:
        coarsen_input_path = "dataset/{}/{}.mtx".format(dataset, dataset)

######Load Data######
    print("%%%%%% Loading Graph Data %%%%%%")
    lp_index, lp_weight = get_laplacian(to_undirected(d[0].edge_index, d[0].num_nodes))
    laplacian = to_scipy_sparse_matrix(lp_index, lp_weight)
    if args.coarse == "lamg":
        if os.path.exists(fusion_input_path):
            print("Laplacian matrix in mtx already exists.")
        else:
            print("Saving laplacian matrix in mtx...")
            mmwrite(fusion_input_path, laplacian)  # mmwrite opens the path itself; no separate file handle is needed

    ## whether node features are required
    if args.fusion:
        feature = d[0].x.numpy()

######Graph Fusion######
    if args.fusion:
        print("%%%%%% Starting Graph Fusion %%%%%%")
        fusion_start = time.process_time()
        laplacian    = graph_fusion(laplacian, feature, args.num_neighs, args.mcr_dir, args.coarse,\
                       fusion_input_path, args.search_ratio, reduce_results, mapping_path, dataset)
        fusion_time  = time.process_time() - fusion_start

######Graph Reduction######
    print("%%%%%% Starting Graph Reduction %%%%%%")
    reduce_start = time.process_time()

    if args.coarse == "simple":
        G, projections, laplacians, level = sim_coarse(laplacian, args.level)
        reduce_time = time.process_time() - reduce_start

    elif args.coarse == "lamg":
        os.system('./run_coarsening.sh {} {} {} n {}'.format(args.mcr_dir, \
                coarsen_input_path, args.reduce_ratio, reduce_results))
        reduce_time = read_time("{}CPUtime.txt".format(reduce_results))
        G = mtx2graph("{}Gs.mtx".format(reduce_results))
        level = read_levels("{}NumLevels.txt".format(reduce_results))
        projections, laplacians = construct_proj_laplacian(laplacian, level, reduce_results)

    else:
        raise NotImplementedError

    edge_index = torch.tensor(list(G.edges)).t().contiguous().view(2, -1)
    edge_index = to_undirected(edge_index, len(G.nodes()))


######Embed Reduced Graph######
    print("%%%%%% Starting Graph Embedding %%%%%%")
    if args.embed_method == "node2vec":
        embed_start = time.process_time()
        embeddings  = node2vec(edge_index)
    else:
        raise NotImplementedError

    embed_time = time.process_time() - embed_start


######Refinement######
    print("%%%%%% Starting Graph Refinement %%%%%%")
    refine_start = time.process_time()
    embeddings   = refinement(level, projections, laplacians, embeddings, args.lda, args.power)
    refine_time  = time.process_time() - refine_start


######Save Embeddings######
    os.makedirs(args.embed_path, exist_ok=True)
    np.save(args.embed_path + "embeddings.npy", embeddings)

######Report timing information######
    print("%%%%%% CPU time %%%%%%")
    if args.fusion:
        total_time = fusion_time + reduce_time + embed_time + refine_time
        print(f"Graph Fusion     Time: {fusion_time:.3f}")
    else:
        total_time = reduce_time + embed_time + refine_time
        print("Graph Fusion     Time: 0")
    print(f"Graph Reduction  Time: {reduce_time:.3f}")
    print(f"Graph Embedding  Time: {embed_time:.3f}")
    print(f"Graph Refinement Time: {refine_time:.3f}")
    print(f"Total Time = Fusion_time + Reduction_time + Embedding_time + Refinement_time = {total_time:.3f}")
    def forward(self, inputs):
        x = F.relu(self.conv1(inputs))
        x = F.max_pool2d(x, 2, 2)
        x = F.relu(self.conv2(x))
        x = F.max_pool2d(x, 2, 2)
        x = F.relu(self.conv3(x))
        x = F.max_pool2d(x, 2, 2)
        x = F.relu(self.conv4(x))
        x = F.max_pool2d(x, 2, 2)
        x = F.relu(self.conv5(x))
        x = F.max_pool2d(x, 2, 2)
        #print(x)
        org = torch.reshape(x, (256, 256))

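        # "ori_adjacen" is not defined in this snippet; it is assumed to be a
        # module-level edge list (pairs of node indices) provided elsewhere.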
        edge = torch.Tensor(ori_adjacen).long().t().contiguous().cuda()

        x = self.sage1(org, edge)
        s = self.sage2(org, edge)
        s = torch.reshape(s, (1, 256, 128))

        x = torch.reshape(x, (1, 256, 128))

        edge = torch.Tensor(
            convert.to_scipy_sparse_matrix(edge).todense()).cuda()
        edge = torch.reshape(edge, (1, 256, 256))

        x, edge, link_loss1, ent_loss1 = dense_diff_pool(x, edge, s)
        #x = torch.tanh(x)

        x = torch.reshape(x, (128, 128))

        edge = torch.reshape(edge, (128, 128))
        edge_out = edge
        for i in range(edge_out.size(0)):
            edge_out[i, :] = torch.where(
                edge_out[i, :] == torch.max(edge_out[i, :]),
                torch.ones(1, 128).cuda(),
                torch.zeros(1, 128).cuda())

        edge, _ = dense_to_sparse(edge)
        #nodes_out = x
        x = self.sage3(x, edge)
        nodes_out = torch.tanh(x)
        x = nodes_out
        #x = self.sage4(nodes_out, edge)
        edge_dense = edge

        edge = torch.Tensor(
            convert.to_scipy_sparse_matrix(edge).todense()).cuda()
        edge = torch.reshape(edge, (1, 128, 128))

        x = torch.reshape(x, (1, 128, 2))

        s = torch.ones(1, 128, 1).cuda()
        x, edge, link_loss2, ent_loss2 = dense_diff_pool(x, edge, s)

        x = x.reshape(-1)

        link_loss = link_loss1 + link_loss2
        ent_loss = ent_loss1 + ent_loss2

        return x, link_loss, ent_loss, nodes_out, edge_out, edge_dense
Example #7
def main():
    parser = argparse.ArgumentParser(description='Prepare data for Giant-XRT')
    parser.add_argument(
        '--raw-text-path',
        type=str,
        required=True,
        help="Path of raw text (.txt file, each raw correspond to a node)")
    parser.add_argument(
        '--vectorizer-config-path',
        type=str,
        required=True,
        help="a path to a json file that specify the tfidf hyper-paramters")
    parser.add_argument('--data-root-dir', type=str, default="./dataset")
    parser.add_argument('--xrt-data-dir', type=str, default="./proc_data_xrt")
    parser.add_argument('--dataset', type=str, default="ogbn-arxiv")
    parser.add_argument('--max-deg', type=int, default=1000)
    args = parser.parse_args()
    print(args)

    # Change args.save_data_dir to args.save_data_dir/args.dataset
    save_data_dir = os.path.join(args.xrt_data_dir, args.dataset)
    dataset = PygNodePropPredDataset(name=args.dataset,
                                     root=args.data_root_dir)
    data = dataset[0]
    edge_index = data.edge_index

    # Make sure edge_index is undirected!!!
    if not is_undirected(edge_index):
        edge_index = to_undirected(edge_index)
    # Filter out nodes whose degree >= max_deg (keep nodes with degree < max_deg)
    Degree = degree(edge_index[0])
    Filtered_idx = torch.where(Degree < args.max_deg)[0]
    print('Number of original nodes:{}'.format(data.x.shape[0]))
    print('Number of filtered nodes:{}'.format(len(Filtered_idx)))

    # Construct and save the label matrix (adjacency matrix) Y.
    Y_csr_all = smat.csr_matrix(to_scipy_sparse_matrix(edge_index))
    Y_csr_trn = Y_csr_all[Filtered_idx]
    smat_util.save_matrix(f"{save_data_dir}/Y.trn.npz", Y_csr_trn)
    smat_util.save_matrix(f"{save_data_dir}/Y.all.npz", Y_csr_all)
    print("Saved Y.trn.npz and Y.all.npz")

    # Apply the same filtering for raw text
    with open(args.raw_text_path, "r") as fin:
        node_text_list = fin.readlines()
    print("|node_text_list={}".format(len(node_text_list)))
    count = 0
    with open(f"{save_data_dir}/X.trn.txt", "w") as fout:
        for cur_idx, line in enumerate(node_text_list):
            if Filtered_idx[count].item() == cur_idx:
                fout.writelines(line)
                count += 1
    assert count == len(Filtered_idx), "count={}, len(Filtered_idx)={}".format(
        count, len(Filtered_idx))
    print("Saved X.trn.txt")

    # Apply the same filtering for tfidf features
    vectorizer_config = Vectorizer.load_config_from_args(
        args)  # using args.vectorizer_config_path
    preprocessor = Preprocessor.train(node_text_list,
                                      vectorizer_config,
                                      dtype=np.float32)
    preprocessor.save(f"{save_data_dir}/tfidf-model")
    X_tfidf_all = preprocessor.predict(node_text_list)
    X_tfidf_trn = X_tfidf_all[Filtered_idx]
    smat_util.save_matrix(f"{save_data_dir}/X.all.tfidf.npz", X_tfidf_all)
    smat_util.save_matrix(f"{save_data_dir}/X.trn.tfidf.npz", X_tfidf_trn)
    print("Saved X.trn.npz and X.all.npz")