Example #1
def parse_args():
    parser = argparse.ArgumentParser(description='GCN')
    register_data_args(parser)
    parser.add_argument("--dropout",
                        type=float,
                        default=0.5,
                        help="dropout probability")
    parser.add_argument("--gpu", type=int, default=-1, help="gpu")
    parser.add_argument("--lr", type=float, default=3e-2, help="learning rate")
    parser.add_argument("--n-epochs",
                        type=int,
                        default=200,
                        help="number of training epochs")
    parser.add_argument("--n-hidden",
                        type=int,
                        default=16,
                        help="number of hidden gcn units")
    parser.add_argument("--n-layers",
                        type=int,
                        default=1,
                        help="number of hidden gcn layers")
    parser.add_argument("--weight-decay",
                        type=float,
                        default=5e-4,
                        help="Weight for L2 loss")
    parser.add_argument("--self-loop",
                        action='store_true',
                        help="graph self-loop (default=False)")
    parser.add_argument("--save-path",
                        type=str,
                        default='./model/gcn.pt',
                        help="path to save model")
    parser.set_defaults(self_loop=False)

    return parser.parse_args()
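
A minimal sketch of how the parsed arguments are typically consumed downstream, assuming the parse_args() above and the usual -1-means-CPU convention for --gpu:

import torch

args = parse_args()
device = torch.device('cpu' if args.gpu < 0 else 'cuda:{}'.format(args.gpu))
print('training {} epochs on {} (lr={}, weight_decay={})'.format(
    args.n_epochs, device, args.lr, args.weight_decay))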
Example #2
File: train.py Project: wangdomg/dgl
            dur.append(time.time() - t0)

        acc = evaluate(model, features, labels, val_mask)
        print(
            "Epoch {:05d} | Time(s) {:.4f} | Loss {:.4f} | Accuracy {:.4f} | "
            "ETputs(KTEPS) {:.2f}".format(epoch, np.mean(dur), loss.item(),
                                          acc, n_edges / np.mean(dur) / 1000))

    print()
    acc = evaluate(model, features, labels, test_mask)
    print("Test Accuracy {:.4f}".format(acc))


if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='TAGCN')
    register_data_args(parser)
    parser.add_argument("--dropout",
                        type=float,
                        default=0.5,
                        help="dropout probability")
    parser.add_argument("--gpu", type=int, default=-1, help="gpu")
    parser.add_argument("--lr", type=float, default=1e-2, help="learning rate")
    parser.add_argument("--n-epochs",
                        type=int,
                        default=200,
                        help="number of training epochs")
    parser.add_argument("--n-hidden",
                        type=int,
                        default=16,
                        help="number of hidden tagcn units")
    parser.add_argument("--n-layers",
Example #3
    def _parse(self):
        # dataset
        register_data_args(self.parser)
        self.parser.add_argument(
            '--batch_size',
            type=int,
            default=32,
            help='batch size for training and validation (default: 32)')
        self.parser.add_argument(
            '--fold_idx',
            type=int,
            default=0,
            help='the index (<10) of the fold in 10-fold validation.')
        self.parser.add_argument('--filename',
                                 type=str,
                                 default="",
                                 help='output file')

        # device
        self.parser.add_argument('--disable-cuda',
                                 action='store_true',
                                 help='Disable CUDA')
        self.parser.add_argument('--device',
                                 type=int,
                                 default=0,
                                 help='which gpu device to use (default: 0)')

        # net
        self.parser.add_argument('--num_layers',
                                 type=int,
                                 default=2,
                                 help='number of layers (default: 2)')
        self.parser.add_argument(
            '--num_mlp_layers',
            type=int,
            default=2,
            help='number of MLP layers (default: 2). 1 means linear model.')
        self.parser.add_argument('--hidden_dim',
                                 type=int,
                                 default=512,
                                 help='number of hidden units (default: 512)')

        # graph
        self.parser.add_argument(
            '--graph_pooling_type',
            type=str,
            default="sum",
            choices=["sum", "mean", "max"],
            help='type of graph pooling: sum, mean or max')
        self.parser.add_argument(
            '--neighbor_pooling_type',
            type=str,
            default="sum",
            choices=["sum", "mean", "max"],
            help='type of neighbor pooling: sum, mean or max')
        self.parser.add_argument('--learn_eps',
                                 action="store_true",
                                 help='learn the epsilon weighting')

        # learning
        self.parser.add_argument('--seed',
                                 type=int,
                                 default=0,
                                 help='random seed (default: 0)')
        self.parser.add_argument(
            '--epochs',
            type=int,
            default=20,
            help='number of epochs to train (default: 20)')
        self.parser.add_argument('--lr',
                                 type=float,
                                 default=0.01,
                                 help='learning rate (default: 0.01)')
        self.parser.add_argument('--final_dropout',
                                 type=float,
                                 default=0.5,
                                 help='final layer dropout (default: 0.5)')
        self.parser.add_argument("--self-loop",
                                 action='store_true',
                                 help="graph self-loop (default=False)")
        self.parser.add_argument("--feat_size",
                                 type=int,
                                 default=256,
                                 help="feature size")
        self.parser.add_argument("--n_classes",
                                 type=int,
                                 default=1,
                                 help="world size")

        self.parser.add_argument("--world_size",
                                 type=int,
                                 default=3,
                                 help="world size")

        self.parser.add_argument("--logs_dir",
                                 type=str,
                                 default="./logs",
                                 help="logs dir")

        self.parser.add_argument("--input_graph",
                                 type=str,
                                 default="",
                                 help="input graph")
        self.parser.add_argument("--cached_dir",
                                 type=str,
                                 default="",
                                 help="Cached dir")
        self.parser.add_argument("--local_rank",
                                 default=-1,
                                 type=int,
                                 help="Distributed local rank")
        self.parser.add_argument("--node_rank",
                                 default=-1,
                                 type=int,
                                 help="Distributed node_rank")
        self.parser.add_argument("--nproc_per_node",
                                 default=-1,
                                 type=int,
                                 help="Distributed process per node")
        self.parser.add_argument("--master_addr",
                                 default="localhost",
                                 type=str,
                                 help="Master address")

        # done
        self.args = self.parser.parse_args()
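
A hedged sketch of how the device flags registered above (--disable-cuda, --device) are typically resolved; the helper name resolve_device is an assumption, not from the source:

import torch


def resolve_device(args):
    # honor --disable-cuda, otherwise use the gpu index given by --device
    if not args.disable_cuda and torch.cuda.is_available():
        return torch.device('cuda:{}'.format(args.device))
    return torch.device('cpu')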
Example #4
def extract_dataset():
    parser = argparse.ArgumentParser(description='DATA')
    register_data_args(parser)
    args = parser.parse_args()
    dataset_name = [
        'cora', 'citeseer', 'pubmed', 'reddit', 'CoraFull', 'Coauthor_cs',
        'Coauthor_physics', 'AmazonCoBuy_computers', 'AmazonCoBuy_photo'
    ]

    print("Now PATH IS ", os.getcwd())
    for name in dataset_name:
        '''
        if os.path.exists(name):
            print('Folder exists. Skipping ' + name)
            continue
        '''
        if name in ['cora', 'citeseer', 'pubmed', 'reddit']:

            args.dataset = name
            print('args.dataset = ', args.dataset)
            if not os.path.exists(name):
                os.mkdir(name)
            os.chdir(name)

            print("Now PATH IS ", os.getcwd())

            data = load_data(args)
            features = data.features
            labels = data.labels
            graph = data.graph
            edges = graph.edges
            train_mask = data.train_mask
            val_mask = data.val_mask
            test_mask = data.test_mask

            n_nodes = features.shape[0]
            n_edges = data.graph.number_of_edges()

            if args.dataset == 'reddit':
                graph, features, labels, train_mask, val_mask, test_mask = cut_graph(
                    graph, n_nodes, n_edges, features, labels, train_mask,
                    val_mask, test_mask, 0.85)

            #edge_x = np.append(edge_x, edge_y, axis=1)

            edges_list = np.array([])
            if name != 'reddit':
                # build an (E, 2) array of (src, dst) node pairs
                edges_list = np.array([[item[0], item[1]] for item in edges])

            if name == 'reddit':
                edges = graph.edges()

                edge_x = edges[0].numpy().reshape((-1, 1))
                print(edge_x.shape)
                edge_y = edges[1].numpy().reshape((-1, 1))
                edges_list = np.hstack((edge_x, edge_y))
                print(edges_list.shape, edge_x.shape, edge_y.shape)

            print('features_shape', features.shape)
            print('labels_shape', labels.shape)
            print('edges_shape', edges_list.shape)
            '''
            np.savetxt('edges.txt', edges_list)
            np.savetxt('features.txt', features)
            np.savetxt('labels.txt', labels)

            np.savetxt('train_mask.txt', train_mask)
            np.savetxt('val_mask.txt', val_mask)
            np.savetxt('test_mask.txt', test_mask)
            '''

            np.save('edges.npy', edges_list)
            np.save('features.npy', features)
            np.save('labels.npy', labels)

            np.save('train_mask.npy', train_mask)

            print('Finish writing dataset', name)
            os.chdir('..')
            print('change to ', os.getcwd())

        else:

            if not os.path.exists(name):
                os.mkdir(name)
            os.chdir(name)

            if name == 'CoraFull':
                data = CoraFull()
            elif name == 'Coauthor_cs':
                data = Coauthor('cs')
            elif name == 'Coauthor_physics':
                data = Coauthor('physics')
            elif name == 'AmazonCoBuy_computers':
                data = AmazonCoBuy('computers')
            elif name == 'AmazonCoBuy_photo':
                data = AmazonCoBuy('photo')
            else:
                raise Exception("No such a dataset {}".format(name))

            graph = data.data[0]
            features = torch.FloatTensor(graph.ndata['feat']).numpy()
            labels = torch.LongTensor(graph.ndata['label']).numpy()

            print('dataset ', name)

            features_shape = features.shape
            labels_shape = labels.shape

            n_nodes = features_shape[0]
            edges_u, edges_v = graph.all_edges()

            edges_u = edges_u.numpy()
            edges_v = edges_v.numpy()

            # stack source/destination node ids into an (E, 2) array
            edges_list = np.stack([edges_u, edges_v], axis=1)

            print('features_shape', features_shape)
            print('labels_shape', labels_shape)
            print('edges_shape', edges_list.shape)

            # mark the first 500 nodes for training, the rest held out
            train_mask = np.zeros(n_nodes, dtype=bool)
            train_mask[:500] = True
            '''
            np.savetxt('edges.txt', edges_list)
            np.savetxt('features.txt', features)
            np.savetxt('labels.txt', labels)
            np.savetxt('train_mask.txt', train_mask)
            '''

            np.save('edges.npy', edges_list)
            np.save('features.npy', features)
            np.save('labels.npy', labels)
            np.save('train_mask.npy', train_mask)

            print('Finish writing dataset', name)
            os.chdir('..')
            print('change to ', os.getcwd())
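
A small hypothetical helper for reading back the arrays extract_dataset() writes; the file names match the np.save calls above:

import os

import numpy as np


def load_extracted(name):
    # load the per-dataset arrays saved by extract_dataset()
    edges = np.load(os.path.join(name, 'edges.npy'))
    features = np.load(os.path.join(name, 'features.npy'))
    labels = np.load(os.path.join(name, 'labels.npy'))
    train_mask = np.load(os.path.join(name, 'train_mask.npy'))
    return edges, features, labels, train_mask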
Example #5
def main():
    parser = argparse.ArgumentParser(description='GCN')
    register_data_args(parser)
    parser.add_argument("--iter_per_site", type=int, default=5)
    parser.add_argument("--num_subnet", type=int, default=2,
                        help="number of sub networks")
    parser.add_argument("--dropout", type=float, default=0.5,
                        help="dropout probability")
    parser.add_argument("--lr", type=float, default=0.01,
                        help="learning rate")
    parser.add_argument("--n-epochs", type=int, default=20,
                        help="number of training epochs")
    parser.add_argument("--n-hidden", type=int, default=16,
                        help="number of hidden gcn units")
    parser.add_argument("--n-layers", type=int, default=1,
                        help="number of hidden gcn layers")
    parser.add_argument("--weight-decay", type=float, default=5e-4,
                        help="Weight for L2 loss")
    parser.add_argument("--use_layernorm", type=bool, default=False,
                        help="Whether use layernorm (default=False)")
    parser.add_argument('--dist-backend', type=str, default='nccl', metavar='S',
                        help='backend type for distributed PyTorch')
    parser.add_argument('--dist-url', type=str, default='tcp://127.0.0.1:9971', metavar='S',
                        help='master ip for distributed PyTorch')
    parser.add_argument('--rank', type=int, default=0, metavar='R',
                        help='rank for distributed PyTorch')
    parser.add_argument('--cuda-id', type=int, default=0, metavar='N',
                        help='cuda index, if the instance has multiple GPUs.')
    parser.add_argument("--batch-size", type=int, default=20,
                        help="batch size")
    parser.add_argument("--psize", type=int, default=1500,
                        help="partition number")
    parser.add_argument("--test-batch-size", type=int, default=1000,
                        help="test batch size")
    parser.add_argument("--rnd-seed", type=int, default=3,
                        help="number of epoch of doing inference on validation")
    parser.add_argument("--use-pp", action='store_true',
                        help="whether to use precomputation")
    parser.add_argument("--normalize", action='store_true',
                        help="whether to use normalized feature")
    parser.add_argument("--save_results", action='store_true')
    parser.add_argument('--n-heads', type=int, default=4)
    parser.add_argument("--exp_name", type=str, default='distributed_gnn_ist')
    args = parser.parse_args()

    assert (args.n_hidden % args.num_subnet) == 0

    # set all the random seeds
    print('Setting seeds', flush=True)
    torch.manual_seed(args.rnd_seed)
    np.random.seed(args.rnd_seed)
    random.seed(args.rnd_seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

    # set the proper GPU
    assert args.cuda_id < torch.cuda.device_count()
    device = torch.device(f'cuda:{args.cuda_id}')

    # initialize the distributed process group
    print(f'{args.rank} initializing process', flush=True)
    dist.init_process_group(
            backend=args.dist_backend, init_method=args.dist_url, rank=args.rank,
            world_size=args.num_subnet)
    print(f'Process spawned: {args.rank} --> {device}', flush=True)

    # get the data and setup the dataset
    dataset = get_data(args, device)
    (g, cluster_iterator, train_mask, val_mask, test_mask, labels,
            train_nid, in_feats, n_classes, n_edges) = dataset

    # get the main model
    ist_model = DistributedGNNWrapper(args, g, in_feats, n_classes, device)
    print(f'{args.rank}: start initial dispatch', flush=True)
    ist_model.ini_sync_dispatch_model()
    print(f'{args.rank}: finish initial dispatch', flush=True)
    train(
            ist_model, args, g, cluster_iterator, labels, train_mask, val_mask,
            test_mask, train_nid, device)
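
A hypothetical launcher for the distributed main() above: one process per sub-network, each with its own --rank and GPU. The script name gcn_ist.py is an assumption:

import subprocess
import sys

NUM_SUBNET = 2
procs = [
    subprocess.Popen([
        sys.executable, 'gcn_ist.py',
        '--dataset', 'cora',
        '--num_subnet', str(NUM_SUBNET),
        '--rank', str(rank),
        '--cuda-id', str(rank),
    ])
    for rank in range(NUM_SUBNET)
]
for p in procs:
    p.wait()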
Example #6
def main():
    parser = argparse.ArgumentParser(description='baseline')
    register_data_args(parser)
    parser.add_argument("--mode",
                        type=str,
                        default='A',
                        choices=['A', 'AX', 'X'],
                        help="dropout probability")
    parser.add_argument("--seed",
                        type=int,
                        default=-1,
                        help="random seed, -1 means dont fix seed")
    parser.add_argument(
        "--emb-method",
        type=str,
        default='DeepWalk',
        help="embedding methods: DeepWalk, Node2Vec, LINE, SDNE, Struc2Vec")
    parser.add_argument("--ad-method",
                        type=str,
                        default='OCSVM',
                        help="embedding methods: PCA,OCSVM,IF,AE")
    args = parser.parse_args()

    if args.seed != -1:
        np.random.seed(args.seed)
        torch.manual_seed(args.seed)
        torch.cuda.manual_seed_all(args.seed)

    logging.basicConfig(
        filename="./log/baseline.log",
        filemode="a",
        format="%(asctime)s-%(name)s-%(levelname)s-%(message)s",
        level=logging.INFO)
    logger = logging.getLogger('baseline')

    datadict = emb_dataloader(args)

    if args.mode == 'X':
        data = datadict['features']
        dur1 = 0.0  # no embedding step in this mode
        #print('X shape',data.shape)
    else:
        t0 = time.time()
        embeddings = embedding(args, datadict)
        dur1 = time.time() - t0

        if args.mode == 'A':
            data = embeddings
            #print('A shape',data.shape)
        if args.mode == 'AX':
            data = np.concatenate((embeddings, datadict['features']), axis=1)
            #print('AX shape',data.shape)

    logger.debug(f'data shape: {data.shape}')

    if args.ad_method == 'OCSVM':
        clf = OCSVM(contamination=0.1)
    elif args.ad_method == 'IF':
        clf = IForest(n_estimators=100,
                      contamination=0.1,
                      n_jobs=-1,
                      behaviour="new")
    elif args.ad_method == 'PCA':
        clf = PCA(contamination=0.1)
    elif args.ad_method == 'AE':
        clf = AutoEncoder(contamination=0.1)
    else:
        raise ValueError('unknown ad-method: {}'.format(args.ad_method))

    t1 = time.time()
    clf.fit(data[datadict['train_mask']])
    dur2 = time.time() - t1

    print('training time:', dur1 + dur2)

    logger.info('\n')
    logger.info('\n')
    logger.info(
        f'Parameters dataset:{args.dataset} datamode:{args.mode} ad-method:{args.ad_method} emb-method:{args.emb_method}'
    )
    logger.info('-------------Evaluating Validation Results--------------')

    t2 = time.time()
    y_pred_val = clf.predict(data[datadict['val_mask']])
    y_score_val = clf.decision_function(data[datadict['val_mask']])
    auc, ap, f1, acc, precision, recall = baseline_evaluate(datadict,
                                                            y_pred_val,
                                                            y_score_val,
                                                            val=True)
    dur3 = time.time() - t2
    print('infer time:', dur3)

    logger.info(f'AUC:{round(auc,4)},AP:{round(ap,4)}')
    logger.info(
        f'f1:{round(f1,4)},acc:{round(acc,4)},pre:{round(precision,4)},recall:{round(recall,4)}'
    )

    logger.info('-------------Evaluating Test Results--------------')
    y_pred_test = clf.predict(data[datadict['test_mask']])
    y_score_test = clf.decision_function(data[datadict['test_mask']])
    auc, ap, f1, acc, precision, recall = baseline_evaluate(datadict,
                                                            y_pred_test,
                                                            y_score_test,
                                                            val=False)
    logger.info(f'AUC:{round(auc,4)},AP:{round(ap,4)}')
    logger.info(
        f'f1:{round(f1,4)},acc:{round(acc,4)},pre:{round(precision,4)},recall:{round(recall,4)}'
    )
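
A hypothetical invocation of the baseline above; the script name baseline.py is an assumption, and --dataset is the flag added by register_data_args:

import subprocess
import sys

subprocess.run([
    sys.executable, 'baseline.py',
    '--dataset', 'cora',
    '--mode', 'AX',
    '--emb-method', 'DeepWalk',
    '--ad-method', 'IF',
], check=True)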