def parse_args():
    parser = argparse.ArgumentParser(description='GCN')
    register_data_args(parser)
    parser.add_argument("--dropout", type=float, default=0.5,
                        help="dropout probability")
    parser.add_argument("--gpu", type=int, default=-1,
                        help="gpu device id (-1 for CPU)")
    parser.add_argument("--lr", type=float, default=3e-2,
                        help="learning rate")
    parser.add_argument("--n-epochs", type=int, default=200,
                        help="number of training epochs")
    parser.add_argument("--n-hidden", type=int, default=16,
                        help="number of hidden gcn units")
    parser.add_argument("--n-layers", type=int, default=1,
                        help="number of hidden gcn layers")
    parser.add_argument("--weight-decay", type=float, default=5e-4,
                        help="weight for L2 loss")
    parser.add_argument("--self-loop", action='store_true',
                        help="graph self-loop (default=False)")
    parser.add_argument("--save-path", type=str, default='./model/gcn.pt',
                        help="path to save model")
    parser.set_defaults(self_loop=False)
    return parser.parse_args()
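
# The training loop that follows calls an `evaluate` helper that is not part of
# this excerpt. A minimal sketch of what it is assumed to look like: full-graph
# logits from the model, masked accuracy over the given node mask (this mirrors
# the standard DGL example pattern, but is an assumption here).
import torch

def evaluate(model, features, labels, mask):
    model.eval()
    with torch.no_grad():
        logits = model(features)          # full-graph forward pass
        logits = logits[mask]             # keep only the masked nodes
        labels = labels[mask]
        _, indices = torch.max(logits, dim=1)
        correct = torch.sum(indices == labels)
        return correct.item() * 1.0 / len(labels)
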

        dur.append(time.time() - t0)
        acc = evaluate(model, features, labels, val_mask)
        print("Epoch {:05d} | Time(s) {:.4f} | Loss {:.4f} | Accuracy {:.4f} | "
              "ETputs(KTEPS) {:.2f}".format(epoch, np.mean(dur), loss.item(),
                                            acc, n_edges / np.mean(dur) / 1000))

    print()
    acc = evaluate(model, features, labels, test_mask)
    print("Test Accuracy {:.4f}".format(acc))


if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='TAGCN')
    register_data_args(parser)
    parser.add_argument("--dropout", type=float, default=0.5,
                        help="dropout probability")
    parser.add_argument("--gpu", type=int, default=-1,
                        help="gpu device id (-1 for CPU)")
    parser.add_argument("--lr", type=float, default=1e-2,
                        help="learning rate")
    parser.add_argument("--n-epochs", type=int, default=200,
                        help="number of training epochs")
    parser.add_argument("--n-hidden", type=int, default=16,
                        help="number of hidden tagcn units")
    parser.add_argument("--n-layers",

    def _parse(self):
        # dataset
        register_data_args(self.parser)
        self.parser.add_argument(
            '--batch_size', type=int, default=32,
            help='batch size for training and validation (default: 32)')
        self.parser.add_argument(
            '--fold_idx', type=int, default=0,
            help='the index (<10) of the fold in 10-fold validation')
        self.parser.add_argument('--filename', type=str, default="",
                                 help='output file')
        # device
        self.parser.add_argument('--disable-cuda', action='store_true',
                                 help='disable CUDA')
        self.parser.add_argument('--device', type=int, default=0,
                                 help='which gpu device to use (default: 0)')
        # net
        self.parser.add_argument('--num_layers', type=int, default=2,
                                 help='number of layers (default: 2)')
        self.parser.add_argument(
            '--num_mlp_layers', type=int, default=2,
            help='number of MLP layers (default: 2); 1 means a linear model')
        self.parser.add_argument('--hidden_dim', type=int, default=512,
                                 help='number of hidden units (default: 512)')
        # graph
        self.parser.add_argument(
            '--graph_pooling_type', type=str, default="sum",
            choices=["sum", "mean", "max"],
            help='type of graph pooling: sum, mean or max')
        self.parser.add_argument(
            '--neighbor_pooling_type', type=str, default="sum",
            choices=["sum", "mean", "max"],
            help='type of neighbor pooling: sum, mean or max')
        self.parser.add_argument('--learn_eps', action="store_true",
                                 help='learn the epsilon weighting')
        # learning
        self.parser.add_argument('--seed', type=int, default=0,
                                 help='random seed (default: 0)')
        self.parser.add_argument(
            '--epochs', type=int, default=20,
            help='number of epochs to train (default: 20)')
        self.parser.add_argument('--lr', type=float, default=0.01,
                                 help='learning rate (default: 0.01)')
        self.parser.add_argument('--final_dropout', type=float, default=0.5,
                                 help='final layer dropout (default: 0.5)')
        self.parser.add_argument("--self-loop", action='store_true',
                                 help="graph self-loop (default=False)")
        self.parser.add_argument("--feat_size", type=int, default=256,
                                 help="feature size")
        self.parser.add_argument("--n_classes", type=int, default=1,
                                 help="number of output classes")
        self.parser.add_argument("--world_size", type=int, default=3,
                                 help="world size")
        self.parser.add_argument("--logs_dir", type=str, default="./logs",
                                 help="logs dir")
        self.parser.add_argument("--input_graph", type=str, default="",
                                 help="input graph")
        self.parser.add_argument("--cached_dir", type=str, default="",
                                 help="cached dir")
        self.parser.add_argument("--local_rank", default=-1, type=int,
                                 help="distributed local rank")
        self.parser.add_argument("--node_rank", default=-1, type=int,
                                 help="distributed node rank")
        self.parser.add_argument("--nproc_per_node", default=-1, type=int,
                                 help="distributed processes per node")
        self.parser.add_argument("--master_addr", default="localhost", type=str,
                                 help="master address")
        # done
        self.args = self.parser.parse_args()
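
# _parse() above is written as a method: it fills self.parser and stores the
# result on self.args. A minimal sketch of the wrapper object it is assumed to
# live in (the class name `Options` is hypothetical; the full _parse body above
# would sit inside the class body):
import argparse
from dgl.data import register_data_args

class Options:
    def __init__(self):
        self.parser = argparse.ArgumentParser(description='GIN')
        self._parse()

    def _parse(self):
        # ... argument definitions as in the method shown above ...
        self.args = self.parser.parse_args()

# usage:
#   args = Options().args
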

def extract_dataset():
    parser = argparse.ArgumentParser(description='DATA')
    register_data_args(parser)
    args = parser.parse_args()
    dataset_name = [
        'cora', 'citeseer', 'pubmed', 'reddit', 'CoraFull', 'Coauthor_cs',
        'Coauthor_physics', 'AmazonCoBuy_computers', 'AmazonCoBuy_photo'
    ]
    print("Now PATH IS ", os.getcwd())
    for name in dataset_name:
        '''
        if os.path.exists(name):
            print('Folder exists. Skipping ' + name)
            continue
        '''
        if name in ['cora', 'citeseer', 'pubmed', 'reddit']:
            args.dataset = name
            print('args.dataset = ', args.dataset)
            if not os.path.exists(name):
                os.mkdir(name)
            os.chdir(name)
            print("Now PATH IS ", os.getcwd())
            data = load_data(args)
            features = data.features
            labels = data.labels
            graph = data.graph
            edges = graph.edges
            train_mask = data.train_mask
            val_mask = data.val_mask
            test_mask = data.test_mask
            n_nodes = features.shape[0]
            n_edges = data.graph.number_of_edges()
            if args.dataset == 'reddit':
                graph, features, labels, train_mask, val_mask, test_mask = cut_graph(
                    graph, n_nodes, n_edges, features, labels, train_mask,
                    val_mask, test_mask, 0.85)
            # edge_x = np.append(edge_x, edge_y, axis=1)
            edges_list = np.array([])
            first_element = True
            if name != 'reddit':
                for item in edges:
                    if first_element:
                        edges_list = np.array([[item[0], item[1]]])
                        first_element = False
                    else:
                        edges_list = np.append(edges_list,
                                               np.array([[item[0], item[1]]]),
                                               axis=0)
            if name == 'reddit':
                edges = graph.edges()
                edge_x = edges[0].numpy().reshape((-1, 1))
                print(edge_x.shape)
                edge_y = edges[1].numpy().reshape((-1, 1))
                edges_list = np.hstack((edge_x, edge_y))
                print(edges_list.shape, edge_x.shape, edge_y.shape)
            print('features_shape', features.shape)
            print('labels_shape', labels.shape)
            print('edges_shape', edges_list.shape)
            '''
            np.savetxt('edges.txt', edges_list)
            np.savetxt('features.txt', features)
            np.savetxt('labels.txt', labels)
            np.savetxt('train_mask.txt', train_mask)
            np.savetxt('val_mask.txt', val_mask)
            np.savetxt('test_mask.txt', test_mask)
            '''
            np.save('edges.npy', edges_list)
            np.save('features.npy', features)
            np.save('labels.npy', labels)
            np.save('train_mask.npy', train_mask)
            print('Finish writing dataset', name)
            os.chdir('..')
            print('change to ', os.getcwd())
        else:
            if not os.path.exists(name):
                os.mkdir(name)
            os.chdir(name)
            if name == 'CoraFull':
                data = CoraFull()
            elif name == 'Coauthor_cs':
                data = Coauthor('cs')
            elif name == 'Coauthor_physics':
                data = Coauthor('physics')
            elif name == 'AmazonCoBuy_computers':
                data = AmazonCoBuy('computers')
            elif name == 'AmazonCoBuy_photo':
                data = AmazonCoBuy('photo')
            else:
                raise Exception("No such a dataset {}".format(name))
            graph = data.data[0]
            features = torch.FloatTensor(graph.ndata['feat']).numpy()
            labels = torch.LongTensor(graph.ndata['label']).numpy()
            print('dataset ', name)
            features_shape = features.shape
            labels_shape = labels.shape
            n_nodes = features_shape[0]
            edges_u, edges_v = graph.all_edges()
            edges_u = edges_u.numpy()
            edges_v = edges_v.numpy()
            edges_list = np.array([])
            first_element = True
            for idx in range(len(edges_u)):
                if first_element:
                    edges_list = np.array([[edges_u[idx], edges_v[idx]]])
                    first_element = False
                else:
                    edges_list = np.append(edges_list,
                                           np.array([[edges_u[idx], edges_v[idx]]]),
                                           axis=0)
            print('features_shape', features_shape)
            print('labels_shape', labels_shape)
            print('edges_shape', edges_list.shape)
            train_mask = []
            for x in range(500):
                train_mask.append(True)
            for x in range(n_nodes - 500):
                train_mask.append(False)
            train_mask = np.array(train_mask)
            '''
            np.savetxt('edges.txt', edges_list)
            np.savetxt('features.txt', features)
            np.savetxt('labels.txt', labels)
            np.savetxt('train_mask.txt', train_mask)
            '''
            np.save('edges.npy', edges_list)
            np.save('features.npy', features)
            np.save('labels.npy', labels)
            np.save('train_mask.npy', train_mask)
            print('Finish writing dataset', name)
            os.chdir('..')
            print('change to ', os.getcwd())
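
# Hypothetical companion to extract_dataset(): read back the arrays the
# function writes into each dataset directory. The file names mirror the
# np.save calls above; the helper name and return layout are assumptions.
import os
import numpy as np

def load_extracted(name):
    edges = np.load(os.path.join(name, 'edges.npy'))
    features = np.load(os.path.join(name, 'features.npy'))
    labels = np.load(os.path.join(name, 'labels.npy'))
    train_mask = np.load(os.path.join(name, 'train_mask.npy'))
    return edges, features, labels, train_mask

# usage:
#   edges, features, labels, train_mask = load_extracted('cora')
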

def main():
    parser = argparse.ArgumentParser(description='GCN')
    register_data_args(parser)
    parser.add_argument("--iter_per_site", type=int, default=5)
    parser.add_argument("--num_subnet", type=int, default=2,
                        help="number of sub networks")
    parser.add_argument("--dropout", type=float, default=0.5,
                        help="dropout probability")
    parser.add_argument("--lr", type=float, default=0.01,
                        help="learning rate")
    parser.add_argument("--n-epochs", type=int, default=20,
                        help="number of training epochs")
    parser.add_argument("--n-hidden", type=int, default=16,
                        help="number of hidden gcn units")
    parser.add_argument("--n-layers", type=int, default=1,
                        help="number of hidden gcn layers")
    parser.add_argument("--weight-decay", type=float, default=5e-4,
                        help="weight for L2 loss")
    parser.add_argument("--use_layernorm", type=bool, default=False,
                        help="whether to use layernorm (default=False)")
    parser.add_argument('--dist-backend', type=str, default='nccl', metavar='S',
                        help='backend type for distributed PyTorch')
    parser.add_argument('--dist-url', type=str, default='tcp://127.0.0.1:9971',
                        metavar='S', help='master ip for distributed PyTorch')
    parser.add_argument('--rank', type=int, default=0, metavar='R',
                        help='rank for distributed PyTorch')
    parser.add_argument('--cuda-id', type=int, default=0, metavar='N',
                        help='cuda index, if the instance has multiple GPUs')
    parser.add_argument("--batch-size", type=int, default=20,
                        help="batch size")
    parser.add_argument("--psize", type=int, default=1500,
                        help="partition number")
    parser.add_argument("--test-batch-size", type=int, default=1000,
                        help="test batch size")
    parser.add_argument("--rnd-seed", type=int, default=3,
                        help="random seed")
    parser.add_argument("--use-pp", action='store_true',
                        help="whether to use precomputation")
    parser.add_argument("--normalize", action='store_true',
                        help="whether to use normalized features")
    parser.add_argument("--save_results", action='store_true')
    parser.add_argument('--n-heads', type=int, default=4)
    parser.add_argument("--exp_name", type=str, default='distributed_gnn_ist')
    args = parser.parse_args()
    assert (args.n_hidden % args.num_subnet) == 0

    # set all the random seeds
    print('Setting seeds', flush=True)
    torch.manual_seed(args.rnd_seed)
    np.random.seed(args.rnd_seed)
    random.seed(args.rnd_seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

    # set the proper GPU
    assert args.cuda_id < torch.cuda.device_count()
    device = torch.device(f'cuda:{args.cuda_id}')

    # initialize the distributed process group
    print(f'{args.rank} initializing process', flush=True)
    dist.init_process_group(
        backend=args.dist_backend, init_method=args.dist_url,
        rank=args.rank, world_size=args.num_subnet)
    print(f'Process spawned: {args.rank} --> {device}', flush=True)

    # get the data and set up the dataset
    dataset = get_data(args, device)
    (g, cluster_iterator, train_mask, val_mask, test_mask, labels,
     train_nid, in_feats, n_classes, n_edges) = dataset

    # get the main model
    ist_model = DistributedGNNWrapper(args, g, in_feats, n_classes, device)
    print(f'{args.rank}: start initial dispatch', flush=True)
    ist_model.ini_sync_dispatch_model()
    print(f'{args.rank}: finish initial dispatch', flush=True)
    train(ist_model, args, g, cluster_iterator, labels, train_mask, val_mask,
          test_mask, train_nid, device)
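
# main() above expects one process per sub-network, each started with its own
# --rank. A hypothetical single-machine launcher sketch; the script name
# 'train_dist.py' and the chosen flag values are assumptions, not part of the
# source.
import subprocess
import sys

NUM_SUBNET = 2
procs = [
    subprocess.Popen([sys.executable, 'train_dist.py',
                      '--rank', str(rank),
                      '--num_subnet', str(NUM_SUBNET),
                      '--cuda-id', str(rank)])
    for rank in range(NUM_SUBNET)
]
for p in procs:
    p.wait()
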

def main():
    parser = argparse.ArgumentParser(description='baseline')
    register_data_args(parser)
    parser.add_argument("--mode", type=str, default='A',
                        choices=['A', 'AX', 'X'],
                        help="input representation: A (embedding), "
                             "X (attributes), AX (both)")
    parser.add_argument("--seed", type=int, default=-1,
                        help="random seed; -1 means do not fix the seed")
    parser.add_argument(
        "--emb-method", type=str, default='DeepWalk',
        help="embedding method: DeepWalk, Node2Vec, LINE, SDNE, Struc2Vec")
    parser.add_argument("--ad-method", type=str, default='OCSVM',
                        help="anomaly detection method: PCA, OCSVM, IF, AE")
    args = parser.parse_args()

    if args.seed != -1:
        np.random.seed(args.seed)
        torch.manual_seed(args.seed)
        torch.cuda.manual_seed_all(args.seed)

    logging.basicConfig(
        filename="./log/baseline.log",
        filemode="a",
        format="%(asctime)s-%(name)s-%(levelname)s-%(message)s",
        level=logging.INFO)
    logger = logging.getLogger('baseline')

    datadict = emb_dataloader(args)
    dur1 = 0
    if args.mode == 'X':
        data = datadict['features']
        # print('X shape', data.shape)
    else:
        t0 = time.time()
        embeddings = embedding(args, datadict)
        dur1 = time.time() - t0
        if args.mode == 'A':
            data = embeddings
            # print('A shape', data.shape)
        if args.mode == 'AX':
            data = np.concatenate((embeddings, datadict['features']), axis=1)
            # print('AX shape', data.shape)
    logger.debug(f'data shape: {data.shape}')

    if args.ad_method == 'OCSVM':
        clf = OCSVM(contamination=0.1)
    if args.ad_method == 'IF':
        clf = IForest(n_estimators=100, contamination=0.1, n_jobs=-1,
                      behaviour="new")
    if args.ad_method == 'PCA':
        clf = PCA(contamination=0.1)
    if args.ad_method == 'AE':
        clf = AutoEncoder(contamination=0.1)

    t1 = time.time()
    clf.fit(data[datadict['train_mask']])
    dur2 = time.time() - t1
    print('training time:', dur1 + dur2)

    logger.info('\n')
    logger.info('\n')
    logger.info(
        f'Parameters dataset:{args.dataset} datamode:{args.mode} '
        f'ad-method:{args.ad_method} emb-method:{args.emb_method}')
    logger.info('-------------Evaluating Validation Results--------------')
    t2 = time.time()
    y_pred_val = clf.predict(data[datadict['val_mask']])
    y_score_val = clf.decision_function(data[datadict['val_mask']])
    auc, ap, f1, acc, precision, recall = baseline_evaluate(
        datadict, y_pred_val, y_score_val, val=True)
    dur3 = time.time() - t2
    print('infer time:', dur3)
    logger.info(f'AUC:{round(auc,4)},AP:{round(ap,4)}')
    logger.info(
        f'f1:{round(f1,4)},acc:{round(acc,4)},'
        f'pre:{round(precision,4)},recall:{round(recall,4)}')

    logger.info('-------------Evaluating Test Results--------------')
    y_pred_test = clf.predict(data[datadict['test_mask']])
    y_score_test = clf.decision_function(data[datadict['test_mask']])
    auc, ap, f1, acc, precision, recall = baseline_evaluate(
        datadict, y_pred_test, y_score_test, val=False)
    logger.info(f'AUC:{round(auc,4)},AP:{round(ap,4)}')
    logger.info(
        f'f1:{round(f1,4)},acc:{round(acc,4)},'
        f'pre:{round(precision,4)},recall:{round(recall,4)}')
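
# main() above calls a baseline_evaluate helper that is not part of this
# excerpt. A minimal sketch of what it is assumed to do: score predictions and
# anomaly scores against ground-truth labels held in datadict. The 'labels'
# key and the exact metric choices are assumptions.
import numpy as np
from sklearn.metrics import (accuracy_score, average_precision_score, f1_score,
                             precision_score, recall_score, roc_auc_score)

def baseline_evaluate(datadict, y_pred, y_score, val=True):
    mask = datadict['val_mask'] if val else datadict['test_mask']
    y_true = np.asarray(datadict['labels'])[mask]
    auc = roc_auc_score(y_true, y_score)          # ranking quality of scores
    ap = average_precision_score(y_true, y_score)
    f1 = f1_score(y_true, y_pred)                 # thresholded predictions
    acc = accuracy_score(y_true, y_pred)
    precision = precision_score(y_true, y_pred)
    recall = recall_score(y_true, y_pred)
    return auc, ap, f1, acc, precision, recall
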