Example #1
def load_individuals(pdbs):
    filenames = listdir(Constants.SAVED_GRAPH_PATH)
    dataset = []
    dataset_filenames = []
    for fn in filenames:
        if os.path.splitext(fn)[0] in pdbs:
            graph = load_graphs(os.path.join(Constants.SAVED_GRAPH_PATH, fn))
            graph = graph[0][0]
            dataset.append(graph)
            dataset_filenames.append(os.path.splitext(fn)[0])

    word_to_ixs = load_feat_word_to_ixs(Constants.GENERAL_WORD_TO_IDX_PATH)
    return dataset, dataset_filenames, word_to_ixs
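
Note: dgl.data.utils.load_graphs returns a (graph_list, label_dict) pair, which is why the snippet indexes graph[0][0]. As a sketch (not part of the source; the helper name and the '.bin' extension are illustrative), the write side that produces one graph file per PDB id could look like this:

import os
import dgl

def save_individual_graph(graph, pdb_id, out_dir):
    # hypothetical counterpart: save_graphs takes a filename and a list of graphs
    dgl.save_graphs(os.path.join(out_dir, pdb_id + '.bin'), [graph])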
Example #2
def load_dataset(filename=None, limit=None, individual=True):
    if filename is None:
        filename = file_name(limit=limit)

    fn_no_extension = os.path.splitext(filename)[0]
    if individual:
        filenames = listdir(Constants.SAVED_GRAPH_PATH)
        i = 0
        dataset = []
        dataset_filenames = []
        for fn in filenames:
            graph = load_graphs(os.path.join(Constants.SAVED_GRAPH_PATH, fn))
            graph = graph[0][0]
            dataset.append(graph)
            dataset_filenames.append(os.path.splitext(fn)[0])
            i += 1
            if limit is not None and i >= limit:
                break
    else:
        dataset = load_graphs(filename)
        dataset = dataset[0]

        filename_df = fn_no_extension + '_filenames.json'
        with open(filename_df, 'r') as f:
            # read the list of graph filenames stored alongside the dataset
            dataset_filenames = json.load(f)

    filename_standardize = fn_no_extension + '_standardize.npy'
    with open(filename_standardize, 'rb') as f:
        mean = torch.load(f)
        std = torch.load(f)

    if individual:
        word_to_ixs = load_feat_word_to_ixs(Constants.GENERAL_WORD_TO_IDX_PATH)
    else:
        filename_wti = fn_no_extension + '_word_to_ix'
        word_to_ixs = load_feat_word_to_ixs(filename_wti)
    return dataset, dataset_filenames, word_to_ixs, (mean, std)
Example #3
    def load(self, path):
        """load the graph and the labels

        Parameters
        ----------
        path: str
            Path where to load the graph and the labels
        """
        graph_path = os.path.join(path,
                                  'graph.bin')
        info_path = os.path.join(path,
                                 'info.pkl')
        graphs, _ = load_graphs(graph_path)
        self._g = graphs[0]
        info = load_info(str(info_path))
        self._node_dict = info['node_id_map']
        self._label_map = info['label_map']
        self._is_multilabel = info['is_multilabel']
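
The matching save method is not shown. A minimal counterpart that writes the two files load() expects, assuming save_graphs and save_info are imported from dgl.data.utils, might look like this sketch:

    def save(self, path):
        """Sketch: persist the graph and the label info that load() reads back."""
        graph_path = os.path.join(path, 'graph.bin')
        info_path = os.path.join(path, 'info.pkl')
        save_graphs(graph_path, [self._g])
        save_info(str(info_path), {'node_id_map': self._node_dict,
                                   'label_map': self._label_map,
                                   'is_multilabel': self._is_multilabel})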
Example #4
def main(args):
    # load and preprocess dataset
    g, graph_labels = load_graphs(
        '/yushi/dataset/Amazon2M/Amazon2M_dglgraph.bin')
    assert len(g) == 1
    g = g[0]
    data = g.ndata
    features = torch.FloatTensor(data['feat'])
    labels = torch.LongTensor(data['label'])
    if hasattr(torch, 'BoolTensor'):
        train_mask = data['train_mask'].bool()
        val_mask = data['val_mask'].bool()
        test_mask = data['test_mask'].bool()
    num_feats = features.shape[1]
    n_classes = 47
    n_edges = g.number_of_edges()
    print("""----Data statistics------'
      #Edges %d
      #Classes %d 
      #Train samples %d
      #Val samples %d
      #Test samples %d""" %
          (n_edges, n_classes, train_mask.int().sum().item(),
           val_mask.int().sum().item(), test_mask.int().sum().item()))

    if args.gpu < 0:
        cuda = False
    else:
        cuda = True
        torch.cuda.set_device(args.gpu)
        features = features.cuda()
        labels = labels.cuda()
        train_mask = train_mask.cuda()
        val_mask = val_mask.cuda()
        test_mask = test_mask.cuda()

    # add self loop
    g = add_self_loop(g)
    # g.remove_edges_from(nx.selfloop_edges(g))
    # g = DGLGraph(g)
    # g.add_edges(g.nodes(), g.nodes())
    n_edges = g.number_of_edges()
    # create model
    heads = ([args.num_heads] * args.num_layers) + [args.num_out_heads]
    model = GAT(g, args.num_layers, num_feats, args.num_hidden, n_classes,
                heads, F.elu, args.in_drop, args.attn_drop,
                args.negative_slope, args.residual)
    print(model)
    if args.early_stop:
        stopper = EarlyStopping(patience=100)
    if cuda:
        model.cuda()
    loss_fcn = torch.nn.CrossEntropyLoss()

    # use optimizer
    optimizer = torch.optim.Adam(model.parameters(),
                                 lr=args.lr,
                                 weight_decay=args.weight_decay)

    # initialize graph
    dur = []
    start = time.time()
    for epoch in range(args.epochs):
        model.train()
        if epoch >= 3:
            t0 = time.time()
        # forward
        logits = model(features)
        loss = loss_fcn(logits[train_mask], labels[train_mask])

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        if epoch >= 3:
            dur.append(time.time() - t0)

        train_acc = accuracy(logits[train_mask], labels[train_mask])

        if args.fastmode:
            val_acc = accuracy(logits[val_mask], labels[val_mask])
        else:
            val_acc = evaluate(model, features, labels, val_mask)
            if args.early_stop:
                if stopper.step(val_acc, model):
                    break

        print("Epoch {:05d} | Time(s) {:.4f} | Loss {:.4f} | TrainAcc {:.4f} |"
              " ValAcc {:.4f} | ETputs(KTEPS) {:.2f}".format(
                  epoch, np.mean(dur), loss.item(), train_acc, val_acc,
                  n_edges / np.mean(dur) / 1000))

    print()
    if args.early_stop:
        model.load_state_dict(torch.load('es_checkpoint.pt'))
    acc = evaluate(model, features, labels, test_mask)
    print("Test Accuracy {:.4f}".format(acc))
    print(f"Time Consuming {np.sum(dur)}, Overall time {time.time() - start}")
Example #5
def main(args):
    # load and preprocess dataset
    # data = load_data(args)
    g, graph_labels = load_graphs(
        '/yushi/dataset/Amazon2M/Amazon2M_dglgraph.bin')
    assert len(g) == 1
    g = g[0]
    data = g.ndata
    features = torch.FloatTensor(data['feat'])
    labels = torch.LongTensor(data['label'])
    if hasattr(torch, 'BoolTensor'):
        train_mask = data['train_mask'].bool()
        val_mask = data['val_mask'].bool()
        test_mask = data['test_mask'].bool()
    # else:
    #     train_mask = torch.ByteTensor(data.train_mask)
    #     val_mask = torch.ByteTensor(data.val_mask)
    #     test_mask = torch.ByteTensor(data.test_mask)
    in_feats = features.shape[1]
    n_classes = 47
    n_edges = g.number_of_edges()
    print("""----Data statistics------'
      #Edges %d
      #Classes %d
      #Train samples %d
      #Val samples %d
      #Test samples %d""" %
          (n_edges, n_classes, train_mask.int().sum().item(),
           val_mask.int().sum().item(), test_mask.int().sum().item()))

    if args.gpu < 0:
        cuda = False
    else:
        cuda = True
        torch.cuda.set_device(args.gpu)
        features = features.cuda()
        labels = labels.cuda()
        train_mask = train_mask.cuda()
        val_mask = val_mask.cuda()
        test_mask = test_mask.cuda()

    # graph preprocess and calculate normalization factor
    # g = data.graph
    # add self loop
    # if args.self_loop:
    #     g.remove_edges_from(nx.selfloop_edges(g))
    #     g.add_edges_from(zip(g.nodes(), g.nodes()))
    # g = DGLGraph(g)
    n_edges = g.number_of_edges()
    # normalization
    degs = g.in_degrees().float()
    norm = torch.pow(degs, -0.5)
    norm[torch.isinf(norm)] = 0
    if cuda:
        norm = norm.cuda()
    g.ndata['norm'] = norm.unsqueeze(1)

    # create GCN model
    model = GCN(g, in_feats, args.n_hidden, n_classes, args.n_layers, F.relu,
                args.dropout)

    if cuda:
        model.cuda()
    print(model)
    loss_fcn = torch.nn.CrossEntropyLoss()

    # use optimizer
    optimizer = torch.optim.Adam(model.parameters(),
                                 lr=args.lr,
                                 weight_decay=args.weight_decay)

    # initialize graph
    dur = []
    start = time.time()
    for epoch in range(args.n_epochs):
        model.train()
        if epoch >= 3:
            t0 = time.time()
        # forward
        logits = model(features)
        loss = loss_fcn(logits[train_mask], labels[train_mask])

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        if epoch >= 3:
            dur.append(time.time() - t0)

        acc = evaluate(model, features, labels, val_mask)
        print(
            "Epoch {:05d} | Time(s) {:.4f} | Loss {:.4f} | Accuracy {:.4f} | "
            "ETputs(KTEPS) {:.2f}".format(epoch, np.mean(dur), loss.item(),
                                          acc, n_edges / np.mean(dur) / 1000))

    print()
    acc = evaluate(model, features, labels, val_mask)  # no test_mask
    print("Test accuracy {:.2%}".format(acc))
    print(
        f'Training Time Consuming: {np.sum(dur)}, all time cost: {time.time() - start}'
    )
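
The GCN model itself is not included in the snippet. The 'norm' field stored above holds D^-1/2, and in the stock DGL GCN example it is applied on both sides of the neighbourhood sum to obtain the symmetric normalization D^-1/2 A D^-1/2. A sketch of one layer's propagation step (function name assumed):

import dgl.function as fn

def gcn_propagate(g, h):
    # multiply by D^{-1/2} before and after aggregating neighbour features
    h = h * g.ndata['norm']
    g.ndata['h'] = h
    g.update_all(fn.copy_u('h', 'm'), fn.sum('m', 'h'))
    h = g.ndata.pop('h')
    return h * g.ndata['norm']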
Example #6
def main(args):
    torch.manual_seed(args.rnd_seed)
    np.random.seed(args.rnd_seed)
    random.seed(args.rnd_seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

    multitask_data = set(['ppi'])
    multitask = args.dataset in multitask_data

    # load and preprocess dataset
    assert args.dataset == 'amazon2m'
    g, graph_labels = load_graphs(
        '/yushi/dataset/Amazon2M/Amazon2M_dglgraph.bin')
    assert len(g) == 1
    g = g[0]
    data = g.ndata
    labels = torch.LongTensor(data['label'])
    if hasattr(torch, 'BoolTensor'):
        train_mask = data['train_mask'].bool()
        val_mask = data['val_mask'].bool()
        test_mask = data['test_mask'].bool()

    train_nid = np.nonzero(train_mask.cpu().numpy())[0].astype(np.int64)
    val_nid = np.nonzero(val_mask.cpu().numpy())[0].astype(np.int64)

    # Normalize features
    features = torch.FloatTensor(data['feat'])
    if args.normalize:
        train_feats = features[train_nid]
        scaler = sklearn.preprocessing.StandardScaler()
        scaler.fit(train_feats)
        features = scaler.transform(features)
    features = torch.FloatTensor(features)

    in_feats = features.shape[1]
    n_classes = 47
    n_edges = g.number_of_edges()

    n_train_samples = train_mask.int().sum().item()
    n_val_samples = val_mask.int().sum().item()
    n_test_samples = test_mask.int().sum().item()

    print("""----Data statistics------'
    #Edges %d
    #Classes %d
    #Train samples %d
    #Val samples %d
    #Test samples %d""" %
          (n_edges, n_classes,
           n_train_samples,
           n_val_samples,
           n_test_samples))
    # create GCN model
    if args.self_loop:
        print("adding self-loop edges")
        g = add_self_loop(g)
    # g = DGLGraph(g, readonly=True)

    # set device for dataset tensors
    if args.gpu < 0:
        cuda = False
        raise ValueError('no cuda')
    else:
        cuda = True
        torch.cuda.set_device(args.gpu)
        features = features.cuda()
        labels = labels.cuda()
        train_mask = train_mask.cuda()
        val_mask = val_mask.cuda()
        test_mask = test_mask.cuda()

    print(torch.cuda.get_device_name(0))

    g.ndata['features'] = features
    g.ndata['labels'] = labels
    g.ndata['train_mask'] = train_mask
    print('labels shape:', labels.shape)
    train_cluster_iterator = ClusterIter(
        args.dataset, g, args.psize, args.batch_size, train_nid, use_pp=args.use_pp)
    val_cluster_iterator = ClusterIter(
        args.dataset, g, args.psize_val, 1, val_nid, use_pp=False)

    print("features shape, ", features.shape)
    model = GraphSAGE(in_feats,
                      args.n_hidden,
                      n_classes,
                      args.n_layers,
                      F.relu,
                      args.dropout,
                      args.use_pp)

    if cuda:
        model.cuda()

    # logger and so on
    log_dir = save_log_dir(args)
    writer = SummaryWriter(log_dir)
    logger = Logger(os.path.join(log_dir, 'loggings'))
    logger.write(args)

    # Loss function
    if multitask:
        print('Using multi-label loss')
        loss_f = nn.BCEWithLogitsLoss()
    else:
        print('Using multi-class loss')
        loss_f = nn.CrossEntropyLoss()

    # use optimizer
    optimizer = torch.optim.Adam(model.parameters(),
                                 lr=args.lr,
                                 weight_decay=args.weight_decay)

    # set train_nids to cuda tensor
    if cuda:
        train_nid = torch.from_numpy(train_nid).cuda()
    print("current memory after model before training",
          torch.cuda.memory_allocated(device=train_nid.device) / 1024 / 1024)
    start_time = time.time()
    best_f1 = -1

    for epoch in range(args.n_epochs):
        for j, cluster in enumerate(train_cluster_iterator):
            # sync with upper level training graph
            cluster.copy_from_parent()
            model.train()
            # forward
            pred = model(cluster)
            batch_labels = cluster.ndata['labels']
            batch_train_mask = cluster.ndata['train_mask']
            loss = loss_f(pred[batch_train_mask],
                          batch_labels[batch_train_mask])

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            # in PPI case, `log_every` is chosen to log one time per epoch.
            # Choose your log freq dynamically when you want more info within one epoch
            if j % args.log_every == 0:
                print(f"epoch:{epoch}/{args.n_epochs}, Iteration {j}/"
                      f"{len(train_cluster_iterator)}:training loss", loss.item())
                writer.add_scalar('train/loss', loss.item(),
                                  global_step=j + epoch * len(train_cluster_iterator))
        print("current memory:",
              torch.cuda.memory_allocated(device=pred.device) / 1024 / 1024)

        # evaluate
        if epoch % args.val_every == 0:
            total_f1_mic = []
            total_f1_mac = []
            model.eval()
            for j, cluster in enumerate(val_cluster_iterator):
                cluster.copy_from_parent()
                with torch.no_grad():
                    logits = model(cluster)
                    batch_labels = cluster.ndata['labels']
                    # batch_val_mask = cluster.ndata['val_mask']
                    val_f1_mic, val_f1_mac = calc_f1(batch_labels.cpu().numpy(),
                                                        logits.cpu().numpy(), multitask)
                total_f1_mic.append(val_f1_mic)
                total_f1_mac.append(val_f1_mac)

            val_f1_mic = np.mean(total_f1_mic)
            val_f1_mac = np.mean(total_f1_mac)

            print(
                "Val F1-mic{:.4f}, Val F1-mac{:.4f}". format(val_f1_mic, val_f1_mac))
            if val_f1_mic > best_f1:
                best_f1 = val_f1_mic
                print('new best val f1:', best_f1)
                torch.save(model.state_dict(), os.path.join(
                    log_dir, 'best_model.pkl'))
            writer.add_scalar('val/f1-mic', val_f1_mic, global_step=epoch)
            writer.add_scalar('val/f1-mac', val_f1_mac, global_step=epoch)

    end_time = time.time()
    print(f'training using time {end_time - start_time}')

    # test
    if args.use_val:
        model.load_state_dict(torch.load(os.path.join(
            log_dir, 'best_model.pkl')))
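
add_self_loop, used here and in several of the other examples, appears to be a project-level helper. A minimal equivalent for a homogeneous graph simply connects every node to itself (recent DGL versions also provide dgl.add_self_loop):

def add_self_loop(g):
    # ensure every node receives at least its own message, avoiding zero in-degrees
    g.add_edges(g.nodes(), g.nodes())
    return g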
Example #7
def get_predictions(pdb_fn):
    if not pdb_fn.endswith('.pdb'):
        pdb_fn = pdb_fn + '.pdb'
    pdb_id = os.path.splitext(pdb_fn)[0]

    model_tmp = request.args.get('model')
    if model_tmp is None:
        net_tmp = net
    else:
        date_prefix = model_tmp.split('_')[0]
        # print(f'Change model to {date_prefix} -- {model_tmp}')
        net_tmp, loss, _ = my_models.get_model(
            model_name,
            device,
            prefix=date_prefix,
            path=Constants.UPDATED_MODELS_PATH)
        # print(f'loss: {loss}')

    start = time.time()
    print(f'Get predictions for {pdb_fn}')
    try:
        graph = load_graphs(
            os.path.join(Constants.SAVED_GRAPH_PATH,
                         pdb_id + Constants.GRAPH_EXTENSION))
        graph = graph[0][0]
    except Exception as e:
        print(
            e,
            os.path.join(Constants.SAVED_GRAPH_PATH,
                         pdb_id + Constants.GRAPH_EXTENSION))
        return {
            'success': False,
        }

    t = time.time()
    print(f'Preprocessed in {t - start}')

    with open(os.path.join(Constants.PDB_PATH, pdb_fn), 'r') as f:
        pdb_file = f.read()
        f.seek(0)
        with warnings.catch_warnings(record=True):
            bio_model = parser.get_structure(pdb_id, f)[0]
    protein_chains = get_protein_chains(bio_model)
    print(f'Detect proteins in {time.time() - t}')
    t = time.time()

    atom_dict = {}
    predictions = predict_percent(net_tmp, [graph], predict_type=predict_type)
    predictions = smooth_graph(graph, predictions)
    # print(predictions)
    atoms = get_atoms_list(protein_chains)
    for dgl_id, (atom, label) in enumerate(
            zip(atoms, graph.ndata[Constants.LABEL_NODE_NAME])):
        # atom_dict[atom.serial_number] = min(max(is_labeled_positive(atom) + 0.0 + random.uniform(-0.4, 0.4), 0), 1)
        # dgl_id = get_dgl_id(atom)
        # if int(atom.serial_number) != int(serial_number.item()):
        #     print('dafs', dgl_id, int(atom.serial_number), int(serial_number.item()))
        # print(int(serial_number.item()))
        atom_dict[int(atom.serial_number)] = float(predictions[dgl_id])
        # atom_dict[int(serial_number.item())] = float(label.item())
    # print(_get(graph.ndata[Constants.LABEL_NODE_NAME], predictions > 0.30535218, predictions))
    print(f'Predict atoms in {time.time() - t}')
    t = time.time()

    return {
        'protein_chains': [chain.id for chain in protein_chains],
        'predictions': atom_dict,
        'optimal_threshold': threshold,
        'file': pdb_file,
        'success': True,
    }
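
A caller of this endpoint might binarize the per-atom scores with the returned optimal_threshold. A hypothetical client-side usage (the PDB id is a placeholder):

result = get_predictions('1abc')
if result['success']:
    positive_atoms = [serial for serial, score in result['predictions'].items()
                      if score >= result['optimal_threshold']]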
Example #8
def main(args):
    # load and preprocess dataset
    # data = load_dgl_data(args)
    dataset = args.dataset
    # prefix = '/mnt/yushi/'
    # prefix = 'graphzoom'
    dataset_dir = f'{args.prefix}/dataset/{dataset}'
    # data = load_data(dataset_dir, args.dataset)

    load_data_time = time.time()
    # if dataset in ['Amazon2M', 'reddit']:
    if dataset in ['Amazon2M']:
        g, _ = load_graphs(
            f'{args.prefix}/dataset/Amazon2M/Amazon2M_dglgraph.bin')
        g = g[0]
        data = g.ndata
        features = torch.FloatTensor(data['feat'])
        onehot_labels = F.one_hot(data['label']).numpy()
        train_mask = data['train_mask'].bool()
        val_mask = data['val_mask'].bool()
        test_mask = val_mask
        data = EasyDict({
            'graph': g,
            'labels': data['label'],
            'onehot_labels': onehot_labels,
            'features': data['feat'],
            'train_mask': train_mask,
            'val_mask': val_mask,
            'test_mask': test_mask,
            'num_labels': onehot_labels.shape[1],
            'coarse': False
        })
    else:
        data = load_dgl_data(args)
    original_adj, labels, train_ids, test_ids, train_labels, test_labels, feats = load_data(
        dataset_dir, args.dataset)
    labels = torch.LongTensor(labels)
    train_mask = _sample_mask(train_ids, labels.shape[0])
    onehot_labels = F.one_hot(labels).numpy()
    if dataset == 'reddit':
        g = data.graph
    else:
        g = DGLGraph(data.graph)
        val_ids = test_ids[1000:1500]
        test_ids = test_ids[:1000]
        test_mask = _sample_mask(test_ids, labels.shape[0])
        val_mask = _sample_mask(val_ids, labels.shape[0])
        data = EasyDict({
            'graph': data.graph,
            'labels': labels,
            'onehot_labels': onehot_labels,
            # 'features': feats,
            'features': data.features,
            'train_mask': train_mask,
            'val_mask': val_mask,
            'test_mask': test_mask,
            'num_labels': onehot_labels.shape[1],
            'coarse': False
        })
        # g = DGLGraph(data.graph)
    print(f'load data finished: {time.time() - load_data_time}')
    if args.coarse:
        # * load projection matrix
        levels = args.level
        reduce_results = f"graphzoom/reduction_results/{dataset}/fusion/"
        projections, coarse_adj = construct_proj_laplacian(
            original_adj, levels + 1, reduce_results)  # coarsen levels
        # *calculate coarse feature, labels
        # label_mask = np.expand_dims(data.train_mask, 1)
        # coarse_labels = projections[0] @ (onehot_labels * label_mask)
        print('creating coarse DGLGraph')
        start = time.process_time()
        # ! what will happen if g is assigned to other variables later
        multi_level_dglgraph = [g]
        for i in range(1, len(coarse_adj)):
            g = DGLGraph()
            g.from_scipy_sparse_matrix(coarse_adj[i])
            multi_level_dglgraph.append(g)
            data.features = projections[i - 1] @ data.features
        multi_level_dglgraph.reverse()
        projections.reverse()
        projections = projections[1:]
        for projection in range(len(projections)):
            coo = projections[projection].tocoo()
            values = coo.data
            indices = np.vstack((coo.row, coo.col))
            i = torch.LongTensor(indices)
            v = torch.FloatTensor(values)
            projections[projection] = torch.sparse.FloatTensor(
                i, v, torch.Size(coo.shape)).cuda()
        print(f'creating finished in {time.process_time() - start}')
        # * new train/test masks

        # *replace datao
    labels = torch.LongTensor(data.labels)
    loss_fcn = torch.nn.CrossEntropyLoss()
    features = torch.FloatTensor(data.features)
    if hasattr(torch, 'BoolTensor'):
        train_mask = torch.BoolTensor(data.train_mask)
        val_mask = torch.BoolTensor(data.val_mask)
        test_mask = torch.BoolTensor(data.test_mask)
    else:
        train_mask = torch.ByteTensor(data.train_mask)
        val_mask = torch.ByteTensor(data.val_mask)
        test_mask = torch.ByteTensor(data.test_mask)
    in_feats = features.shape[1]
    n_classes = data.num_labels

    cuda = True
    torch.cuda.set_device(args.gpu)
    features = features.cuda()
    labels = labels.cuda()
    train_mask = train_mask.cuda()
    val_mask = val_mask.cuda()
    test_mask = test_mask.cuda()

    # graph preprocess and calculate normalization factor
    # add self loop
    if args.self_loop or args.arch == 'gat':
        for i in range(len(multi_level_dglgraph)):
            multi_level_dglgraph[i] = add_self_loop(multi_level_dglgraph[i])
        print('add self_loop')
    n_edges = multi_level_dglgraph[0].number_of_edges()
    print("""----Data statistics------'
      # Edges %d
      # Classes %d
      # Train samples %d
      # Val samples %d
      # Test samples %d""" %
          (n_edges, n_classes, train_mask.int().sum().item(),
           val_mask.int().sum().item(), test_mask.int().sum().item()))
    # normalization
    degs = g.in_degrees().float()
    norm = torch.pow(degs, -0.5)
    norm[torch.isinf(norm)] = 0
    if cuda:
        norm = norm.cuda()
    g.ndata['norm'] = norm.unsqueeze(1)

    # * create GCN model
    heads = ([args.num_heads] * args.num_layers) + [args.num_out_heads]
    model = create_model(
        args.arch,
        multi_level_dglgraph,
        num_layers=args.level - 1,
        in_dim=in_feats,
        num_hidden=args.num_hidden,
        num_classes=n_classes,
        heads=heads,
        #  activation=F.elu,
        feat_drop=args.in_drop,
        attn_drop=args.attn_drop,
        negative_slope=args.negative_slope,
        residual=args.residual,
        log_softmax=False,
        projection_matrix=projections)

    if cuda:
        model.cuda()
    print(model)
    # loss_fcn = torch.nn.CrossEntropyLoss()

    # use optimizer
    optimizer = torch.optim.Adam(model.parameters(),
                                 lr=args.lr,
                                 weight_decay=args.weight_decay)

    # initialize graph
    dur = []
    acc = 0
    start = time.time()
    for epoch in range(args.n_epochs):
        model.train()
        if epoch >= 3:
            t0 = time.time()
        # forward
        logits, h = model(features)
        loss = loss_fcn(logits[train_mask], labels[train_mask])

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        if epoch >= 3:
            dur.append(time.time() - t0)

        # if not args.coarse:
        acc = evaluate(model, features, labels, val_mask)
        print(
            "Epoch {:05d} | Time(s) {:.4f} | Loss {:.4f} | Accuracy {:.4f} | "
            "ETputs(KTEPS) {:.2f}".format(epoch, np.mean(dur), loss.item(),
                                          acc, n_edges / np.mean(dur) / 1000))
    print(f'training time: {time.time() - start}')
    # if not args.coarse:
    acc = evaluate(model, features, labels, test_mask)
    # print(h.shape)
    # np.save(f'embeddings/{(args.arch).upper()}_{dataset}_emb_level_1_mask',
    #         h.detach().cpu().numpy())
    # torch.save(model.state_dict(),
    #            f'embeddings/{(args.arch).upper()}_{dataset}_emb_level_1_params.pth.tar',)
    print("Test accuracy {:.2%}".format(acc))
Example #9
def main(args):
    # load and preprocess dataset
    # data = load_dgl_data(args)
    dataset = args.dataset
    # prefix = '/mnt/yushi/'
    # prefix = 'graphzoom'
    dataset_dir = f'{args.prefix}/dataset/{dataset}'
    # data = load_data(dataset_dir, args.dataset)

    load_data_time = time.time()
    # if dataset in ['Amazon2M', 'reddit']:
    if dataset in ['Amazon2M']:
        g, _ = load_graphs(
            f'{args.prefix}/dataset/Amazon2M/Amazon2M_dglgraph.bin')
        g = g[0]
        data = g.ndata
        features = torch.FloatTensor(data['feat'])
        onehot_labels = F.one_hot(data['label']).numpy()
        train_mask = data['train_mask'].bool()
        val_mask = data['val_mask'].bool()
        test_mask = val_mask
        data = EasyDict({
            'graph': g,
            'labels': data['label'],
            'onehot_labels': onehot_labels,
            'features': data['feat'],
            'train_mask': train_mask,
            'val_mask': val_mask,
            'test_mask': test_mask,
            'num_labels': onehot_labels.shape[1],
            'coarse': False
        })
    else:
        original_adj, labels, train_ids, test_ids, train_labels, test_labels, feats = load_data(
            dataset_dir, args.dataset)
        data = load_dgl_data(args)
        labels = torch.LongTensor(labels)
        train_mask = _sample_mask(train_ids, labels.shape[0])
        onehot_labels = F.one_hot(labels).numpy()
        if dataset == 'reddit':
            g = data.graph
        else:
            val_ids = test_ids[1000:1500]
            test_ids = test_ids[:1000]
            test_mask = _sample_mask(test_ids, labels.shape[0])
            val_mask = _sample_mask(val_ids, labels.shape[0])
            data = EasyDict({
                'graph': data.graph,
                'labels': labels,
                'onehot_labels': onehot_labels,
                'features': feats,
                # 'features': data.features,
                'train_mask': train_mask,
                'val_mask': val_mask,
                'test_mask': test_mask,
                'num_labels': onehot_labels.shape[1],
                'coarse': False
            })
            # g = DGLGraph(data.graph)
    print(f'load data finished: {time.time() - load_data_time}')
    if args.coarse:
        # * load projection matrix
        levels = args.level
        reduce_results = f"graphzoom/reduction_results/{dataset}/fusion/"
        projections, coarse_adj = construct_proj_laplacian(
            original_adj, levels, reduce_results)
        # *calculate coarse feature, labels
        label_mask = np.expand_dims(data.train_mask, 1)
        onehot_labels = onehot_labels * label_mask
        for i in range(levels):
            data.features = projections[i] @ data.features
            onehot_labels = projections[i] @ onehot_labels
        # coarse_labels = projections[0] @ onehot_labels
        # ! add train_mask
        rows_sum = onehot_labels.sum(axis=1)[:, np.newaxis]
        norm_coarse_labels = onehot_labels / rows_sum
        norm_label_entropy = Categorical(
            torch.Tensor(norm_coarse_labels)).entropy()
        label_entropy_mask = torch.BoolTensor(norm_label_entropy < 0.01)
        coarse_train_mask = torch.BoolTensor(onehot_labels.sum(axis=1))
        # coarse_train_mask = label_entropy_mask
        # ! entropy threshold

        # coarse_graph = nx.Graph(coarse_adj[1])
        print('creating coarse DGLGraph')
        start = time.process_time()
        g = DGLGraph()
        g.from_scipy_sparse_matrix(coarse_adj[1])
        print(f'creating finished in {time.process_time() - start}')
        # list(map(np.shape, [coarse_embed, coarse_labels]))
        # * new train/test masks
        coarsen_ratio = projections[0].shape[1] / projections[0].shape[0]
        # coarse_train_mask = _sample_mask(
        #     range(int(coarsen_ratio*len(data.train_mask.int().sum().item()))),
        #     norm_coarse_labels.shape[0])
        # coarse_train_mask = _sample_mask(
        #     range(norm_coarse_labels.shape[0]),
        # range(60),
        # norm_coarse_labels.shape[0])
        # coarse_test_mask = _sample_mask(
        #     range(100, 700), norm_coarse_labels.shape[0])
        # coarse_val_mask = _sample_mask(
        #     range(700, 1000), norm_coarse_labels.shape[0])

        # *replace data
        data = EasyDict({
            'graph': g,
            'labels': onehot_labels,
            #     'onehot_labels': onehot_labels,
            'features': data.features,
            'train_mask': coarse_train_mask,
            # 'val_mask': coarse_val_mask,
            # 'test_mask': coarse_test_mask,
            'num_classes': norm_coarse_labels.shape[1],
            'num_labels': onehot_labels.shape[1],
            'coarse': True
        })
    if args.coarse:
        labels = torch.FloatTensor(data.labels)
        loss_fcn = torch.nn.KLDivLoss(reduction='batchmean')
        print('training coarse')
    else:
        labels = torch.LongTensor(data.labels)
        loss_fcn = torch.nn.CrossEntropyLoss()
    features = torch.FloatTensor(data.features)
    if hasattr(torch, 'BoolTensor'):
        train_mask = torch.BoolTensor(data.train_mask)
        val_mask = torch.BoolTensor(data.val_mask)
        test_mask = torch.BoolTensor(data.test_mask)
    else:
        train_mask = torch.ByteTensor(data.train_mask)
        val_mask = torch.ByteTensor(data.val_mask)
        test_mask = torch.ByteTensor(data.test_mask)
    in_feats = features.shape[1]
    n_classes = data.num_labels

    if args.gpu < 0:
        cuda = False
    else:
        cuda = True
        torch.cuda.set_device(args.gpu)
        features = features.cuda()
        labels = labels.cuda()
        train_mask = train_mask.cuda()
        val_mask = val_mask.cuda()
        test_mask = test_mask.cuda()

    # graph preprocess and calculate normalization factor
    # add self loop
    if args.self_loop or args.arch == 'gat':
        g = add_self_loop(data.graph)
        print('add self_loop')
    n_edges = g.number_of_edges()
    print("""----Data statistics------'
      # Edges %d
      # Classes %d
      # Train samples %d
      # Val samples %d
      # Test samples %d""" %
          (n_edges, n_classes, train_mask.int().sum().item(),
           val_mask.int().sum().item(), test_mask.int().sum().item()))
    # normalization
    degs = g.in_degrees().float()
    norm = torch.pow(degs, -0.5)
    norm[torch.isinf(norm)] = 0
    if cuda:
        norm = norm.cuda()
    g.ndata['norm'] = norm.unsqueeze(1)

    # * create GCN model
    heads = ([args.num_heads] * args.num_layers) + [args.num_out_heads]
    model = create_model(
        args.arch,
        g,
        num_layers=args.num_layers,
        in_dim=in_feats,
        num_hidden=args.num_hidden,
        num_classes=n_classes,
        heads=heads,
        #  activation=F.elu,
        feat_drop=args.in_drop,
        attn_drop=args.attn_drop,
        negative_slope=args.negative_slope,
        residual=args.residual,
        log_softmax=args.coarse)

    if cuda:
        model.cuda()
    print(model)
    # loss_fcn = torch.nn.CrossEntropyLoss()

    # use optimizer
    optimizer = torch.optim.Adam(model.parameters(),
                                 lr=args.lr,
                                 weight_decay=args.weight_decay)

    # initialize graph
    dur = []
    acc = 0
    start = time.time()
    for epoch in range(args.n_epochs):
        model.train()
        if epoch >= 3:
            t0 = time.time()
        # forward
        logits, h = model(features)
        loss = loss_fcn(logits[train_mask], labels[train_mask])  # ?

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        if epoch >= 3:
            dur.append(time.time() - t0)

        if not args.coarse:
            acc = evaluate(model, features, labels, val_mask)
        print(
            "Epoch {:05d} | Time(s) {:.4f} | Loss {:.4f} | Accuracy {:.4f} | "
            "ETputs(KTEPS) {:.2f}".format(epoch, np.mean(dur), loss.item(),
                                          acc, n_edges / np.mean(dur) / 1000))
    print(f'training time: {time.time() - start}')
    if not args.coarse:
        acc = evaluate(model, features, labels, test_mask)
    print(h.shape)
    np.save(f'embeddings/{(args.arch).upper()}_{dataset}_emb_level_1_mask',
            h.detach().cpu().numpy())
    torch.save(
        model.state_dict(),
        f'embeddings/{(args.arch).upper()}_{dataset}_emb_level_1_params.pth.tar',
    )
    print("Test accuracy {:.2%}".format(acc))
Example #10
    def load_pickled_graph(self, i):
        pickled_filename, _ = self.extract_pickle_filename_and_folder(i)
        g, _ = load_graphs(pickled_filename)
        g = g[0]
        return g