Example #1
    def __init__(self, path: str):
        # Load the OGB dataset together with its official train/valid/test split.
        # The wrapped dataset is "ogbg-code2": its graphs carry the AST-specific
        # "node_is_attributed", "node_dfs_order" and "node_depth" fields used below.
        ogbl_dataset = GraphPropPredDataset("ogbg-code2", path)
        idx_split: _typing.Mapping[str,
                                   np.ndarray] = ogbl_dataset.get_idx_split()
        train_index: _typing.Any = idx_split['train'].tolist()
        test_index: _typing.Any = idx_split['test'].tolist()
        val_index: _typing.Any = idx_split['valid'].tolist()
        super(OGBGCode2Dataset, self).__init__([
            GeneralStaticGraphGenerator.create_homogeneous_static_graph(({
                "feat":
                torch.from_numpy(data['node_feat']),
                "node_is_attributed":
                torch.from_numpy(data["node_is_attributed"]),
                "node_dfs_order":
                torch.from_numpy(data["node_dfs_order"]),
                "node_depth":
                torch.from_numpy(data["node_depth"])
            } if _backend.DependentBackend.is_dgl() else {
                "x":
                torch.from_numpy(data['node_feat']),
                "node_is_attributed":
                torch.from_numpy(data["node_is_attributed"]),
                "node_dfs_order":
                torch.from_numpy(data["node_dfs_order"]),
                "node_depth":
                torch.from_numpy(data["node_depth"])
            }), torch.from_numpy(data['edge_index']))
            for data, label in ogbl_dataset
        ], train_index, val_index, test_index)
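
For orientation, here is a small hedged sketch (an addition, not part of the project above) of what GraphPropPredDataset yields per sample. ogbg-molhiv is used to keep the download small; ogbg-code2 graphs additionally carry the node_is_attributed, node_dfs_order and node_depth arrays consumed by the wrapper above.

from ogb.graphproppred import GraphPropPredDataset

# Each item is a (graph_dict, label) pair; the graph dict holds plain numpy arrays.
dataset = GraphPropPredDataset("ogbg-molhiv", "dataset")
graph, label = dataset[0]
print(sorted(graph.keys()))        # ['edge_feat', 'edge_index', 'node_feat', 'num_nodes']
print(graph["edge_index"].shape)   # (2, num_edges)
print(graph["node_feat"].shape)    # (num_nodes, 9) -- OGB's 9 atom features
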
Example #2
File: base_dataset.py Project: zzs95/PGL
class Dataset(BaseDataset):
    def __init__(self, args):
        self.args = args
        self.raw_dataset = GraphPropPredDataset(name=args.dataset_name)
        self.num_tasks = self.raw_dataset.num_tasks
        self.eval_metrics = self.raw_dataset.eval_metric
        self.task_type = self.raw_dataset.task_type

        self.pgl_graph_list = []
        self.graph_label_list = []
        for i in range(len(self.raw_dataset)):
            graph, label = self.raw_dataset[i]
            edges = list(zip(graph["edge_index"][0], graph["edge_index"][1]))
            g = pgl.graph.Graph(num_nodes=graph["num_nodes"], edges=edges)

            if graph["edge_feat"] is not None:
                g.edge_feat["feat"] = graph["edge_feat"]

            if graph["node_feat"] is not None:
                g.node_feat["feat"] = graph["node_feat"]

            self.pgl_graph_list.append(g)
            self.graph_label_list.append(label)

    def __getitem__(self, idx):
        return self.pgl_graph_list[idx], self.graph_label_list[idx]

    def __len__(self):
        return len(self.pgl_graph_list)

    def get_idx_split(self):
        return self.raw_dataset.get_idx_split()
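
A hedged usage sketch for the wrapper above, assuming base_dataset.py is importable and pgl is installed; argparse.Namespace stands in for the project's real args object.

import argparse
from base_dataset import Dataset  # the class defined above

args = argparse.Namespace(dataset_name="ogbg-molhiv")
ds = Dataset(args)
split = ds.get_idx_split()                 # official numpy index arrays: train/valid/test
graph, label = ds[int(split["train"][0])]  # a pgl.graph.Graph and its label
print(len(ds), graph.num_nodes, label)
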
Example #3
File: ogb.py Project: sofyc/cogdl
class OGBGDataset(Dataset):
    def __init__(self, root, name):
        super(OGBGDataset, self).__init__(root)
        self.name = name
        self.dataset = GraphPropPredDataset(self.name, root)

        self.graphs = []
        self.all_nodes = 0
        self.all_edges = 0
        for i in range(len(self.dataset.graphs)):
            graph, label = self.dataset[i]
            data = Graph(
                x=torch.tensor(graph["node_feat"], dtype=torch.float),
                edge_index=torch.tensor(graph["edge_index"]),
                edge_attr=None if "edge_feat" not in graph else torch.tensor(
                    graph["edge_feat"], dtype=torch.float),
                y=torch.tensor(label),
            )
            data.num_nodes = graph["num_nodes"]
            self.graphs.append(data)

            self.all_nodes += graph["num_nodes"]
            self.all_edges += graph["edge_index"].shape[1]

        self.transform = None

    def get_loader(self, args):
        split_index = self.dataset.get_idx_split()
        train_loader = DataLoader(self.get_subset(split_index["train"]),
                                  batch_size=args.batch_size,
                                  shuffle=True)
        valid_loader = DataLoader(self.get_subset(split_index["valid"]),
                                  batch_size=args.batch_size,
                                  shuffle=False)
        test_loader = DataLoader(self.get_subset(split_index["test"]),
                                 batch_size=args.batch_size,
                                 shuffle=False)
        return train_loader, valid_loader, test_loader

    def get_subset(self, subset):
        datalist = []
        for idx in subset:
            datalist.append(self.graphs[idx])
        return datalist

    def get(self, idx):
        return self.graphs[idx]

    def _download(self):
        pass

    def _process(self):
        pass

    @property
    def num_classes(self):
        return int(self.dataset.num_classes)
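
get_loader above simply routes the official split indices through get_subset; a self-contained check (using only ogb) that those indices add up to the whole dataset:

from ogb.graphproppred import GraphPropPredDataset

dataset = GraphPropPredDataset("ogbg-molhiv")
split = dataset.get_idx_split()
sizes = {name: len(idx) for name, idx in split.items()}
print(sizes)                                # e.g. {'train': 32901, 'valid': 4113, 'test': 4113}
assert sum(sizes.values()) == len(dataset)  # split sizes add up to the full dataset
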
Example #4
    def __init__(self, path: str):
        # Load the OGB dataset together with its official train/valid/test split.
        # "ogbg-ppa" graphs have no input node features, so only node IDs are stored.
        ogbl_dataset = GraphPropPredDataset("ogbg-ppa", path)
        idx_split: _typing.Mapping[str,
                                   np.ndarray] = ogbl_dataset.get_idx_split()
        train_index: _typing.Any = idx_split['train'].tolist()
        test_index: _typing.Any = idx_split['test'].tolist()
        val_index: _typing.Any = idx_split['valid'].tolist()
        super(OGBGPPADataset, self).__init__([
            GeneralStaticGraphGenerator.create_homogeneous_static_graph(
                {'_NID': torch.arange(data['num_nodes'])},
                torch.from_numpy(data['edge_index']),
                {'edge_feat': torch.from_numpy(data['edge_feat'])},
                ({
                    'label': torch.from_numpy(label)
                } if _backend.DependentBackend.is_dgl() else {
                    'y': torch.from_numpy(label)
                })) for data, label in ogbl_dataset
        ], train_index, val_index, test_index)
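
The OGBGPPADataset wrapper keeps only a node-ID tensor because ogbg-ppa ships without input node features; a quick hedged check against the raw dataset (note that the ogbg-ppa download is large):

from ogb.graphproppred import GraphPropPredDataset

dataset = GraphPropPredDataset("ogbg-ppa", "dataset")
graph, label = dataset[0]
print(graph["node_feat"])          # None: ppa provides no input node features
print(graph["edge_feat"].shape)    # (num_edges, 7) edge features
print(label)                       # graph-level class (taxonomic group) as a numpy array
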
Example #5
epochs = 10  # Number of training epochs (assumed value; used by the loaders below)
batch_size = 32  # Batch size

################################################################################
# Load data
################################################################################
dataset_name = "ogbg-molhiv"
ogb_dataset = GraphPropPredDataset(name=dataset_name)
dataset = OGB(ogb_dataset)

# Parameters
F = dataset.n_node_features  # Dimension of node features
S = dataset.n_edge_features  # Dimension of edge features
n_out = dataset.n_labels  # Dimension of the target

# Train/test split
idx = ogb_dataset.get_idx_split()
idx_tr, idx_va, idx_te = idx["train"], idx["valid"], idx["test"]
dataset_tr = dataset[idx_tr]
dataset_va = dataset[idx_va]
dataset_te = dataset[idx_te]

loader_tr = DisjointLoader(dataset_tr, batch_size=batch_size, epochs=epochs)
loader_te = DisjointLoader(dataset_te, batch_size=batch_size, epochs=1)

################################################################################
# Build model
################################################################################
X_in = Input(shape=(F,))
A_in = Input(shape=(None,), sparse=True)
E_in = Input(shape=(S,))
I_in = Input(shape=(), dtype=tf.int64)
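
The snippet stops at the Keras Input layers. A hedged continuation (an illustrative sketch, not the original script) that completes the model with Spektral's disjoint-mode layers and trains from the loaders defined above:

from tensorflow.keras.layers import Dense
from tensorflow.keras.models import Model
from spektral.layers import ECCConv, GlobalSumPool

# Two edge-conditioned convolutions, a sum-pool over each graph, then a task head.
x = ECCConv(32, activation="relu")([X_in, A_in, E_in])
x = ECCConv(32, activation="relu")([x, A_in, E_in])
x = GlobalSumPool()([x, I_in])
output = Dense(n_out, activation="sigmoid")(x)

model = Model(inputs=[X_in, A_in, E_in, I_in], outputs=output)
model.compile(optimizer="adam", loss="binary_crossentropy")
model.fit(loader_tr.load(), steps_per_epoch=loader_tr.steps_per_epoch, epochs=epochs)
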
Example #6
def main(_):
    tf.keras.mixed_precision.set_global_policy("float16" if FLAGS.dtype == 'float16' else "float32")

    dset_name = 'ogbg-molhiv'
    dataset = GraphPropPredDataset(name=dset_name, )
    split_idx = dataset.get_idx_split()
    train_idx, valid_idx, test_idx = split_idx["train"], split_idx["valid"], split_idx["test"]

    ds = data.get_tf_dataset(FLAGS.batch_size, [dataset[idx] for idx in train_idx], shuffle=True)
    val_ds = data.get_tf_dataset(FLAGS.batch_size, [dataset[idx] for idx in valid_idx], shuffle=False)
    strategy = xpu.configure_and_get_strategy()

    if FLAGS.total_batch_size is not None:
        gradient_accumulation_factor = FLAGS.total_batch_size // FLAGS.batch_size
    else:
        gradient_accumulation_factor = 1

    # pre-calculated number of steps per epoch (note: will vary somewhat for training, due to packing,
    #  but is found to be fairly consistent)
    steps = {
        32: (1195, 162, 148),
        64: (585, 80, 73),
        128: (288, 40, 37),
        256: (143, 20, 18)
    }
    try:
        steps_per_epoch, val_steps_per_epoch, test_steps_per_epoch = steps[FLAGS.batch_size]
    except KeyError:
        print("Batch size should have the number of steps defined")
        raise KeyError()

    # need the steps per epoch to be divisible by the gradient accumulation factor
    steps_per_epoch = gradient_accumulation_factor * (steps_per_epoch // gradient_accumulation_factor)

    # we apply a linear scaling rule for learning rate with batch size, which we benchmark against BS=128
    batch_size = FLAGS.total_batch_size or FLAGS.batch_size
    lr = FLAGS.lr * batch_size / 128

    with strategy.scope():
        model = create_model()
        utils.print_trainable_variables(model)

        losses = tf.keras.losses.BinaryCrossentropy()
        if FLAGS.opt.lower() == 'sgd':
            opt = tf.keras.optimizers.SGD(learning_rate=lr)
        elif FLAGS.opt.lower() == 'adam':
            opt = tf.keras.optimizers.Adam(learning_rate=lr)
        else:
            raise NotImplementedError()

        callbacks = []

        if not os.path.isdir(FLAGS.model_dir):
            os.makedirs(FLAGS.model_dir)
        # randomly named directory
        model_dir = os.path.join(FLAGS.model_dir, str(uuid.uuid4()))

        print(f"Saving weights to {model_dir}")
        model_path = os.path.join(model_dir, 'model')

        callbacks.append(tf.keras.callbacks.ModelCheckpoint(
            model_path, monitor="val_loss", verbose=1, save_best_only=True,
            save_weights_only=True, mode="min", save_freq="epoch")
        )

        callbacks.append(ThroughputCallback(
            samples_per_epoch=steps_per_epoch * FLAGS.batch_size * gradient_accumulation_factor))
        if FLAGS.reduce_lr_on_plateau_patience > 0:
            callbacks.append(tf.keras.callbacks.ReduceLROnPlateau(
                monitor='val_loss', mode='min', factor=FLAGS.reduce_lr_on_plateau_factor,
                patience=FLAGS.reduce_lr_on_plateau_patience, min_lr=1e-8, verbose=1)
            )

        if FLAGS.early_stopping_patience > 0:
            print(f"Training will stop early after {FLAGS.early_stopping_patience} epochs without improvement.")
            callbacks.append(
                tf.keras.callbacks.EarlyStopping(
                    monitor='val_loss', min_delta=0, patience=FLAGS.early_stopping_patience,
                    verbose=1, mode='min', baseline=None, restore_best_weights=False)
            )

        # weighted metrics are used because of the batch packing
        model.compile(optimizer=opt, loss=losses,
                      weighted_metrics=[tf.keras.metrics.BinaryAccuracy(), tf.keras.metrics.AUC()],
                      steps_per_execution=steps_per_epoch)

        # if the total batch size exceeds the compute batch size
        model.set_gradient_accumulation_options(gradient_accumulation_steps_per_replica=gradient_accumulation_factor)

        model.fit(ds,
                  steps_per_epoch=steps_per_epoch,
                  epochs=FLAGS.epochs,
                  validation_data=val_ds,
                  validation_steps=val_steps_per_epoch,
                  callbacks=callbacks
                  )

        # we will use the official AUC evaluator from the OGB repo, not the keras one
        model.load_weights(model_path)
        print("Loaded best validation weights for evaluation")

        evaluator = Evaluator(name='ogbg-molhiv')
        for test_or_val, idx, steps in zip(
                ('validation', 'test'),
                (valid_idx, test_idx),
                (val_steps_per_epoch, test_steps_per_epoch)):
            prediction, ground_truth = get_predictions(model, dataset, idx, steps)
            result = evaluator.eval({'y_true': ground_truth[:, None], 'y_pred': prediction[:, None]})

            print(f'Final {test_or_val} ROC-AUC {result["rocauc"]:.3f}')
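
get_predictions and create_model are project-specific, but the final scoring relies only on the official OGB evaluator; a self-contained sketch of its input contract, with random scores standing in for model output:

import numpy as np
from ogb.graphproppred import Evaluator

evaluator = Evaluator(name="ogbg-molhiv")
y_true = np.random.randint(0, 2, size=(100, 1))  # one binary label per graph
y_pred = np.random.rand(100, 1)                  # predicted scores of the same shape
print(evaluator.expected_input_format)           # describes the expected dict and shapes
print(evaluator.eval({"y_true": y_true, "y_pred": y_pred}))  # {'rocauc': ...}
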
Example #7
################################################################################
# PARAMETERS
################################################################################
learning_rate = 1e-3  # Learning rate
epochs = 10  # Number of training epochs
batch_size = 32  # Batch size

################################################################################
# LOAD DATA
################################################################################
dataset_name = 'ogbg-molhiv'
dataset = GraphPropPredDataset(name=dataset_name)
n_out = dataset.num_tasks

idx = dataset.get_idx_split()
tr_idx, va_idx, te_idx = idx["train"], idx["valid"], idx["test"]

X_tr, A_tr, E_tr, y_tr = ogb.dataset_to_numpy(dataset, tr_idx, dtype='f8')
X_va, A_va, E_va, y_va = ogb.dataset_to_numpy(dataset, va_idx, dtype='f8')
X_te, A_te, E_te, y_te = ogb.dataset_to_numpy(dataset, te_idx, dtype='f8')

F = X_tr[0].shape[-1]
S = E_tr[0].shape[-1]

################################################################################
# BUILD MODEL
################################################################################
X_in = Input(shape=(F, ))
A_in = Input(shape=(None, ), sparse=True)
E_in = Input(shape=(S, ))
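
Besides num_tasks (used for n_out above), the raw dataset object also reports the task type and the leaderboard metric, which is what Examples #2 and #10 read as well; a small self-contained check:

from ogb.graphproppred import GraphPropPredDataset

dataset = GraphPropPredDataset(name="ogbg-molhiv")
print(dataset.num_tasks)    # 1
print(dataset.task_type)    # 'binary classification'
print(dataset.eval_metric)  # 'rocauc'
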
Example #8
def main(args):
    all_probs = {}
    all_ap = {}
    all_rocs = {}
    train_label_props = {}

    n_estimators = 1000
    max_tasks = None
    run_times = 10

    eval_scores = []
    test_scores = []

    mgf_file = "./dataset/%s/mgf_feat.npy" % (args.dataset_name.replace("-", "_"))
    soft_mgf_file = "./dataset/%s/soft_mgf_feat.npy" % (args.dataset_name.replace("-", "_"))
    maccs_file = "./dataset/%s/maccs_feat.npy" % (args.dataset_name.replace("-", "_"))
    mgf_feat = np.load(mgf_file)
    soft_mgf_feat = np.load(soft_mgf_file)
    maccs_feat = np.load(maccs_file)
    mgf_dim = mgf_feat.shape[1]
    maccs_dim = maccs_feat.shape[1]

    dataset = GraphPropPredDataset(name=args.dataset_name)
    smiles_file = "dataset/%s/mapping/mol.csv.gz" % (args.dataset_name.replace("-", "_"))
    df_smi = pd.read_csv(smiles_file)
    smiles = df_smi["smiles"]
    outcomes = df_smi.set_index("smiles").drop(["mol_id"], axis=1)

    feat = np.concatenate([mgf_feat, soft_mgf_feat, maccs_feat], axis=1)
    X = pd.DataFrame(feat,
                     index=smiles,
                     columns=[i for i in range(feat.shape[1])])

    # Split into train/val/test
    split_idx = dataset.get_idx_split()
    train_idx, val_idx, test_idx = split_idx["train"], split_idx["valid"], split_idx["test"]

    X_train, X_val, X_test = X.iloc[train_idx], X.iloc[val_idx], X.iloc[test_idx]

    for rep in range(run_times):
        for oo in tqdm(outcomes.columns[:max_tasks]):
            # Get probabilities
            val_key = args.dataset_name, oo, rep, "val"
            test_key = args.dataset_name, oo, rep, "test"

            # If re-running, skip finished runs
            if val_key in all_probs:
                print("Skipping", val_key[:-1])
                continue

            # Split outcome in to train/val/test
            Y = outcomes[oo]
            y_train, y_val, y_test = Y.loc[X_train.index], Y.loc[X_val.index], Y.loc[X_test.index]

            # Skip outcomes with no positive training examples
            if y_train.sum() == 0:
                continue

            # Remove missing labels in validation
            y_val, y_test = y_val.dropna(), y_test.dropna()
            X_v, X_t = X_val.loc[y_val.index], X_test.loc[y_test.index]
            
            # Remove missing values in the training labels
            # (class imbalance is addressed via class_weight in the model below)
            y_tr = y_train.dropna()
            train_label_props[args.dataset_name, oo, rep] = y_tr.mean()
            print(f"Sampled label balance:\n{y_tr.value_counts()}")

            # Fit model
            print("Fitting model...")
            rf = RandomForestClassifier(min_samples_leaf=2,
                    n_estimators=n_estimators,
                    n_jobs=-1,
                    criterion='entropy',
                    class_weight={0:1, 1:10}
                    )
            rf.fit(X_train.loc[y_tr.index], y_tr)

            # Calculate probabilities
            all_probs[val_key] = pd.Series(rf.predict_proba(X_v)[:, 1], index=X_v.index)
            all_probs[test_key] = pd.Series(rf.predict_proba(X_t)[:, 1], index=X_t.index)

            if y_val.sum() > 0:
                all_ap[val_key] = average_precision_score(y_val, all_probs[val_key])
                all_rocs[val_key] = roc_auc_score(y_val, all_probs[val_key])
            
            if y_test.sum() > 0:
                all_ap[test_key] = average_precision_score(y_test, all_probs[test_key])
                all_rocs[test_key] = roc_auc_score(y_test, all_probs[test_key])

            print(f'{oo}, rep {rep}, AP (val, test): {all_ap.get(val_key, np.nan):.3f}, {all_ap.get(test_key, np.nan):.3f}')
            print(f'\tROC (val, test): {all_rocs.get(val_key, np.nan):.3f}, {all_rocs.get(test_key, np.nan):.3f}')
            eval_scores.append(all_rocs.get(val_key, np.nan))
            test_scores.append(all_rocs.get(test_key, np.nan))

    eval_avg = np.mean(eval_scores)
    eval_std = np.std(eval_scores, ddof=1)
    test_avg = np.mean(test_scores)
    test_std = np.std(test_scores, ddof=1)
    print("eval: ", eval_scores)
    print("test: ", test_scores)
    print("%s | eval and test: %.4f (%.4f),%.4f (%.4f)" % (args.dataset_name, eval_avg, eval_std, test_avg, test_std))
Example #9
File: dataloader.py Project: zzs95/PGL
            worker = mp_reader.multiprocess_reader(worker_pool,
                                                   use_pipe=True,
                                                   queue_size=1000)
            r = paddle.reader.buffered(worker, self.buf_size)

        for batch in r():
            yield batch

    def scan(self):
        """scan"""
        for example in self.dataset:
            yield example


if __name__ == "__main__":
    from base_dataset import BaseDataset, Subset
    dataset = GraphPropPredDataset(name="ogbg-molhiv")
    splitted_index = dataset.get_idx_split()
    train_dataset = Subset(dataset, splitted_index['train'])
    valid_dataset = Subset(dataset, splitted_index['valid'])
    test_dataset = Subset(dataset, splitted_index['test'])
    log.info("Train Examples: %s" % len(train_dataset))
    log.info("Val Examples: %s" % len(valid_dataset))
    log.info("Test Examples: %s" % len(test_dataset))

    #  train_loader = GraphDataloader(train_dataset, batch_size=3)
    #  for batch_data in train_loader:
    #      graphs, labels = batch_data
    #      print(labels.shape)
    #      time.sleep(4)
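
Subset here just restricts the wrapped dataset to the official split indices; a hedged, framework-free equivalent for reference (illustrative, not the PGL implementation):

from ogb.graphproppred import GraphPropPredDataset

class SimpleSubset:
    """Index a dataset through a fixed list of indices, like base_dataset.Subset."""

    def __init__(self, dataset, indices):
        self.dataset = dataset
        self.indices = list(indices)

    def __getitem__(self, i):
        return self.dataset[self.indices[i]]

    def __len__(self):
        return len(self.indices)

dataset = GraphPropPredDataset(name="ogbg-molhiv")
train_dataset = SimpleSubset(dataset, dataset.get_idx_split()["train"])
print("Train Examples: %s" % len(train_dataset))
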
Example #10
File: main.py Project: Yelrose/PGL
def main(config):
    if dist.get_world_size() > 1:
        dist.init_parallel_env()

    if dist.get_rank() == 0:
        timestamp = datetime.now().strftime("%Hh%Mm%Ss")
        log_path = os.path.join(config.log_dir,
                                "tensorboard_log_%s" % timestamp)
        writer = SummaryWriter(log_path)

    log.info("loading data")
    raw_dataset = GraphPropPredDataset(name=config.dataset_name)
    config.num_class = raw_dataset.num_tasks
    config.eval_metric = raw_dataset.eval_metric
    config.task_type = raw_dataset.task_type

    mol_dataset = MolDataset(config,
                             raw_dataset,
                             transform=make_multihop_edges)
    splitted_index = raw_dataset.get_idx_split()
    train_ds = Subset(mol_dataset, splitted_index['train'], mode='train')
    valid_ds = Subset(mol_dataset, splitted_index['valid'], mode="valid")
    test_ds = Subset(mol_dataset, splitted_index['test'], mode="test")

    log.info("Train Examples: %s" % len(train_ds))
    log.info("Val Examples: %s" % len(valid_ds))
    log.info("Test Examples: %s" % len(test_ds))

    fn = CollateFn(config)

    train_loader = Dataloader(train_ds,
                              batch_size=config.batch_size,
                              shuffle=True,
                              num_workers=config.num_workers,
                              collate_fn=fn)

    valid_loader = Dataloader(valid_ds,
                              batch_size=config.batch_size,
                              num_workers=config.num_workers,
                              collate_fn=fn)

    test_loader = Dataloader(test_ds,
                             batch_size=config.batch_size,
                             num_workers=config.num_workers,
                             collate_fn=fn)

    model = ClassifierNetwork(config.hidden_size, config.out_dim,
                              config.num_layers, config.dropout_prob,
                              config.virt_node, config.K, config.conv_type,
                              config.appnp_hop, config.alpha)
    model = paddle.DataParallel(model)

    optim = Adam(learning_rate=config.lr, parameters=model.parameters())
    criterion = nn.loss.BCEWithLogitsLoss()

    evaluator = Evaluator(config.dataset_name)

    best_valid = 0

    global_step = 0
    for epoch in range(1, config.epochs + 1):
        model.train()
        for idx, batch_data in enumerate(train_loader):
            g, mh_graphs, labels, unmask = batch_data
            g = g.tensor()
            multihop_graphs = []
            for item in mh_graphs:
                multihop_graphs.append(item.tensor())
            g.multi_hop_graphs = multihop_graphs
            labels = paddle.to_tensor(labels)
            unmask = paddle.to_tensor(unmask)

            pred = model(g)
            pred = paddle.masked_select(pred, unmask)
            labels = paddle.masked_select(labels, unmask)
            train_loss = criterion(pred, labels)
            train_loss.backward()
            optim.step()
            optim.clear_grad()

            if global_step % 80 == 0:
                message = "train: epoch %d | step %d | " % (epoch, global_step)
                message += "loss %.6f" % (train_loss.numpy())
                log.info(message)
                if dist.get_rank() == 0:
                    writer.add_scalar("loss", train_loss.numpy(), global_step)
            global_step += 1

        valid_result = evaluate(model, valid_loader, criterion, evaluator)
        message = "valid: epoch %d | step %d | " % (epoch, global_step)
        for key, value in valid_result.items():
            message += " | %s %.6f" % (key, value)
            if dist.get_rank() == 0:
                writer.add_scalar("valid_%s" % key, value, global_step)
        log.info(message)

        test_result = evaluate(model, test_loader, criterion, evaluator)
        message = "test: epoch %d | step %d | " % (epoch, global_step)
        for key, value in test_result.items():
            message += " | %s %.6f" % (key, value)
            if dist.get_rank() == 0:
                writer.add_scalar("test_%s" % key, value, global_step)
        log.info(message)

        if best_valid < valid_result[config.metrics]:
            best_valid = valid_result[config.metrics]
            best_valid_result = valid_result
            best_test_result = test_result

        message = "best result: epoch %d | " % (epoch)
        message += "valid %s: %.6f | " % (config.metrics,
                                          best_valid_result[config.metrics])
        message += "test %s: %.6f | " % (config.metrics,
                                         best_test_result[config.metrics])
        log.info(message)

    message = "final eval best result:%.6f" % best_valid_result[config.metrics]
    log.info(message)
    message = "final test best result:%.6f" % best_test_result[config.metrics]
    log.info(message)
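
evaluate is not shown in this excerpt; a hedged sketch of what it plausibly does (collect predictions and labels over a loader and hand them to the OGB Evaluator), written against the same Paddle/PGL calls used above:

import numpy as np
import paddle

def evaluate(model, loader, criterion, evaluator):
    model.eval()
    y_true, y_pred = [], []
    with paddle.no_grad():
        for g, mh_graphs, labels, unmask in loader:
            g = g.tensor()
            g.multi_hop_graphs = [item.tensor() for item in mh_graphs]
            pred = model(g).numpy()
            labels = labels.astype("float32")
            labels[~unmask] = np.nan   # restore missing labels so the evaluator skips them
            y_pred.append(pred)
            y_true.append(labels)
    model.train()
    return evaluator.eval({"y_true": np.concatenate(y_true),
                           "y_pred": np.concatenate(y_pred)})
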
Example #11
        batch_valid = (labels == labels).astype("bool")
        labels = np.nan_to_num(labels).astype("float32")

        g = pgl.Graph.batch(graph_list)
        multihop_graphs = []
        for g_list in multihop_graph_list:
            multihop_graphs.append(pgl.Graph.batch(g_list))

        return g, multihop_graphs, labels, batch_valid


if __name__ == "__main__":
    config = prepare_config("pcba_config.yaml", isCreate=False, isSave=False)
    raw_dataset = GraphPropPredDataset(name=config.dataset_name)
    ds = MolDataset(config, raw_dataset, transform=make_multihop_edges)
    splitted_index = raw_dataset.get_idx_split()
    train_ds = Subset(ds, splitted_index['train'], mode='train')
    valid_ds = Subset(ds, splitted_index['valid'], mode="valid")
    test_ds = Subset(ds, splitted_index['test'], mode="test")

    Fn = CollateFn(config)
    loader = Dataloader(train_ds,
                        batch_size=3,
                        shuffle=False,
                        num_workers=4,
                        collate_fn=Fn)
    for batch_data in loader:
        print("batch", batch_data[0][0].node_feat)
        g = pgl.Graph.batch(batch_data[0])
        print(g.node_feat)
        time.sleep(3)
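
The first two lines of this excerpt implement the usual trick for multi-task molecular labels: NaN marks an unmeasured task, and labels == labels is False exactly at those positions. The same logic in isolation:

import numpy as np

labels = np.array([[1.0, np.nan, 0.0]], dtype="float32")
batch_valid = (labels == labels).astype("bool")   # False where the label is missing
labels = np.nan_to_num(labels).astype("float32")  # NaN -> 0 so downstream tensors stay finite
print(batch_valid)   # [[ True False  True]]
print(labels)        # [[1. 0. 0.]]
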