def main(dataset_name):
    dataset = GraphPropPredDataset(name=dataset_name)

    df_smi = pd.read_csv(f"dataset/{dataset_name}/mapping/mol.csv.gz".replace(
        "-", "_"))
    smiles = df_smi["smiles"]

    mgf_feat_list = []
    maccs_feat_list = []
    for ii in tqdm(range(len(smiles))):
        rdkit_mol = AllChem.MolFromSmiles(smiles.iloc[ii])

        mgf = getmorganfingerprint(rdkit_mol)
        mgf_feat_list.append(mgf)

        maccs = getmaccsfingerprint(rdkit_mol)
        maccs_feat_list.append(maccs)

    mgf_feat = np.array(mgf_feat_list, dtype="int64")
    maccs_feat = np.array(maccs_feat_list, dtype="int64")
    print("morgan feature shape: ", mgf_feat.shape)
    print("maccs feature shape: ", maccs_feat.shape)

    save_path = f"./dataset/{dataset_name}".replace("-", "_")
    print("saving feature in %s" % save_path)
    np.save(os.path.join(save_path, "mgf_feat.npy"), mgf_feat)
    np.save(os.path.join(save_path, "maccs_feat.npy"), maccs_feat)
Example #2
def infer(args):
    log.info("loading data")
    raw_dataset = GraphPropPredDataset(name=args.dataset_name)
    args.num_class = raw_dataset.num_tasks
    args.eval_metric = raw_dataset.eval_metric
    args.task_type = raw_dataset.task_type

    test_ds = MolDataset(args, raw_dataset, mode="test")

    fn = MgfCollateFn(args, mode="test")

    test_loader = Dataloader(test_ds,
                             batch_size=args.batch_size,
                             num_workers=1,
                             collate_fn=fn)
    test_loader = PDataset.from_generator_func(test_loader)

    est = propeller.Learner(MgfModel, args, args.model_config)

    mgf_list = []
    for soft_mgf in est.predict(test_loader,
                                ckpt_path=args.model_path_for_infer,
                                split_batch=True):
        mgf_list.append(soft_mgf)

    mgf = np.concatenate(mgf_list)
    log.info("saving features")
    np.save(
        "dataset/%s/soft_mgf_feat.npy" % (args.dataset_name.replace("-", "_")),
        mgf)
Example #3
 def __init__(self, path: str):
     ogbl_dataset = GraphPropPredDataset("ogbg-code2", path)  # ogbg-code2 provides the node_is_attributed / node_dfs_order / node_depth fields used below
     idx_split: _typing.Mapping[str,
                                np.ndarray] = ogbl_dataset.get_idx_split()
     train_index: _typing.Any = idx_split['train'].tolist()
     test_index: _typing.Any = idx_split['test'].tolist()
     val_index: _typing.Any = idx_split['valid'].tolist()
     super(OGBGCode2Dataset, self).__init__([
         GeneralStaticGraphGenerator.create_homogeneous_static_graph(
             ({
                 "feat": torch.from_numpy(data["node_feat"]),
                 "node_is_attributed": torch.from_numpy(data["node_is_attributed"]),
                 "node_dfs_order": torch.from_numpy(data["node_dfs_order"]),
                 "node_depth": torch.from_numpy(data["node_depth"]),
             } if _backend.DependentBackend.is_dgl() else {
                 "x": torch.from_numpy(data["node_feat"]),
                 "node_is_attributed": torch.from_numpy(data["node_is_attributed"]),
                 "node_dfs_order": torch.from_numpy(data["node_dfs_order"]),
                 "node_depth": torch.from_numpy(data["node_depth"]),
             }),
             torch.from_numpy(data["edge_index"]))
         for data, label in ogbl_dataset
     ], train_index, val_index, test_index)
Example #4
 def __init__(self, path: str):
     ogbl_dataset = GraphPropPredDataset("ogbg-ppa", path)  # ogbg-ppa matches the OGBGPPADataset wrapper built below
     idx_split: _typing.Mapping[str,
                                np.ndarray] = ogbl_dataset.get_idx_split()
     train_index: _typing.Any = idx_split['train'].tolist()
     test_index: _typing.Any = idx_split['test'].tolist()
     val_index: _typing.Any = idx_split['valid'].tolist()
     super(OGBGPPADataset, self).__init__([
         GeneralStaticGraphGenerator.create_homogeneous_static_graph(
             {'_NID': torch.arange(data['num_nodes'])},
             torch.from_numpy(data['edge_index']),
             {'edge_feat': torch.from_numpy(data['edge_feat'])},
             ({
                 'label': torch.from_numpy(label)
             } if _backend.DependentBackend.is_dgl() else {
                 'y': torch.from_numpy(label)
             })) for data, label in ogbl_dataset
     ], train_index, val_index, test_index)
Example #5
    def __init__(self, args):
        self.args = args
        self.raw_dataset = GraphPropPredDataset(name=args.dataset_name)
        self.num_tasks = self.raw_dataset.num_tasks
        self.eval_metrics = self.raw_dataset.eval_metric
        self.task_type = self.raw_dataset.task_type

        self.pgl_graph_list = []
        self.graph_label_list = []
        for i in range(len(self.raw_dataset)):
            graph, label = self.raw_dataset[i]
            edges = list(zip(graph["edge_index"][0], graph["edge_index"][1]))
            g = pgl.graph.Graph(num_nodes=graph["num_nodes"], edges=edges)

            if graph["edge_feat"] is not None:
                g.edge_feat["feat"] = graph["edge_feat"]

            if graph["node_feat"] is not None:
                g.node_feat["feat"] = graph["node_feat"]

            self.pgl_graph_list.append(g)
            self.graph_label_list.append(label)
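For reference, each item yielded by GraphPropPredDataset (as iterated above) is a (graph, label) pair in which graph is a plain dict of numpy arrays; a small inspection sketch, with the dataset name chosen only for illustration:

from ogb.graphproppred import GraphPropPredDataset

dataset = GraphPropPredDataset(name="ogbg-molhiv")
graph, label = dataset[0]
print(sorted(graph.keys()))        # expect 'edge_feat', 'edge_index', 'node_feat', 'num_nodes'
print(graph["edge_index"].shape)   # (2, num_edges)
print(graph["num_nodes"], label)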
Example #6
    def __init__(self, root, name):
        super(OGBGDataset, self).__init__(root)
        self.name = name
        self.dataset = GraphPropPredDataset(self.name, root)

        self.graphs = []
        self.all_nodes = 0
        self.all_edges = 0
        for i in range(len(self.dataset.graphs)):
            graph, label = self.dataset[i]
            data = Graph(
                x=torch.tensor(graph["node_feat"], dtype=torch.float),
                edge_index=torch.tensor(graph["edge_index"]),
                edge_attr=None if graph.get("edge_feat") is None
                else torch.tensor(graph["edge_feat"], dtype=torch.float),
                y=torch.tensor(label),
            )
            data.num_nodes = graph["num_nodes"]
            self.graphs.append(data)

            self.all_nodes += graph["num_nodes"]
            self.all_edges += graph["edge_index"].shape[1]

        self.transform = None
Example #7
from ogb.graphproppred import GraphPropPredDataset
from spektral.data import DisjointLoader
from spektral.datasets import OGB
from spektral.layers import ECCConv, GlobalSumPool

################################################################################
# Config
################################################################################
learning_rate = 1e-3  # Learning rate
epochs = 10  # Number of training epochs
batch_size = 32  # Batch size

################################################################################
# Load data
################################################################################
dataset_name = "ogbg-molhiv"
ogb_dataset = GraphPropPredDataset(name=dataset_name)
dataset = OGB(ogb_dataset)

# Parameters
F = dataset.n_node_features  # Dimension of node features
S = dataset.n_edge_features  # Dimension of edge features
n_out = dataset.n_labels  # Dimension of the target

# Train/test split
idx = ogb_dataset.get_idx_split()
idx_tr, idx_va, idx_te = idx["train"], idx["valid"], idx["test"]
dataset_tr = dataset[idx_tr]
dataset_va = dataset[idx_va]
dataset_te = dataset[idx_te]

loader_tr = DisjointLoader(dataset_tr, batch_size=batch_size, epochs=epochs)
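The snippet ends after building the training loader. As a rough continuation, a minimal model that consumes these disjoint batches might look as follows; the layer widths, activations and the sigmoid head are assumptions (ECC-style message passing with a global sum pooling readout), not part of the code above:

import tensorflow as tf
from tensorflow.keras.layers import Dense, Input
from tensorflow.keras.models import Model

# Disjoint-mode inputs: node features, sparse adjacency, edge features, graph index
X_in = Input(shape=(F,))
A_in = Input(shape=(None,), sparse=True)
E_in = Input(shape=(S,))
I_in = Input(shape=(), dtype=tf.int64)

X_1 = ECCConv(32, activation="relu")([X_in, A_in, E_in])
X_2 = ECCConv(32, activation="relu")([X_1, A_in, E_in])
X_3 = GlobalSumPool()([X_2, I_in])
output = Dense(n_out, activation="sigmoid")(X_3)

model = Model(inputs=[X_in, A_in, E_in, I_in], outputs=output)
model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate), loss="binary_crossentropy")
# model.fit(loader_tr.load(), steps_per_epoch=loader_tr.steps_per_epoch, epochs=epochs)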
Example #8
def main(_):
    tf.keras.mixed_precision.set_global_policy("float16" if FLAGS.dtype == 'float16' else "float32")

    dset_name = 'ogbg-molhiv'
    dataset = GraphPropPredDataset(name=dset_name)
    split_idx = dataset.get_idx_split()
    train_idx, valid_idx, test_idx = split_idx["train"], split_idx["valid"], split_idx["test"]

    ds = data.get_tf_dataset(FLAGS.batch_size, [dataset[idx] for idx in train_idx], shuffle=True)
    val_ds = data.get_tf_dataset(FLAGS.batch_size, [dataset[idx] for idx in valid_idx], shuffle=False)
    strategy = xpu.configure_and_get_strategy()

    if FLAGS.total_batch_size is not None:
        gradient_accumulation_factor = FLAGS.total_batch_size // FLAGS.batch_size
    else:
        gradient_accumulation_factor = 1

    # pre-calculated number of steps per epoch (note: will vary somewhat for training, due to packing,
    #  but is found to be fairly consistent)
    steps = {
        32: (1195, 162, 148),
        64: (585, 80, 73),
        128: (288, 40, 37),
        256: (143, 20, 18)
    }
    try:
        steps_per_epoch, val_steps_per_epoch, test_steps_per_epoch = steps[FLAGS.batch_size]
    except KeyError:
        raise KeyError(
            f"No pre-computed step counts defined for batch size {FLAGS.batch_size}; "
            f"known batch sizes: {sorted(steps)}")

    # need the steps per epoch to be divisible by the gradient accumulation factor
    steps_per_epoch = gradient_accumulation_factor * (steps_per_epoch // gradient_accumulation_factor)

    # linear learning-rate scaling with batch size, relative to a base batch size of 128
    batch_size = FLAGS.total_batch_size or FLAGS.batch_size
    lr = FLAGS.lr * batch_size / 128

    with strategy.scope():
        model = create_model()
        utils.print_trainable_variables(model)

        losses = tf.keras.losses.BinaryCrossentropy()
        if FLAGS.opt.lower() == 'sgd':
            opt = tf.keras.optimizers.SGD(learning_rate=lr)
        elif FLAGS.opt.lower() == 'adam':
            opt = tf.keras.optimizers.Adam(learning_rate=lr)
        else:
            raise NotImplementedError()

        callbacks = []

        if not os.path.isdir(FLAGS.model_dir):
            os.makedirs(FLAGS.model_dir)
        # randomly named directory
        model_dir = os.path.join(FLAGS.model_dir, str(uuid.uuid4()))

        print(f"Saving weights to {model_dir}")
        model_path = os.path.join(model_dir, 'model')

        callbacks.append(tf.keras.callbacks.ModelCheckpoint(
            model_path, monitor="val_loss", verbose=1, save_best_only=True,
            save_weights_only=True, mode="min", save_freq="epoch")
        )

        callbacks.append(ThroughputCallback(
            samples_per_epoch=steps_per_epoch * FLAGS.batch_size * gradient_accumulation_factor))
        if FLAGS.reduce_lr_on_plateau_patience > 0:
            callbacks.append(tf.keras.callbacks.ReduceLROnPlateau(
                monitor='val_loss', mode='min', factor=FLAGS.reduce_lr_on_plateau_factor,
                patience=FLAGS.reduce_lr_on_plateau_patience, min_lr=1e-8, verbose=1)
            )

        if FLAGS.early_stopping_patience > 0:
            print(f"Training will stop early after {FLAGS.early_stopping_patience} epochs without improvement.")
            callbacks.append(
                tf.keras.callbacks.EarlyStopping(
                    monitor='val_loss', min_delta=0, patience=FLAGS.early_stopping_patience,
                    verbose=1, mode='min', baseline=None, restore_best_weights=False)
            )

        # weighted metrics are used because of the batch packing
        model.compile(optimizer=opt, loss=losses,
                      weighted_metrics=[tf.keras.metrics.BinaryAccuracy(), tf.keras.metrics.AUC()],
                      steps_per_execution=steps_per_epoch)

        # if the total batch size exceeds the compute batch size
        model.set_gradient_accumulation_options(gradient_accumulation_steps_per_replica=gradient_accumulation_factor)

        model.fit(ds,
                  steps_per_epoch=steps_per_epoch,
                  epochs=FLAGS.epochs,
                  validation_data=val_ds,
                  validation_steps=val_steps_per_epoch,
                  callbacks=callbacks
                  )

        # we will use the official AUC evaluator from the OGB repo, not the keras one
        model.load_weights(model_path)
        print("Loaded best validation weights for evaluation")

        evaluator = Evaluator(name='ogbg-molhiv')
        for test_or_val, idx, steps in zip(
                ('validation', 'test'),
                (valid_idx, test_idx),
                (val_steps_per_epoch, test_steps_per_epoch)):
            prediction, ground_truth = get_predictions(model, dataset, idx, steps)
            result = evaluator.eval({'y_true': ground_truth[:, None], 'y_pred': prediction[:, None]})

            print(f'Final {test_or_val} ROC-AUC {result["rocauc"]:.3f}')
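The evaluator call above depends on the input shapes OGB expects for y_true and y_pred; a quick, optional way to check them is to print the evaluator's own format strings:

from ogb.graphproppred import Evaluator

evaluator = Evaluator(name="ogbg-molhiv")
print(evaluator.expected_input_format)   # describes the required y_true / y_pred shapes
print(evaluator.expected_output_format)  # describes the returned metric dict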
Example #9
def main(args):
    all_probs = {}
    all_ap = {}
    all_rocs = {}
    train_label_props = {}

    n_estimators = 1000
    max_tasks = None
    run_times = 10

    eval_scores = []
    test_scores = []

    mgf_file = "./dataset/%s/mgf_feat.npy" % (args.dataset_name.replace("-", "_"))
    soft_mgf_file = "./dataset/%s/soft_mgf_feat.npy" % (args.dataset_name.replace("-", "_"))
    maccs_file = "./dataset/%s/maccs_feat.npy" % (args.dataset_name.replace("-", "_"))
    mgf_feat = np.load(mgf_file)
    soft_mgf_feat = np.load(soft_mgf_file)
    maccs_feat = np.load(maccs_file)
    mgf_dim = mgf_feat.shape[1]
    maccs_dim = maccs_feat.shape[1]

    dataset = GraphPropPredDataset(name=args.dataset_name)
    smiles_file = "dataset/%s/mapping/mol.csv.gz" % (args.dataset_name.replace("-", "_"))
    df_smi = pd.read_csv(smiles_file)
    smiles = df_smi["smiles"]
    outcomes = df_smi.set_index("smiles").drop(["mol_id"], axis=1)

    feat = np.concatenate([mgf_feat, soft_mgf_feat, maccs_feat], axis=1)
    X = pd.DataFrame(feat, index=smiles, columns=list(range(feat.shape[1])))

    # Split into train/val/test
    split_idx = dataset.get_idx_split()
    train_idx, val_idx, test_idx = split_idx["train"], split_idx["valid"], split_idx["test"]

    X_train, X_val, X_test = X.iloc[train_idx], X.iloc[val_idx], X.iloc[test_idx]

    for rep in range(run_times):
        for oo in tqdm(outcomes.columns[:max_tasks]):
            # Get probabilities
            val_key = args.dataset_name, oo, rep, "val"
            test_key = args.dataset_name, oo, rep, "test"

            # If re-running, skip finished runs
            if val_key in all_probs:
                print("Skipping", val_key[:-1])
                continue

            # Split outcome into train/val/test
            Y = outcomes[oo]
            y_train, y_val, y_test = Y.loc[X_train.index], Y.loc[X_val.index], Y.loc[X_test.index]

            # Skip outcomes with no positive training examples
            if y_train.sum() == 0:
                continue

            # Remove missing labels in validation and test
            y_val, y_test = y_val.dropna(), y_test.dropna()
            X_v, X_t = X_val.loc[y_val.index], X_test.loc[y_test.index]
            
            # Remove missing values in the training labels
            y_tr = y_train.dropna()
            train_label_props[args.dataset_name, oo, rep] = y_tr.mean()
            print(f"Training label balance:\n{y_tr.value_counts()}")

            # Fit model
            print("Fitting model...")
            rf = RandomForestClassifier(min_samples_leaf=2,
                    n_estimators=n_estimators,
                    n_jobs=-1,
                    criterion='entropy',
                    class_weight={0:1, 1:10}
                    )
            rf.fit(X_train.loc[y_tr.index], y_tr)

            # Calculate probabilities
            all_probs[val_key] = pd.Series(rf.predict_proba(X_v)[:, 1], index=X_v.index)
            all_probs[test_key] = pd.Series(rf.predict_proba(X_t)[:, 1], index=X_t.index)

            if y_val.sum() > 0:
                all_ap[val_key] = average_precision_score(y_val, all_probs[val_key])
                all_rocs[val_key] = roc_auc_score(y_val, all_probs[val_key])
            
            if y_test.sum() > 0:
                all_ap[test_key] = average_precision_score(y_test, all_probs[test_key])
                all_rocs[test_key] = roc_auc_score(y_test, all_probs[test_key])

            print(f'{oo}, rep {rep}, AP (val, test): {all_ap.get(val_key, np.nan):.3f}, {all_ap.get(test_key, np.nan):.3f}')
            print(f'\tROC (val, test): {all_rocs.get(val_key, np.nan):.3f}, {all_rocs.get(test_key, np.nan):.3f}')
            eval_scores.append(all_rocs.get(val_key, np.nan))
            test_scores.append(all_rocs.get(test_key, np.nan))

    eval_avg = np.mean(eval_scores)
    eval_std = np.std(eval_scores, ddof=1)
    test_avg = np.mean(test_scores)
    test_std = np.std(test_scores, ddof=1)
    print("eval: ", eval_scores)
    print("test: ", test_scores)
    print("%s | eval and test: %.4f (%.4f),%.4f (%.4f)" % (args.dataset_name, eval_avg, eval_std, test_avg, test_std))
Example #10
            worker = mp_reader.multiprocess_reader(worker_pool,
                                                   use_pipe=True,
                                                   queue_size=1000)
            r = paddle.reader.buffered(worker, self.buf_size)

        for batch in r():
            yield batch

    def scan(self):
        """scan"""
        for example in self.dataset:
            yield example


if __name__ == "__main__":
    from base_dataset import BaseDataset, Subset
    dataset = GraphPropPredDataset(name="ogbg-molhiv")
    splitted_index = dataset.get_idx_split()
    train_dataset = Subset(dataset, splitted_index['train'])
    valid_dataset = Subset(dataset, splitted_index['valid'])
    test_dataset = Subset(dataset, splitted_index['test'])
    log.info("Train Examples: %s" % len(train_dataset))
    log.info("Val Examples: %s" % len(valid_dataset))
    log.info("Test Examples: %s" % len(test_dataset))

    #  train_loader = GraphDataloader(train_dataset, batch_size=3)
    #  for batch_data in train_loader:
    #      graphs, labels = batch_data
    #      print(labels.shape)
    #      time.sleep(4)
Example #11
        return x_embedding


class BondEncoder(nn.Layer):
    def __init__(self, emb_dim):
        super(BondEncoder, self).__init__()

        self.bond_embedding_list = nn.LayerList()

        for i, dim in enumerate(full_bond_feature_dims):
            weight_attr = nn.initializer.XavierUniform()
            emb = paddle.nn.Embedding(dim, emb_dim, weight_attr=weight_attr)
            self.bond_embedding_list.append(emb)

    def forward(self, edge_attr):
        bond_embedding = 0
        for i in range(edge_attr.shape[1]):
            bond_embedding += self.bond_embedding_list[i](edge_attr[:, i])

        return bond_embedding


if __name__ == '__main__':
    from ogb.graphproppred import GraphPropPredDataset
    dataset = GraphPropPredDataset(name='ogbg-molpcba')
    atom_enc = AtomEncoder(100)
    bond_enc = BondEncoder(100)

    # the library-agnostic GraphPropPredDataset yields (graph_dict, label) pairs,
    # so unpack the dict rather than accessing .x / .edge_attr
    graph, label = dataset[0]
    print(atom_enc(paddle.to_tensor(graph["node_feat"])))
    print(bond_enc(paddle.to_tensor(graph["edge_feat"])))
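full_bond_feature_dims (and the matching full_atom_feature_dims used by AtomEncoder) is not defined in this snippet; in OGB it is normally obtained from ogb.utils.features, along the lines of:

from ogb.utils.features import get_atom_feature_dims, get_bond_feature_dims

full_atom_feature_dims = get_atom_feature_dims()
full_bond_feature_dims = get_bond_feature_dims()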
Example #12
def test_datasetsaver():
    # round-trip test for DatasetSaver; set test_task to choose which dataset
    # family to exercise: 'graph', 'node', 'link', 'heteronode' or 'heterolink'

    test_task = 'link'

    # check that all of the dataset objects load and split correctly
    if test_task == 'graph':
        from ogb.graphproppred import PygGraphPropPredDataset, DglGraphPropPredDataset, GraphPropPredDataset
        dataset_name = 'ogbg-molhiv'
        dataset = PygGraphPropPredDataset(dataset_name)
        dataset.get_idx_split()
        dataset = DglGraphPropPredDataset(dataset_name)
        dataset.get_idx_split()
        dataset = GraphPropPredDataset(dataset_name)
        dataset.get_idx_split()
    elif test_task == 'node':
        from ogb.nodeproppred import NodePropPredDataset, PygNodePropPredDataset, DglNodePropPredDataset
        dataset_name = 'ogbn-arxiv'  # test ogbn-proteins
        dataset = PygNodePropPredDataset(dataset_name)
        dataset.get_idx_split()
        dataset = DglNodePropPredDataset(dataset_name)
        dataset.get_idx_split()
        dataset = NodePropPredDataset(dataset_name)
        dataset.get_idx_split()
    elif test_task == 'link':
        from ogb.linkproppred import LinkPropPredDataset, PygLinkPropPredDataset, DglLinkPropPredDataset
        dataset_name = 'ogbl-collab'
        dataset = PygLinkPropPredDataset(dataset_name)
        dataset.get_edge_split()
        dataset = DglLinkPropPredDataset(dataset_name)
        dataset.get_edge_split()
        dataset = LinkPropPredDataset(dataset_name)
        dataset.get_edge_split()
    elif test_task == 'heteronode':
        from ogb.nodeproppred import NodePropPredDataset, PygNodePropPredDataset, DglNodePropPredDataset
        dataset_name = 'ogbn-mag'
        dataset = PygNodePropPredDataset(dataset_name)
        dataset.get_idx_split()
        dataset = DglNodePropPredDataset(dataset_name)
        dataset.get_idx_split()
        dataset = NodePropPredDataset(dataset_name)
        dataset.get_idx_split()
    elif test_task == 'heterolink':
        from ogb.linkproppred import LinkPropPredDataset, PygLinkPropPredDataset, DglLinkPropPredDataset
        dataset_name = 'ogbl-biokg'
        dataset = PygLinkPropPredDataset(dataset_name)
        dataset.get_edge_split()
        dataset = DglLinkPropPredDataset(dataset_name)
        dataset.get_edge_split()
        dataset = LinkPropPredDataset(dataset_name)
        dataset.get_edge_split()
    else:
        raise ValueError('Invalid task category')

    print(dataset[0])
    if 'link' in test_task:
        print(dataset.get_edge_split())
    else:
        print(dataset.get_idx_split())

    if 'graph' in test_task:
        graph_list = dataset.graphs
    else:
        graph_list = [dataset.graph]

    if 'link' not in test_task:
        labels = dataset.labels

    is_hetero = 'hetero' in test_task
    version = 2 if dataset_name == 'ogbn-mag' else 1
    saver = DatasetSaver(dataset_name, is_hetero, version=version)

    # saving graph objects
    saver.save_graph_list(graph_list)
    # saving target labels
    if 'link' not in test_task:
        saver.save_target_labels(labels)
    # saving split
    if 'link' in test_task:
        split_idx = dataset.get_edge_split()
    else:
        split_idx = dataset.get_idx_split()
    # second argument must be the name of the split
    saver.save_split(split_idx, dataset.meta_info['split'])
    # copying mapping dir
    # saver.copy_mapping_dir(f"dataset/{'_'.join(dataset_name.split('-'))}/mapping/")
    saver.copy_mapping_dir("dataset/{}/mapping/".format('_'.join(
        dataset_name.split('-'))))

    saver.save_task_info(
        dataset.task_type, dataset.eval_metric,
        dataset.num_classes if hasattr(dataset, 'num_classes') else None)

    meta_dict = saver.get_meta_dict()

    print(meta_dict)

    print('Now testing.')

    if 'graph' in test_task:
        print('library agnostic')
        dataset = GraphPropPredDataset(dataset_name, meta_dict=meta_dict)
        dataset = GraphPropPredDataset(dataset_name, meta_dict=meta_dict)
        print(dataset[0])
        print(dataset.get_idx_split())
        print('Pytorch Geometric')
        dataset = PygGraphPropPredDataset(dataset_name, meta_dict=meta_dict)
        dataset = PygGraphPropPredDataset(dataset_name, meta_dict=meta_dict)
        print(dataset[0])
        print(dataset.get_idx_split())
        print('DGL')
        dataset = DglGraphPropPredDataset(dataset_name, meta_dict=meta_dict)
        dataset = DglGraphPropPredDataset(dataset_name, meta_dict=meta_dict)
        print(dataset[0])
        print(dataset.get_idx_split())
    elif 'node' in test_task:
        print('library agnostic')
        dataset = NodePropPredDataset(dataset_name, meta_dict=meta_dict)
        dataset = NodePropPredDataset(dataset_name, meta_dict=meta_dict)
        print(dataset[0])
        print(dataset.get_idx_split())
        print('Pytorch Geometric')
        dataset = PygNodePropPredDataset(dataset_name, meta_dict=meta_dict)
        dataset = PygNodePropPredDataset(dataset_name, meta_dict=meta_dict)
        print(dataset[0])
        print(dataset.get_idx_split())
        print('DGL')
        dataset = DglNodePropPredDataset(dataset_name, meta_dict=meta_dict)
        dataset = DglNodePropPredDataset(dataset_name, meta_dict=meta_dict)
        print(dataset[0])
        print(dataset.get_idx_split())

    elif 'link' in test_task:
        print('library agnostic')
        dataset = LinkPropPredDataset(dataset_name, meta_dict=meta_dict)
        dataset = LinkPropPredDataset(dataset_name, meta_dict=meta_dict)
        print(dataset[0])
        # print(dataset.get_edge_split())
        print('Pytorch Geometric')
        dataset = PygLinkPropPredDataset(dataset_name, meta_dict=meta_dict)
        dataset = PygLinkPropPredDataset(dataset_name, meta_dict=meta_dict)
        print(dataset[0])
        # print(dataset.get_edge_split())
        print('DGL')
        dataset = DglLinkPropPredDataset(dataset_name, meta_dict=meta_dict)
        dataset = DglLinkPropPredDataset(dataset_name, meta_dict=meta_dict)
        print(dataset[0])
        # print(dataset.get_edge_split())
    else:
        raise ValueError('Invalid task category')

    # zip
    saver.zip()
    print('Finished zipping!')

    saver.cleanup()
Example #13
def main(config):
    if dist.get_world_size() > 1:
        dist.init_parallel_env()

    if dist.get_rank() == 0:
        timestamp = datetime.now().strftime("%Hh%Mm%Ss")
        log_path = os.path.join(config.log_dir,
                                "tensorboard_log_%s" % timestamp)
        writer = SummaryWriter(log_path)

    log.info("loading data")
    raw_dataset = GraphPropPredDataset(name=config.dataset_name)
    config.num_class = raw_dataset.num_tasks
    config.eval_metric = raw_dataset.eval_metric
    config.task_type = raw_dataset.task_type

    mol_dataset = MolDataset(config,
                             raw_dataset,
                             transform=make_multihop_edges)
    splitted_index = raw_dataset.get_idx_split()
    train_ds = Subset(mol_dataset, splitted_index['train'], mode='train')
    valid_ds = Subset(mol_dataset, splitted_index['valid'], mode="valid")
    test_ds = Subset(mol_dataset, splitted_index['test'], mode="test")

    log.info("Train Examples: %s" % len(train_ds))
    log.info("Val Examples: %s" % len(valid_ds))
    log.info("Test Examples: %s" % len(test_ds))

    fn = CollateFn(config)

    train_loader = Dataloader(train_ds,
                              batch_size=config.batch_size,
                              shuffle=True,
                              num_workers=config.num_workers,
                              collate_fn=fn)

    valid_loader = Dataloader(valid_ds,
                              batch_size=config.batch_size,
                              num_workers=config.num_workers,
                              collate_fn=fn)

    test_loader = Dataloader(test_ds,
                             batch_size=config.batch_size,
                             num_workers=config.num_workers,
                             collate_fn=fn)

    model = ClassifierNetwork(config.hidden_size, config.out_dim,
                              config.num_layers, config.dropout_prob,
                              config.virt_node, config.K, config.conv_type,
                              config.appnp_hop, config.alpha)
    model = paddle.DataParallel(model)

    optim = Adam(learning_rate=config.lr, parameters=model.parameters())
    criterion = nn.loss.BCEWithLogitsLoss()

    evaluator = Evaluator(config.dataset_name)

    best_valid = 0

    global_step = 0
    for epoch in range(1, config.epochs + 1):
        model.train()
        for idx, batch_data in enumerate(train_loader):
            g, mh_graphs, labels, unmask = batch_data
            g = g.tensor()
            multihop_graphs = []
            for item in mh_graphs:
                multihop_graphs.append(item.tensor())
            g.multi_hop_graphs = multihop_graphs
            labels = paddle.to_tensor(labels)
            unmask = paddle.to_tensor(unmask)

            pred = model(g)
            pred = paddle.masked_select(pred, unmask)
            labels = paddle.masked_select(labels, unmask)
            train_loss = criterion(pred, labels)
            train_loss.backward()
            optim.step()
            optim.clear_grad()

            if global_step % 80 == 0:
                message = "train: epoch %d | step %d | " % (epoch, global_step)
                message += "loss %.6f" % (train_loss.numpy())
                log.info(message)
                if dist.get_rank() == 0:
                    writer.add_scalar("loss", train_loss.numpy(), global_step)
            global_step += 1

        valid_result = evaluate(model, valid_loader, criterion, evaluator)
        message = "valid: epoch %d | step %d | " % (epoch, global_step)
        for key, value in valid_result.items():
            message += " | %s %.6f" % (key, value)
            if dist.get_rank() == 0:
                writer.add_scalar("valid_%s" % key, value, global_step)
        log.info(message)

        test_result = evaluate(model, test_loader, criterion, evaluator)
        message = "test: epoch %d | step %d | " % (epoch, global_step)
        for key, value in test_result.items():
            message += " | %s %.6f" % (key, value)
            if dist.get_rank() == 0:
                writer.add_scalar("test_%s" % key, value, global_step)
        log.info(message)

        if best_valid < valid_result[config.metrics]:
            best_valid = valid_result[config.metrics]
            best_valid_result = valid_result
            best_test_result = test_result

        message = "best result: epoch %d | " % (epoch)
        message += "valid %s: %.6f | " % (config.metrics,
                                          best_valid_result[config.metrics])
        message += "test %s: %.6f | " % (config.metrics,
                                         best_test_result[config.metrics])
        log.info(message)

    message = "final eval best result:%.6f" % best_valid_result[config.metrics]
    log.info(message)
    message = "final test best result:%.6f" % best_test_result[config.metrics]
    log.info(message)
Example #14
def train(args, pretrained_model_config=None):
    log.info("loading data")
    raw_dataset = GraphPropPredDataset(name=args.dataset_name)
    args.num_class = raw_dataset.num_tasks
    args.eval_metric = raw_dataset.eval_metric
    args.task_type = raw_dataset.task_type

    train_ds = MolDataset(args, raw_dataset)

    args.eval_steps = math.ceil(len(train_ds) / args.batch_size)
    log.info("Total %s steps (eval_steps) every epoch." % (args.eval_steps))

    fn = MgfCollateFn(args)

    train_loader = Dataloader(train_ds,
                              batch_size=args.batch_size,
                              num_workers=args.num_workers,
                              shuffle=args.shuffle,
                              stream_shuffle_size=args.shuffle_size,
                              collate_fn=fn)

    # for evaluating
    eval_train_loader = train_loader
    eval_train_loader = PDataset.from_generator_func(eval_train_loader)

    train_loader = multi_epoch_dataloader(train_loader, args.epochs)
    train_loader = PDataset.from_generator_func(train_loader)

    if args.warm_start_from is not None:
        # warm start setting
        def _fn(v):
            if not isinstance(v, F.framework.Parameter):
                return False
            if os.path.exists(os.path.join(args.warm_start_from, v.name)):
                return True
            else:
                return False

        ws = propeller.WarmStartSetting(predicate_fn=_fn,
                                        from_dir=args.warm_start_from)
    else:
        ws = None

    def cmp_fn(old, new):
        if old['eval'][args.metrics] - new['eval'][args.metrics] > 0:
            log.info("best %s eval result: %s" % (args.metrics, new['eval']))
            return True
        else:
            return False

    if args.log_id is not None:
        save_best_model = int(args.log_id) == 5
    else:
        save_best_model = True
    best_exporter = propeller.exporter.BestResultExporter(
        args.output_dir, (cmp_fn, save_best_model))

    eval_datasets = {"eval": eval_train_loader}

    propeller.train.train_and_eval(
        model_class_or_model_fn=MgfModel,
        params=pretrained_model_config,
        run_config=args,
        train_dataset=train_loader,
        eval_dataset=eval_datasets,
        warm_start_setting=ws,
        exporters=[best_exporter],
    )
Example #15
        labels = np.array(labels)
        batch_valid = (labels == labels).astype("bool")
        labels = np.nan_to_num(labels).astype("float32")

        g = pgl.Graph.batch(graph_list)
        multihop_graphs = []
        for g_list in multihop_graph_list:
            multihop_graphs.append(pgl.Graph.batch(g_list))

        return g, multihop_graphs, labels, batch_valid


if __name__ == "__main__":
    config = prepare_config("pcba_config.yaml", isCreate=False, isSave=False)
    raw_dataset = GraphPropPredDataset(name=config.dataset_name)
    ds = MolDataset(config, raw_dataset, transform=make_multihop_edges)
    splitted_index = raw_dataset.get_idx_split()
    train_ds = Subset(ds, splitted_index['train'], mode='train')
    valid_ds = Subset(ds, splitted_index['valid'], mode="valid")
    test_ds = Subset(ds, splitted_index['test'], mode="test")

    Fn = CollateFn(config)
    loader = Dataloader(train_ds,
                        batch_size=3,
                        shuffle=False,
                        num_workers=4,
                        collate_fn=Fn)
    for batch_data in loader:
        print("batch", batch_data[0][0].node_feat)
        g = pgl.Graph.batch(batch_data[0])