def __init__(self, path: str):
    """Load the ogbg-code2 graph property prediction dataset.

    BUG FIX: the OGB dataset name was hard-coded as "ogbg-molhiv", yet this
    class is OGBGCode2Dataset and reads code2-only node fields
    ("node_is_attributed", "node_dfs_order", "node_depth") that do not exist
    in ogbg-molhiv; load "ogbg-code2" instead.

    :param path: root directory where OGB stores/downloads the dataset.
    """
    ogb_dataset = GraphPropPredDataset("ogbg-code2", path)
    idx_split: _typing.Mapping[str, np.ndarray] = ogb_dataset.get_idx_split()
    train_index: _typing.Any = idx_split['train'].tolist()
    test_index: _typing.Any = idx_split['test'].tolist()
    val_index: _typing.Any = idx_split['valid'].tolist()

    # The two backends only differ in the node-feature key ("feat" for DGL,
    # "x" otherwise); build the dict once instead of duplicating it.
    feat_key = "feat" if _backend.DependentBackend.is_dgl() else "x"
    graphs = [
        GeneralStaticGraphGenerator.create_homogeneous_static_graph(
            {
                feat_key: torch.from_numpy(data['node_feat']),
                "node_is_attributed": torch.from_numpy(data["node_is_attributed"]),
                "node_dfs_order": torch.from_numpy(data["node_dfs_order"]),
                "node_depth": torch.from_numpy(data["node_depth"]),
            },
            torch.from_numpy(data['edge_index']),
        )
        for data, label in ogb_dataset
    ]
    super(OGBGCode2Dataset, self).__init__(graphs, train_index, val_index, test_index)
class Dataset(BaseDataset):
    """PGL wrapper around an OGB graph-property-prediction dataset.

    Converts every OGB graph dict into a ``pgl.graph.Graph`` up front so that
    ``__getitem__`` is a cheap list lookup afterwards.
    """

    def __init__(self, args):
        """Materialize all graphs and labels from ``args.dataset_name``."""
        self.args = args
        self.raw_dataset = GraphPropPredDataset(name=args.dataset_name)
        self.num_tasks = self.raw_dataset.num_tasks
        self.eval_metrics = self.raw_dataset.eval_metric
        self.task_type = self.raw_dataset.task_type

        self.pgl_graph_list = []
        self.graph_label_list = []
        for graph, label in self.raw_dataset:
            edges = list(zip(graph["edge_index"][0], graph["edge_index"][1]))
            g = pgl.graph.Graph(num_nodes=graph["num_nodes"], edges=edges)
            # OGB sets "edge_feat"/"node_feat" to None for datasets without
            # those features, so check the value rather than the key.
            if graph["edge_feat"] is not None:
                g.edge_feat["feat"] = graph["edge_feat"]
            if graph["node_feat"] is not None:
                g.node_feat["feat"] = graph["node_feat"]
            self.pgl_graph_list.append(g)
            self.graph_label_list.append(label)

    def __getitem__(self, idx):
        """Return the ``(graph, label)`` pair at position ``idx``."""
        return self.pgl_graph_list[idx], self.graph_label_list[idx]

    def __len__(self):
        # BUG FIX: the original referenced the misspelled name ``slef`` and
        # raised NameError whenever len() was called on the dataset.
        return len(self.pgl_graph_list)

    def get_idx_split(self):
        """Expose the official OGB train/valid/test split indices."""
        return self.raw_dataset.get_idx_split()
class OGBGDataset(Dataset):
    """Dataset that materializes an OGB graph-classification set as ``Graph`` objects.

    All graphs are converted once at construction time; splits come from the
    official OGB index split via :meth:`get_loader`.
    """

    def __init__(self, root, name):
        """Download/load the OGB dataset ``name`` rooted at ``root`` and convert it."""
        super(OGBGDataset, self).__init__(root)
        self.name = name
        self.dataset = GraphPropPredDataset(self.name, root)

        self.graphs = []
        self.all_nodes = 0   # running total of nodes across all graphs
        self.all_edges = 0   # running total of edges across all graphs
        for i in range(len(self.dataset.graphs)):
            graph, label = self.dataset[i]
            # BUG FIX: OGB graph dicts always contain the "edge_feat" key but
            # set it to None when the dataset has no edge features; the old
            # membership test ('"edge_feat" not in graph') then passed None to
            # torch.tensor and crashed. Test the value instead.
            edge_feat = graph.get("edge_feat")
            data = Graph(
                x=torch.tensor(graph["node_feat"], dtype=torch.float),
                edge_index=torch.tensor(graph["edge_index"]),
                edge_attr=None if edge_feat is None else torch.tensor(edge_feat, dtype=torch.float),
                y=torch.tensor(label),
            )
            data.num_nodes = graph["num_nodes"]
            self.graphs.append(data)

            self.all_nodes += graph["num_nodes"]
            self.all_edges += graph["edge_index"].shape[1]
        self.transform = None

    def get_loader(self, args):
        """Build train/valid/test DataLoaders from the official OGB split."""
        split_index = self.dataset.get_idx_split()
        train_loader = DataLoader(self.get_subset(split_index["train"]),
                                  batch_size=args.batch_size, shuffle=True)
        valid_loader = DataLoader(self.get_subset(split_index["valid"]),
                                  batch_size=args.batch_size, shuffle=False)
        test_loader = DataLoader(self.get_subset(split_index["test"]),
                                 batch_size=args.batch_size, shuffle=False)
        return train_loader, valid_loader, test_loader

    def get_subset(self, subset):
        """Return the graphs at the given indices as a plain list."""
        return [self.graphs[idx] for idx in subset]

    def get(self, idx):
        """Return the graph at position ``idx``."""
        return self.graphs[idx]

    def _download(self):
        # Download is delegated to GraphPropPredDataset in __init__.
        pass

    def _process(self):
        # Processing is delegated to GraphPropPredDataset in __init__.
        pass

    @property
    def num_classes(self):
        """Number of target classes reported by the underlying OGB dataset."""
        return int(self.dataset.num_classes)
def __init__(self, path: str):
    """Load the ogbg-ppa graph property prediction dataset.

    BUG FIX: the OGB dataset name was hard-coded as "ogbg-molhiv", yet this
    class is OGBGPPADataset and matches the ogbg-ppa layout (no node features
    beyond node IDs, an "edge_feat" array on every graph); load "ogbg-ppa"
    instead.

    :param path: root directory where OGB stores/downloads the dataset.
    """
    ogb_dataset = GraphPropPredDataset("ogbg-ppa", path)
    idx_split: _typing.Mapping[str, np.ndarray] = ogb_dataset.get_idx_split()
    train_index: _typing.Any = idx_split['train'].tolist()
    test_index: _typing.Any = idx_split['test'].tolist()
    val_index: _typing.Any = idx_split['valid'].tolist()

    # The two backends only differ in the graph-label key ("label" for DGL,
    # "y" otherwise); pick it once instead of duplicating the dict per graph.
    label_key = "label" if _backend.DependentBackend.is_dgl() else "y"
    graphs = [
        GeneralStaticGraphGenerator.create_homogeneous_static_graph(
            {'_NID': torch.arange(data['num_nodes'])},
            torch.from_numpy(data['edge_index']),
            {'edge_feat': torch.from_numpy(data['edge_feat'])},
            {label_key: torch.from_numpy(label)},
        )
        for data, label in ogb_dataset
    ]
    super(OGBGPPADataset, self).__init__(graphs, train_index, val_index, test_index)
batch_size = 32 # Batch size ################################################################################ # Load data ################################################################################ dataset_name = "ogbg-molhiv" ogb_dataset = GraphPropPredDataset(name=dataset_name) dataset = OGB(ogb_dataset) # Parameters F = dataset.n_node_features # Dimension of node features S = dataset.n_edge_features # Dimension of edge features n_out = dataset.n_labels # Dimension of the target # Train/test split idx = ogb_dataset.get_idx_split() idx_tr, idx_va, idx_te = idx["train"], idx["valid"], idx["test"] dataset_tr = dataset[idx_tr] dataset_va = dataset[idx_va] dataset_te = dataset[idx_te] loader_tr = DisjointLoader(dataset_tr, batch_size=batch_size, epochs=epochs) loader_te = DisjointLoader(dataset_te, batch_size=batch_size, epochs=1) ################################################################################ # Build model ################################################################################ X_in = Input(shape=(F,)) A_in = Input(shape=(None,), sparse=True) E_in = Input(shape=(S,)) I_in = Input(shape=(), dtype=tf.int64)
def main(_):
    """Train and evaluate a model on ogbg-molhiv (TensorFlow/Keras pipeline).

    Loads the OGB split, builds tf.data pipelines, trains with optional
    gradient accumulation, checkpoints on best validation loss, then reports
    ROC-AUC on validation and test using the official OGB evaluator.
    The leading ``_`` parameter absorbs absl's positional argv argument.
    """
    # Optional half-precision policy, selected via the --dtype flag.
    tf.keras.mixed_precision.set_global_policy("float16" if FLAGS.dtype == 'float16' else "float32")

    dset_name = 'ogbg-molhiv'
    dataset = GraphPropPredDataset(name=dset_name, )
    split_idx = dataset.get_idx_split()
    train_idx, valid_idx, test_idx = split_idx["train"], split_idx["valid"], split_idx["test"]

    ds = data.get_tf_dataset(FLAGS.batch_size, [dataset[idx] for idx in train_idx], shuffle=True)
    val_ds = data.get_tf_dataset(FLAGS.batch_size, [dataset[idx] for idx in valid_idx], shuffle=False)

    strategy = xpu.configure_and_get_strategy()

    # Gradient accumulation factor: how many compute batches make up one
    # effective (total) batch.
    if FLAGS.total_batch_size is not None:
        gradient_accumulation_factor = FLAGS.total_batch_size // FLAGS.batch_size
    else:
        gradient_accumulation_factor = 1

    # pre-calculated number of steps per epoch (note: will vary somewhat for training, due to packing,
    # but is found to be fairly consistent)
    # Mapping: batch_size -> (train steps, valid steps, test steps).
    steps = {
        32: (1195, 162, 148),
        64: (585, 80, 73),
        128: (288, 40, 37),
        256: (143, 20, 18)
    }
    try:
        steps_per_epoch, val_steps_per_epoch, test_steps_per_epoch = steps[FLAGS.batch_size]
    except KeyError:
        # NOTE(review): the message is printed, then a bare KeyError without
        # it is re-raised — the text is lost from the traceback.
        print("Batch size should have the number of steps defined")
        raise KeyError()

    # need the steps per epoch to be divisible by the gradient accumulation factor
    steps_per_epoch = gradient_accumulation_factor * (steps_per_epoch // gradient_accumulation_factor)

    # we apply a linear scaling rule for learning rate with batch size, which we benchmark against BS=128
    batch_size = FLAGS.total_batch_size or FLAGS.batch_size
    lr = FLAGS.lr * batch_size / 128

    with strategy.scope():
        model = create_model()
        utils.print_trainable_variables(model)

        losses = tf.keras.losses.BinaryCrossentropy()

        if FLAGS.opt.lower() == 'sgd':
            opt = tf.keras.optimizers.SGD(learning_rate=lr)
        elif FLAGS.opt.lower() == 'adam':
            opt = tf.keras.optimizers.Adam(learning_rate=lr)
        else:
            raise NotImplementedError()

        callbacks = []

        if not os.path.isdir(FLAGS.model_dir):
            os.makedirs(FLAGS.model_dir)
        # randomly named directory
        model_dir = os.path.join(FLAGS.model_dir, str(uuid.uuid4()))
        print(f"Saving weights to {model_dir}")
        model_path = os.path.join(model_dir, 'model')
        # Keep only the best-validation-loss weights.
        callbacks.append(tf.keras.callbacks.ModelCheckpoint(
            model_path, monitor="val_loss", verbose=1, save_best_only=True,
            save_weights_only=True, mode="min", save_freq="epoch")
        )
        callbacks.append(ThroughputCallback(
            samples_per_epoch=steps_per_epoch * FLAGS.batch_size * gradient_accumulation_factor))

        if FLAGS.reduce_lr_on_plateau_patience > 0:
            callbacks.append(tf.keras.callbacks.ReduceLROnPlateau(
                monitor='val_loss', mode='min', factor=FLAGS.reduce_lr_on_plateau_factor,
                patience=FLAGS.reduce_lr_on_plateau_patience, min_lr=1e-8, verbose=1)
            )

        if FLAGS.early_stopping_patience > 0:
            print(f"Training will stop early after {FLAGS.early_stopping_patience} epochs without improvement.")
            callbacks.append(
                tf.keras.callbacks.EarlyStopping(
                    monitor='val_loss', min_delta=0, patience=FLAGS.early_stopping_patience,
                    verbose=1, mode='min', baseline=None, restore_best_weights=False)
            )

        # weighted metrics are used because of the batch packing
        model.compile(optimizer=opt, loss=losses,
                      weighted_metrics=[tf.keras.metrics.BinaryAccuracy(), tf.keras.metrics.AUC()],
                      steps_per_execution=steps_per_epoch)

        # if the total batch size exceeds the compute batch size
        model.set_gradient_accumulation_options(
            gradient_accumulation_steps_per_replica=gradient_accumulation_factor)

        model.fit(ds,
                  steps_per_epoch=steps_per_epoch,
                  epochs=FLAGS.epochs,
                  validation_data=val_ds,
                  validation_steps=val_steps_per_epoch,
                  callbacks=callbacks
                  )

        # we will use the official AUC evaluator from the OGB repo, not the keras one
        model.load_weights(model_path)
        print("Loaded best validation weights for evaluation")
        evaluator = Evaluator(name='ogbg-molhiv')
        for test_or_val, idx, steps in zip(
                ('validation', 'test'),
                (valid_idx, test_idx),
                (val_steps_per_epoch, test_steps_per_epoch)):
            prediction, ground_truth = get_predictions(model, dataset, idx, steps)
            # Evaluator expects 2-D (n_samples, n_tasks) arrays.
            result = evaluator.eval({'y_true': ground_truth[:, None], 'y_pred': prediction[:, None]})
            print(f'Final {test_or_val} ROC-AUC {result["rocauc"]:.3f}')
################################################################################ # PARAMETERS ################################################################################ learning_rate = 1e-3 # Learning rate epochs = 10 # Number of training epochs batch_size = 32 # Batch size ################################################################################ # LOAD DATA ################################################################################ dataset_name = 'ogbg-molhiv' dataset = GraphPropPredDataset(name=dataset_name) n_out = dataset.num_tasks idx = dataset.get_idx_split() tr_idx, va_idx, te_idx = idx["train"], idx["valid"], idx["test"] X_tr, A_tr, E_tr, y_tr = ogb.dataset_to_numpy(dataset, tr_idx, dtype='f8') X_va, A_va, E_va, y_va = ogb.dataset_to_numpy(dataset, va_idx, dtype='f8') X_te, A_te, E_te, y_te = ogb.dataset_to_numpy(dataset, te_idx, dtype='f8') F = X_tr[0].shape[-1] S = E_tr[0].shape[-1] ################################################################################ # BUILD MODEL ################################################################################ X_in = Input(shape=(F, )) A_in = Input(shape=(None, ), sparse=True) E_in = Input(shape=(S, ))
def main(args):
    """Fit RandomForest baselines on precomputed molecular fingerprint features.

    Loads Morgan/soft-Morgan/MACCS fingerprint matrices from disk, joins them
    with the dataset's SMILES mapping, then for each outcome column and each
    of ``run_times`` repetitions trains a RandomForestClassifier and records
    validation/test average precision and ROC-AUC. Prints per-task and
    aggregate scores.
    """
    all_probs = {}          # (dataset, outcome, rep, split) -> predicted probabilities
    all_ap = {}             # same keys -> average precision
    all_rocs = {}           # same keys -> ROC-AUC
    train_label_props = {}  # (dataset, outcome, rep) -> positive-label proportion
    n_estimators = 1000
    max_tasks = None        # None = use all outcome columns
    run_times = 10          # number of repeated fits per outcome
    eval_scores = []
    test_scores = []

    # Precomputed fingerprint feature files, keyed by dataset directory name.
    mgf_file = "./dataset/%s/mgf_feat.npy" % (args.dataset_name.replace("-", "_"))
    soft_mgf_file = "./dataset/%s/soft_mgf_feat.npy" % (args.dataset_name.replace("-", "_"))
    maccs_file = "./dataset/%s/maccs_feat.npy" % (args.dataset_name.replace("-", "_"))
    mgf_feat = np.load(mgf_file)
    soft_mgf_feat = np.load(soft_mgf_file)
    maccs_feat = np.load(maccs_file)
    # NOTE(review): mgf_dim and maccs_dim are computed but never used below.
    mgf_dim = mgf_feat.shape[1]
    maccs_dim = maccs_feat.shape[1]

    dataset = GraphPropPredDataset(name=args.dataset_name)
    smiles_file = "dataset/%s/mapping/mol.csv.gz" % (args.dataset_name.replace("-", "_"))
    df_smi = pd.read_csv(smiles_file)
    smiles = df_smi["smiles"]
    # Outcome columns = everything except the SMILES index and mol_id.
    outcomes = df_smi.set_index("smiles").drop(["mol_id"], axis=1)

    # Feature matrix indexed by SMILES, columns are feature positions.
    feat = np.concatenate([mgf_feat, soft_mgf_feat, maccs_feat], axis=1)
    X = pd.DataFrame(feat, index=smiles, columns=[i for i in range(feat.shape[1])])

    # Split into train/val/test
    split_idx = dataset.get_idx_split()
    train_idx, val_idx, test_idx = split_idx["train"], split_idx["valid"], split_idx["test"]
    X_train, X_val, X_test = X.iloc[train_idx], X.iloc[val_idx], X.iloc[test_idx]

    for rep in range(run_times):
        for oo in tqdm(outcomes.columns[:max_tasks]):
            # Get probabilities
            val_key = args.dataset_name, oo, rep, "val"
            test_key = args.dataset_name, oo, rep, "test"
            # If re-running, skip finished runs
            if val_key in all_probs:
                print("Skipping", val_key[:-1])
                continue
            # Split outcome in to train/val/test
            Y = outcomes[oo]
            y_train, y_val, y_test = Y.loc[X_train.index], Y.loc[X_val.index], Y.loc[X_test.index]
            # Skip outcomes with no positive training examples
            if y_train.sum() == 0:
                continue
            # Remove missing labels in validation
            y_val, y_test = y_val.dropna(), y_test.dropna()
            X_v, X_t = X_val.loc[y_val.index], X_test.loc[y_test.index]
            # Remove missing values in the training labels, and downsample imbalance to cut runtime
            # NOTE(review): only dropna() happens here — no downsampling is
            # actually performed despite the comment above; confirm intent.
            y_tr = y_train.dropna()
            train_label_props[args.dataset_name, oo, rep] = y_tr.mean()
            print(f"Sampled label balance:\n{y_tr.value_counts()}")
            # Fit model
            print("Fitting model...")
            # class_weight up-weights the positive class to counter imbalance.
            rf = RandomForestClassifier(min_samples_leaf=2,
                                        n_estimators=n_estimators,
                                        n_jobs=-1,
                                        criterion='entropy',
                                        class_weight={0: 1, 1: 10})
            rf.fit(X_train.loc[y_tr.index], y_tr)
            # Calculate probabilities
            all_probs[val_key] = pd.Series(rf.predict_proba(X_v)[:, 1], index=X_v.index)
            all_probs[test_key] = pd.Series(rf.predict_proba(X_t)[:, 1], index=X_t.index)
            # Metrics are only defined when the split has positive labels.
            if y_val.sum() > 0:
                all_ap[val_key] = average_precision_score(y_val, all_probs[val_key])
                all_rocs[val_key] = roc_auc_score(y_val, all_probs[val_key])
            if y_test.sum() > 0:
                all_ap[test_key] = average_precision_score(y_test, all_probs[test_key])
                all_rocs[test_key] = roc_auc_score(y_test, all_probs[test_key])
            print(f'{oo}, rep {rep}, AP (val, test): {all_ap.get(val_key, np.nan):.3f}, {all_ap.get(test_key, np.nan):.3f}')
            print(f'\tROC (val, test): {all_rocs.get(val_key, np.nan):.3f}, {all_rocs.get(test_key, np.nan):.3f}')
            eval_scores.append(all_rocs.get(val_key, np.nan))
            test_scores.append(all_rocs.get(test_key, np.nan))

    # Aggregate ROC-AUC across all reps/tasks (sample standard deviation).
    eval_avg = np.mean(eval_scores)
    eval_std = np.std(eval_scores, ddof=1)
    test_avg = np.mean(test_scores)
    test_std = np.std(test_scores, ddof=1)
    print("eval: ", eval_scores)
    print("test: ", test_scores)
    print("%s | eval and test: %.4f (%.4f),%.4f (%.4f)" %
          (args.dataset_name, eval_avg, eval_std, test_avg, test_std))
        # NOTE(review): this chunk begins mid-method — the enclosing ``def``
        # (a multiprocess batch generator) is outside this view, so the
        # indentation below is reconstructed.
        worker = mp_reader.multiprocess_reader(worker_pool, use_pipe=True, queue_size=1000)
        r = paddle.reader.buffered(worker, self.buf_size)
        for batch in r():
            yield batch

    def scan(self):
        """scan"""
        # Plain sequential pass over the wrapped dataset, one example at a time.
        for example in self.dataset:
            yield example


if __name__ == "__main__":
    # Smoke test: load ogbg-molhiv, build the official splits, and log sizes.
    from base_dataset import BaseDataset, Subset
    dataset = GraphPropPredDataset(name="ogbg-molhiv")
    splitted_index = dataset.get_idx_split()
    train_dataset = Subset(dataset, splitted_index['train'])
    valid_dataset = Subset(dataset, splitted_index['valid'])
    test_dataset = Subset(dataset, splitted_index['test'])
    log.info("Train Examples: %s" % len(train_dataset))
    log.info("Val Examples: %s" % len(valid_dataset))
    log.info("Test Examples: %s" % len(test_dataset))
    # train_loader = GraphDataloader(train_dataset, batch_size=3)
    # for batch_data in train_loader:
    #     graphs, labels = batch_data
    #     print(labels.shape)
    #     time.sleep(4)
def main(config):
    """Train and evaluate a graph classifier with PaddlePaddle.

    Builds the OGB dataset with multi-hop edge augmentation, trains
    ``ClassifierNetwork`` with masked BCE loss (labels may contain NaNs),
    evaluates on valid/test every epoch, and tracks the best-validation
    results. TensorBoard logging and progress logs happen on rank 0 only.
    """
    if dist.get_world_size() > 1:
        dist.init_parallel_env()

    # TensorBoard writer only exists on rank 0; all add_scalar calls below
    # are guarded by the same rank check.
    if dist.get_rank() == 0:
        timestamp = datetime.now().strftime("%Hh%Mm%Ss")
        log_path = os.path.join(config.log_dir, "tensorboard_log_%s" % timestamp)
        writer = SummaryWriter(log_path)

    log.info("loading data")
    raw_dataset = GraphPropPredDataset(name=config.dataset_name)
    # Propagate dataset metadata into the run config.
    config.num_class = raw_dataset.num_tasks
    config.eval_metric = raw_dataset.eval_metric
    config.task_type = raw_dataset.task_type

    mol_dataset = MolDataset(config, raw_dataset, transform=make_multihop_edges)
    splitted_index = raw_dataset.get_idx_split()
    train_ds = Subset(mol_dataset, splitted_index['train'], mode='train')
    valid_ds = Subset(mol_dataset, splitted_index['valid'], mode="valid")
    test_ds = Subset(mol_dataset, splitted_index['test'], mode="test")

    log.info("Train Examples: %s" % len(train_ds))
    log.info("Val Examples: %s" % len(valid_ds))
    log.info("Test Examples: %s" % len(test_ds))

    fn = CollateFn(config)

    train_loader = Dataloader(train_ds, batch_size=config.batch_size, shuffle=True,
                              num_workers=config.num_workers, collate_fn=fn)
    valid_loader = Dataloader(valid_ds, batch_size=config.batch_size,
                              num_workers=config.num_workers, collate_fn=fn)
    test_loader = Dataloader(test_ds, batch_size=config.batch_size,
                             num_workers=config.num_workers, collate_fn=fn)

    model = ClassifierNetwork(config.hidden_size, config.out_dim, config.num_layers,
                              config.dropout_prob, config.virt_node, config.K,
                              config.conv_type, config.appnp_hop, config.alpha)
    model = paddle.DataParallel(model)

    optim = Adam(learning_rate=config.lr, parameters=model.parameters())
    criterion = nn.loss.BCEWithLogitsLoss()
    evaluator = Evaluator(config.dataset_name)

    best_valid = 0

    global_step = 0
    # NOTE(review): if the validation metric never exceeds 0,
    # ``best_valid_result``/``best_test_result`` are unbound at the logging
    # statements below — confirm whether that can occur for this metric.
    for epoch in range(1, config.epochs + 1):
        model.train()
        for idx, batch_data in enumerate(train_loader):
            g, mh_graphs, labels, unmask = batch_data
            g = g.tensor()
            multihop_graphs = []
            for item in mh_graphs:
                multihop_graphs.append(item.tensor())
            g.multi_hop_graphs = multihop_graphs
            labels = paddle.to_tensor(labels)
            unmask = paddle.to_tensor(unmask)

            pred = model(g)
            # Drop positions whose label was missing (unmask comes from the
            # collate function's NaN mask).
            pred = paddle.masked_select(pred, unmask)
            labels = paddle.masked_select(labels, unmask)
            train_loss = criterion(pred, labels)
            train_loss.backward()
            optim.step()
            optim.clear_grad()

            if global_step % 80 == 0:
                message = "train: epoch %d | step %d | " % (epoch, global_step)
                message += "loss %.6f" % (train_loss.numpy())
                log.info(message)
                if dist.get_rank() == 0:
                    writer.add_scalar("loss", train_loss.numpy(), global_step)
            global_step += 1

        valid_result = evaluate(model, valid_loader, criterion, evaluator)
        message = "valid: epoch %d | step %d | " % (epoch, global_step)
        for key, value in valid_result.items():
            message += " | %s %.6f" % (key, value)
            if dist.get_rank() == 0:
                writer.add_scalar("valid_%s" % key, value, global_step)
        log.info(message)

        test_result = evaluate(model, test_loader, criterion, evaluator)
        message = "test: epoch %d | step %d | " % (epoch, global_step)
        for key, value in test_result.items():
            message += " | %s %.6f" % (key, value)
            if dist.get_rank() == 0:
                writer.add_scalar("test_%s" % key, value, global_step)
        log.info(message)

        # Track the test result observed at the best validation epoch.
        if best_valid < valid_result[config.metrics]:
            best_valid = valid_result[config.metrics]
            best_valid_result = valid_result
            best_test_result = test_result

        message = "best result: epoch %d | " % (epoch)
        message += "valid %s: %.6f | " % (config.metrics, best_valid_result[config.metrics])
        message += "test %s: %.6f | " % (config.metrics, best_test_result[config.metrics])
        log.info(message)

    message = "final eval best result:%.6f" % best_valid_result[config.metrics]
    log.info(message)
    message = "final test best result:%.6f" % best_test_result[config.metrics]
    log.info(message)
    # NOTE(review): this chunk begins mid-function — likely the tail of a
    # collate ``__call__`` whose ``def`` is outside this view; indentation
    # below is reconstructed.
    # NaN != NaN, so (labels == labels) marks exactly the non-missing labels.
    batch_valid = (labels == labels).astype("bool")
    # Replace NaN labels with 0 so the tensor is well-formed; batch_valid
    # carries the mask of which entries were real.
    labels = np.nan_to_num(labels).astype("float32")
    g = pgl.Graph.batch(graph_list)
    multihop_graphs = []
    for g_list in multihop_graph_list:
        multihop_graphs.append(pgl.Graph.batch(g_list))
    return g, multihop_graphs, labels, batch_valid


if __name__ == "__main__":
    # Smoke test: build the dataset/splits and iterate a few collated batches.
    config = prepare_config("pcba_config.yaml", isCreate=False, isSave=False)
    raw_dataset = GraphPropPredDataset(name=config.dataset_name)
    ds = MolDataset(config, raw_dataset, transform=make_multihop_edges)
    splitted_index = raw_dataset.get_idx_split()
    train_ds = Subset(ds, splitted_index['train'], mode='train')
    valid_ds = Subset(ds, splitted_index['valid'], mode="valid")
    test_ds = Subset(ds, splitted_index['test'], mode="test")

    Fn = CollateFn(config)
    loader = Dataloader(train_ds, batch_size=3, shuffle=False, num_workers=4, collate_fn=Fn)
    for batch_data in loader:
        print("batch", batch_data[0][0].node_feat)
        g = pgl.Graph.batch(batch_data[0])
        print(g.node_feat)
        time.sleep(3)