def main(dataset_name):
    """Compute Morgan and MACCS fingerprints for every molecule of an OGB
    graph-property dataset and store them as int64 ``.npy`` feature files
    under ``./dataset/<name>/``."""
    # Instantiating the dataset triggers download/extraction of the mapping files.
    dataset = GraphPropPredDataset(name=dataset_name)
    df_smi = pd.read_csv(
        f"dataset/{dataset_name}/mapping/mol.csv.gz".replace("-", "_"))
    smiles = df_smi["smiles"]

    mgf_feat_list = []
    maccs_feat_list = []
    for idx in tqdm(range(len(smiles))):
        mol = AllChem.MolFromSmiles(smiles.iloc[idx])
        mgf_feat_list.append(getmorganfingerprint(mol))
        maccs_feat_list.append(getmaccsfingerprint(mol))

    mgf_feat = np.array(mgf_feat_list, dtype="int64")
    maccs_feat = np.array(maccs_feat_list, dtype="int64")
    print("morgan feature shape: ", mgf_feat.shape)
    print("maccs feature shape: ", maccs_feat.shape)

    # OGB stores datasets with "-" replaced by "_" on disk.
    save_path = f"./dataset/{dataset_name}".replace("-", "_")
    print("saving feature in %s" % save_path)
    np.save(os.path.join(save_path, "mgf_feat.npy"), mgf_feat)
    np.save(os.path.join(save_path, "maccs_feat.npy"), maccs_feat)
def infer(args):
    """Run the trained MgfModel over the test split and save the predicted
    soft Morgan-fingerprint features to ``dataset/<name>/soft_mgf_feat.npy``."""
    log.info("loading data")
    raw_dataset = GraphPropPredDataset(name=args.dataset_name)
    # propagate dataset metadata into the run config
    args.num_class = raw_dataset.num_tasks
    args.eval_metric = raw_dataset.eval_metric
    args.task_type = raw_dataset.task_type

    test_ds = MolDataset(args, raw_dataset, mode="test")
    collate = MgfCollateFn(args, mode="test")
    loader = Dataloader(test_ds,
                        batch_size=args.batch_size,
                        num_workers=1,
                        collate_fn=collate)
    loader = PDataset.from_generator_func(loader)

    est = propeller.Learner(MgfModel, args, args.model_config)
    batch_preds = list(
        est.predict(loader,
                    ckpt_path=args.model_path_for_infer,
                    split_batch=True))
    mgf = np.concatenate(batch_preds)

    log.info("saving features")
    np.save(
        "dataset/%s/soft_mgf_feat.npy" % (args.dataset_name.replace("-", "_")),
        mgf)
def __init__(self, path: str):
    """Build the static-graph wrapper for the OGB ``ogbg-code2`` dataset.

    :param path: root directory where the OGB dataset is stored/downloaded.
    """
    # BUG FIX: this wrapper previously loaded "ogbg-molhiv", but the class is
    # the code2 wrapper and reads the code2-specific per-node fields
    # ``node_is_attributed``/``node_dfs_order``/``node_depth`` below, which
    # molhiv graphs do not provide (would raise KeyError). Load ogbg-code2.
    ogbl_dataset = GraphPropPredDataset("ogbg-code2", path)
    idx_split: _typing.Mapping[str, np.ndarray] = ogbl_dataset.get_idx_split()
    train_index: _typing.Any = idx_split['train'].tolist()
    test_index: _typing.Any = idx_split['test'].tolist()
    val_index: _typing.Any = idx_split['valid'].tolist()
    super(OGBGCode2Dataset, self).__init__([
        GeneralStaticGraphGenerator.create_homogeneous_static_graph(
            ({
                # DGL backend names node features "feat"
                "feat": torch.from_numpy(data['node_feat']),
                "node_is_attributed": torch.from_numpy(data["node_is_attributed"]),
                "node_dfs_order": torch.from_numpy(data["node_dfs_order"]),
                "node_depth": torch.from_numpy(data["node_depth"])
            } if _backend.DependentBackend.is_dgl() else {
                # PyG backend names node features "x"
                "x": torch.from_numpy(data['node_feat']),
                "node_is_attributed": torch.from_numpy(data["node_is_attributed"]),
                "node_dfs_order": torch.from_numpy(data["node_dfs_order"]),
                "node_depth": torch.from_numpy(data["node_depth"])
            }),
            torch.from_numpy(data['edge_index']))
        for data, label in ogbl_dataset
    ], train_index, val_index, test_index)
def __init__(self, path: str):
    """Build the static-graph wrapper for the OGB ``ogbg-ppa`` dataset.

    :param path: root directory where the OGB dataset is stored/downloaded.
    """
    # BUG FIX: this wrapper previously loaded "ogbg-molhiv", but the class is
    # the PPA wrapper: it builds nodes from ``num_nodes`` only (ppa graphs
    # carry no node features) and attaches ``edge_feat``. Load ogbg-ppa.
    ogbl_dataset = GraphPropPredDataset("ogbg-ppa", path)
    idx_split: _typing.Mapping[str, np.ndarray] = ogbl_dataset.get_idx_split()
    train_index: _typing.Any = idx_split['train'].tolist()
    test_index: _typing.Any = idx_split['test'].tolist()
    val_index: _typing.Any = idx_split['valid'].tolist()
    super(OGBGPPADataset, self).__init__([
        GeneralStaticGraphGenerator.create_homogeneous_static_graph(
            # nodes carry only their id; ppa graphs have no node features
            {'_NID': torch.arange(data['num_nodes'])},
            torch.from_numpy(data['edge_index']),
            {'edge_feat': torch.from_numpy(data['edge_feat'])},
            ({
                # DGL backend stores the target under "label" ...
                'label': torch.from_numpy(label)
            } if _backend.DependentBackend.is_dgl() else {
                # ... while PyG uses "y"
                'y': torch.from_numpy(label)
            }))
        for data, label in ogbl_dataset
    ], train_index, val_index, test_index)
def __init__(self, args):
    """Load an OGB graph-property dataset and convert every graph into a
    ``pgl.graph.Graph``, keeping graphs and labels in parallel lists."""
    self.args = args
    self.raw_dataset = GraphPropPredDataset(name=args.dataset_name)
    # mirror dataset metadata on the wrapper
    self.num_tasks = self.raw_dataset.num_tasks
    self.eval_metrics = self.raw_dataset.eval_metric
    self.task_type = self.raw_dataset.task_type

    self.pgl_graph_list = []
    self.graph_label_list = []
    for graph, label in self.raw_dataset:
        src, dst = graph["edge_index"]
        g = pgl.graph.Graph(num_nodes=graph["num_nodes"],
                            edges=list(zip(src, dst)))
        if graph["edge_feat"] is not None:
            g.edge_feat["feat"] = graph["edge_feat"]
        if graph["node_feat"] is not None:
            g.node_feat["feat"] = graph["node_feat"]
        self.pgl_graph_list.append(g)
        self.graph_label_list.append(label)
def __init__(self, root, name):
    """Load an OGB graph-property dataset and convert each graph to the
    project's ``Graph`` type, accumulating total node/edge counts.

    :param root: storage directory for the OGB download.
    :param name: OGB dataset name, e.g. ``"ogbg-molhiv"``.
    """
    super(OGBGDataset, self).__init__(root)
    self.name = name
    self.dataset = GraphPropPredDataset(self.name, root)

    self.graphs = []
    self.all_nodes = 0
    self.all_edges = 0
    for i in range(len(self.dataset.graphs)):
        graph, label = self.dataset[i]
        # BUG FIX: "edge_feat" can be present in the dict but set to None
        # (datasets without edge features); the old key-presence test then
        # called torch.tensor(None) and crashed. Test the value instead.
        edge_feat = graph.get("edge_feat")
        data = Graph(
            x=torch.tensor(graph["node_feat"], dtype=torch.float),
            edge_index=torch.tensor(graph["edge_index"]),
            edge_attr=None if edge_feat is None else torch.tensor(
                edge_feat, dtype=torch.float),
            y=torch.tensor(label),
        )
        data.num_nodes = graph["num_nodes"]
        self.graphs.append(data)

        self.all_nodes += graph["num_nodes"]
        self.all_edges += graph["edge_index"].shape[1]

    self.transform = None
from spektral.data import DisjointLoader
from spektral.datasets import OGB
from spektral.layers import ECCConv, GlobalSumPool

################################################################################
# Config
################################################################################
learning_rate = 1e-3  # Learning rate
epochs = 10  # Number of training epochs
batch_size = 32  # Batch size

################################################################################
# Load data
################################################################################
dataset_name = "ogbg-molhiv"
ogb_dataset = GraphPropPredDataset(name=dataset_name)
# Wrap the raw OGB dataset in Spektral's adapter so it exposes
# n_node_features / n_edge_features / n_labels and supports index slicing.
dataset = OGB(ogb_dataset)

# Parameters
F = dataset.n_node_features  # Dimension of node features
S = dataset.n_edge_features  # Dimension of edge features
n_out = dataset.n_labels  # Dimension of the target

# Train/test split — use OGB's official scaffold split indices.
idx = ogb_dataset.get_idx_split()
idx_tr, idx_va, idx_te = idx["train"], idx["valid"], idx["test"]
dataset_tr = dataset[idx_tr]
dataset_va = dataset[idx_va]
dataset_te = dataset[idx_te]

# DisjointLoader batches graphs into one disjoint union per batch.
loader_tr = DisjointLoader(dataset_tr, batch_size=batch_size, epochs=epochs)
def main(_):
    """Train and evaluate a Keras model on ogbg-molhiv under a distribution
    strategy, with optional gradient accumulation, checkpointing, LR
    scheduling and early stopping; final metrics use the official OGB
    evaluator."""
    # Select compute precision globally from the command-line flag.
    tf.keras.mixed_precision.set_global_policy(
        "float16" if FLAGS.dtype == 'float16' else "float32")
    dset_name = 'ogbg-molhiv'
    dataset = GraphPropPredDataset(name=dset_name, )
    split_idx = dataset.get_idx_split()
    train_idx, valid_idx, test_idx = split_idx["train"], split_idx["valid"], split_idx["test"]
    ds = data.get_tf_dataset(FLAGS.batch_size, [dataset[idx] for idx in train_idx], shuffle=True)
    val_ds = data.get_tf_dataset(FLAGS.batch_size, [dataset[idx] for idx in valid_idx], shuffle=False)

    strategy = xpu.configure_and_get_strategy()

    # Gradient accumulation factor: how many compute batches form one
    # effective (total) batch.
    if FLAGS.total_batch_size is not None:
        gradient_accumulation_factor = FLAGS.total_batch_size // FLAGS.batch_size
    else:
        gradient_accumulation_factor = 1

    # pre-calculated number of steps per epoch (note: will vary somewhat for
    # training, due to packing, but is found to be fairly consistent)
    # keyed by batch size -> (train, valid, test) steps
    steps = {
        32: (1195, 162, 148),
        64: (585, 80, 73),
        128: (288, 40, 37),
        256: (143, 20, 18)
    }
    try:
        steps_per_epoch, val_steps_per_epoch, test_steps_per_epoch = steps[FLAGS.batch_size]
    except KeyError:
        print("Batch size should have the number of steps defined")
        raise KeyError()

    # need the steps per epoch to be divisible by the gradient accumulation factor
    steps_per_epoch = gradient_accumulation_factor * (steps_per_epoch // gradient_accumulation_factor)

    # we apply a linear scaling rule for learning rate with batch size,
    # which we benchmark against BS=128
    batch_size = FLAGS.total_batch_size or FLAGS.batch_size
    lr = FLAGS.lr * batch_size / 128

    with strategy.scope():
        model = create_model()
        utils.print_trainable_variables(model)

        losses = tf.keras.losses.BinaryCrossentropy()

        if FLAGS.opt.lower() == 'sgd':
            opt = tf.keras.optimizers.SGD(learning_rate=lr)
        elif FLAGS.opt.lower() == 'adam':
            opt = tf.keras.optimizers.Adam(learning_rate=lr)
        else:
            raise NotImplementedError()

        callbacks = []

        if not os.path.isdir(FLAGS.model_dir):
            os.makedirs(FLAGS.model_dir)
        # randomly named directory so concurrent runs don't collide
        model_dir = os.path.join(FLAGS.model_dir, str(uuid.uuid4()))
        print(f"Saving weights to {model_dir}")
        model_path = os.path.join(model_dir, 'model')

        # keep only the best-validation-loss weights
        callbacks.append(tf.keras.callbacks.ModelCheckpoint(
            model_path, monitor="val_loss", verbose=1, save_best_only=True,
            save_weights_only=True, mode="min", save_freq="epoch")
        )
        callbacks.append(ThroughputCallback(
            samples_per_epoch=steps_per_epoch * FLAGS.batch_size * gradient_accumulation_factor))

        if FLAGS.reduce_lr_on_plateau_patience > 0:
            callbacks.append(tf.keras.callbacks.ReduceLROnPlateau(
                monitor='val_loss', mode='min',
                factor=FLAGS.reduce_lr_on_plateau_factor,
                patience=FLAGS.reduce_lr_on_plateau_patience,
                min_lr=1e-8, verbose=1)
            )

        if FLAGS.early_stopping_patience > 0:
            print(f"Training will stop early after {FLAGS.early_stopping_patience} epochs without improvement.")
            callbacks.append(
                tf.keras.callbacks.EarlyStopping(
                    monitor='val_loss', min_delta=0,
                    patience=FLAGS.early_stopping_patience, verbose=1,
                    mode='min', baseline=None, restore_best_weights=False)
            )

        # weighted metrics are used because of the batch packing
        model.compile(optimizer=opt, loss=losses,
                      weighted_metrics=[tf.keras.metrics.BinaryAccuracy(), tf.keras.metrics.AUC()],
                      steps_per_execution=steps_per_epoch)
        # if the total batch size exceeds the compute batch size
        model.set_gradient_accumulation_options(
            gradient_accumulation_steps_per_replica=gradient_accumulation_factor)
        model.fit(ds,
                  steps_per_epoch=steps_per_epoch,
                  epochs=FLAGS.epochs,
                  validation_data=val_ds,
                  validation_steps=val_steps_per_epoch,
                  callbacks=callbacks
                  )

        # we will use the official AUC evaluator from the OGB repo, not the keras one
        model.load_weights(model_path)
        print("Loaded best validation weights for evaluation")
        evaluator = Evaluator(name='ogbg-molhiv')
        for test_or_val, idx, steps in zip(
                ('validation', 'test'),
                (valid_idx, test_idx),
                (val_steps_per_epoch, test_steps_per_epoch)):
            prediction, ground_truth = get_predictions(model, dataset, idx, steps)
            # evaluator expects column vectors, hence the [:, None]
            result = evaluator.eval({'y_true': ground_truth[:, None],
                                     'y_pred': prediction[:, None]})
            print(f'Final {test_or_val} ROC-AUC {result["rocauc"]:.3f}')
def main(args):
    """Fit random forests on concatenated (Morgan, soft-Morgan, MACCS)
    fingerprint features for each task of an OGB molecule dataset, repeated
    ``run_times`` times, and report validation/test ROC-AUC statistics."""
    all_probs = {}   # (dataset, task, rep, split) -> predicted probabilities
    all_ap = {}      # same keys -> average precision
    all_rocs = {}    # same keys -> ROC-AUC
    train_label_props = {}  # (dataset, task, rep) -> positive-label rate
    n_estimators = 1000
    max_tasks = None  # None = no cap on number of tasks evaluated
    run_times = 10
    eval_scores = []
    test_scores = []

    # feature files are produced by the fingerprint-extraction scripts
    mgf_file = "./dataset/%s/mgf_feat.npy" % (args.dataset_name.replace("-", "_"))
    soft_mgf_file = "./dataset/%s/soft_mgf_feat.npy" % (args.dataset_name.replace("-", "_"))
    maccs_file = "./dataset/%s/maccs_feat.npy" % (args.dataset_name.replace("-", "_"))
    mgf_feat = np.load(mgf_file)
    soft_mgf_feat = np.load(soft_mgf_file)
    maccs_feat = np.load(maccs_file)
    mgf_dim = mgf_feat.shape[1]
    maccs_dim = maccs_feat.shape[1]

    dataset = GraphPropPredDataset(name=args.dataset_name)
    smiles_file = "dataset/%s/mapping/mol.csv.gz" % (args.dataset_name.replace("-", "_"))
    df_smi = pd.read_csv(smiles_file)
    smiles = df_smi["smiles"]
    # remaining columns (after dropping mol_id) are the per-task labels
    outcomes = df_smi.set_index("smiles").drop(["mol_id"], axis=1)

    # feature matrix indexed by SMILES so labels align via .loc
    feat = np.concatenate([mgf_feat, soft_mgf_feat, maccs_feat], axis=1)
    X = pd.DataFrame(feat, index=smiles, columns=[i for i in range(feat.shape[1])])

    # Split into train/val/test
    split_idx = dataset.get_idx_split()
    train_idx, val_idx, test_idx = split_idx["train"], split_idx["valid"], split_idx["test"]
    X_train, X_val, X_test = X.iloc[train_idx], X.iloc[val_idx], X.iloc[test_idx]

    for rep in range(run_times):
        for oo in tqdm(outcomes.columns[:max_tasks]):
            # Get probabilities
            val_key = args.dataset_name, oo, rep, "val"
            test_key = args.dataset_name, oo, rep, "test"

            # If re-running, skip finished runs
            if val_key in all_probs:
                print("Skipping", val_key[:-1])
                continue

            # Split outcome in to train/val/test
            Y = outcomes[oo]
            y_train, y_val, y_test = Y.loc[X_train.index], Y.loc[X_val.index], Y.loc[X_test.index]

            # Skip outcomes with no positive training examples
            if y_train.sum() == 0:
                continue

            # Remove missing labels in validation
            y_val, y_test = y_val.dropna(), y_test.dropna()
            X_v, X_t = X_val.loc[y_val.index], X_test.loc[y_test.index]

            # Remove missing values in the training labels, and downsample
            # imbalance to cut runtime
            # NOTE(review): only the dropna happens here; no downsampling
            # code is present — imbalance is instead handled by class_weight
            # below. Confirm whether downsampling was intended.
            y_tr = y_train.dropna()
            train_label_props[args.dataset_name, oo, rep] = y_tr.mean()
            print(f"Sampled label balance:\n{y_tr.value_counts()}")

            # Fit model
            print("Fitting model...")
            rf = RandomForestClassifier(min_samples_leaf=2,
                                        n_estimators=n_estimators,
                                        n_jobs=-1,
                                        criterion='entropy',
                                        class_weight={0: 1, 1: 10}
                                        )
            rf.fit(X_train.loc[y_tr.index], y_tr)

            # Calculate probabilities (probability of the positive class)
            all_probs[val_key] = pd.Series(rf.predict_proba(X_v)[:, 1], index=X_v.index)
            all_probs[test_key] = pd.Series(rf.predict_proba(X_t)[:, 1], index=X_t.index)

            # metrics are undefined without at least one positive example
            if y_val.sum() > 0:
                all_ap[val_key] = average_precision_score(y_val, all_probs[val_key])
                all_rocs[val_key] = roc_auc_score(y_val, all_probs[val_key])
            if y_test.sum() > 0:
                all_ap[test_key] = average_precision_score(y_test, all_probs[test_key])
                all_rocs[test_key] = roc_auc_score(y_test, all_probs[test_key])

            print(f'{oo}, rep {rep}, AP (val, test): {all_ap.get(val_key, np.nan):.3f}, {all_ap.get(test_key, np.nan):.3f}')
            print(f'\tROC (val, test): {all_rocs.get(val_key, np.nan):.3f}, {all_rocs.get(test_key, np.nan):.3f}')
            eval_scores.append(all_rocs.get(val_key, np.nan))
            test_scores.append(all_rocs.get(test_key, np.nan))

    # aggregate ROC-AUC over all (task, rep) runs; ddof=1 -> sample std-dev
    eval_avg = np.mean(eval_scores)
    eval_std = np.std(eval_scores, ddof=1)
    test_avg = np.mean(test_scores)
    test_std = np.std(test_scores, ddof=1)
    print("eval: ", eval_scores)
    print("test: ", test_scores)
    print("%s | eval and test: %.4f (%.4f),%.4f (%.4f)" %
          (args.dataset_name, eval_avg, eval_std, test_avg, test_std))
        # NOTE(review): the enclosing method's definition is above this chunk
        # (out of view); these lines fan worker readers out over processes and
        # yield buffered batches.
        worker = mp_reader.multiprocess_reader(worker_pool, use_pipe=True, queue_size=1000)
        r = paddle.reader.buffered(worker, self.buf_size)
        for batch in r():
            yield batch

    def scan(self):
        """Yield every raw example of the wrapped dataset, in order."""
        for example in self.dataset:
            yield example


if __name__ == "__main__":
    # Smoke test: load ogbg-molhiv, build the official split subsets and
    # report their sizes.
    from base_dataset import BaseDataset, Subset
    dataset = GraphPropPredDataset(name="ogbg-molhiv")
    splitted_index = dataset.get_idx_split()
    train_dataset = Subset(dataset, splitted_index['train'])
    valid_dataset = Subset(dataset, splitted_index['valid'])
    test_dataset = Subset(dataset, splitted_index['test'])
    log.info("Train Examples: %s" % len(train_dataset))
    log.info("Val Examples: %s" % len(valid_dataset))
    log.info("Test Examples: %s" % len(test_dataset))
    # train_loader = GraphDataloader(train_dataset, batch_size=3)
    # for batch_data in train_loader:
    #     graphs, labels = batch_data
    #     print(labels.shape)
    #     time.sleep(4)
        # NOTE(review): tail of a method defined above this chunk (out of
        # view) — returns the accumulated atom embedding.
        return x_embedding


class BondEncoder(nn.Layer):
    """Embed categorical bond (edge) features: one Embedding table per
    feature column, summed into a single ``emb_dim``-wide vector."""

    def __init__(self, emb_dim):
        super(BondEncoder, self).__init__()

        self.bond_embedding_list = nn.LayerList()

        for i, dim in enumerate(full_bond_feature_dims):
            # Xavier-initialised embedding table for feature column i
            weight_attr = nn.initializer.XavierUniform()
            emb = paddle.nn.Embedding(dim, emb_dim, weight_attr=weight_attr)
            self.bond_embedding_list.append(emb)

    def forward(self, edge_attr):
        # Sum the per-column embeddings; starts from scalar 0 and relies on
        # broadcasting for the first addition.
        bond_embedding = 0
        for i in range(edge_attr.shape[1]):
            bond_embedding += self.bond_embedding_list[i](edge_attr[:, i])

        return bond_embedding


if __name__ == '__main__':
    from ogb.graphproppred import GraphPropPredDataset
    dataset = GraphPropPredDataset(name='ogbg-molpcba')
    atom_enc = AtomEncoder(100)
    bond_enc = BondEncoder(100)
    # NOTE(review): GraphPropPredDataset[0] returns a (graph_dict, label)
    # tuple, so the attribute access below (.x / .edge_attr) looks like it
    # belongs to the PyG variant — confirm this smoke test actually runs.
    print(atom_enc(dataset[0].x))
    print(bond_enc(dataset[0].edge_attr))
def test_datasetsaver():
    """End-to-end exercise of DatasetSaver: load one dataset of the selected
    task category, re-save it (graphs, labels, split, mapping, task info),
    then reload the saved copy through every library binding and zip it."""
    # test on graph classification
    # ogbg-molhiv
    # select which task category to test: 'graph' | 'node' | 'link' |
    # 'heteronode' | 'heterolink'
    test_task = 'link'

    # testing all the dataset objects are working.
    if test_task == 'graph':
        from ogb.graphproppred import PygGraphPropPredDataset, DglGraphPropPredDataset, GraphPropPredDataset
        dataset_name = 'ogbg-molhiv'
        dataset = PygGraphPropPredDataset(dataset_name)
        dataset.get_idx_split()
        dataset = DglGraphPropPredDataset(dataset_name)
        dataset.get_idx_split()
        dataset = GraphPropPredDataset(dataset_name)
        dataset.get_idx_split()
    elif test_task == 'node':
        from ogb.nodeproppred import NodePropPredDataset, PygNodePropPredDataset, DglNodePropPredDataset
        dataset_name = 'ogbn-arxiv'  # test ogbn-proteins
        dataset = PygNodePropPredDataset(dataset_name)
        dataset.get_idx_split()
        dataset = DglNodePropPredDataset(dataset_name)
        dataset.get_idx_split()
        dataset = NodePropPredDataset(dataset_name)
        dataset.get_idx_split()
    elif test_task == 'link':
        from ogb.linkproppred import LinkPropPredDataset, PygLinkPropPredDataset, DglLinkPropPredDataset
        dataset_name = 'ogbl-collab'
        dataset = PygLinkPropPredDataset(dataset_name)
        dataset.get_edge_split()
        dataset = DglLinkPropPredDataset(dataset_name)
        dataset.get_edge_split()
        dataset = LinkPropPredDataset(dataset_name)
        dataset.get_edge_split()
    elif test_task == 'heteronode':
        from ogb.nodeproppred import NodePropPredDataset, PygNodePropPredDataset, DglNodePropPredDataset
        dataset_name = 'ogbn-mag'
        dataset = PygNodePropPredDataset(dataset_name)
        dataset.get_idx_split()
        dataset = DglNodePropPredDataset(dataset_name)
        dataset.get_idx_split()
        dataset = NodePropPredDataset(dataset_name)
        dataset.get_idx_split()
    elif test_task == 'heterolink':
        from ogb.linkproppred import LinkPropPredDataset, PygLinkPropPredDataset, DglLinkPropPredDataset
        dataset_name = 'ogbl-biokg'
        dataset = PygLinkPropPredDataset(dataset_name)
        dataset.get_edge_split()
        dataset = DglLinkPropPredDataset(dataset_name)
        dataset.get_edge_split()
        dataset = LinkPropPredDataset(dataset_name)
        dataset.get_edge_split()
    else:
        raise ValueError('Invalid task category')

    print(dataset[0])
    # link datasets expose edge splits; the others expose index splits
    if 'link' in test_task:
        print(dataset.get_edge_split())
    else:
        print(dataset.get_idx_split())

    # graph datasets hold a list of graphs; node/link datasets hold one graph
    if 'graph' in test_task:
        graph_list = dataset.graphs
    else:
        graph_list = [dataset.graph]

    # link-prediction datasets have no labels to save
    if 'link' not in test_task:
        labels = dataset.labels

    is_hetero = 'hetero' in test_task
    version = 2 if dataset_name == 'ogbn-mag' else 1
    saver = DatasetSaver(dataset_name, is_hetero, version=version)

    # saving graph objects
    saver.save_graph_list(graph_list)
    # saving target labels
    if 'link' not in test_task:
        saver.save_target_labels(labels)
    # saving split
    if 'link' in test_task:
        split_idx = dataset.get_edge_split()
    else:
        split_idx = dataset.get_idx_split()
    # second argument must be the name of the split
    saver.save_split(split_idx, dataset.meta_info['split'])
    # copying mapping dir
    # saver.copy_mapping_dir(f"dataset/{'_'.join(dataset_name.split('-'))}/mapping/")
    saver.copy_mapping_dir("dataset/{}/mapping/".format('_'.join(
        dataset_name.split('-'))))

    saver.save_task_info(
        dataset.task_type, dataset.eval_metric,
        dataset.num_classes if hasattr(dataset, 'num_classes') else None)

    meta_dict = saver.get_meta_dict()
    print(meta_dict)

    print('Now testing.')

    # Reload the just-saved dataset through each library binding (each is
    # constructed twice: first call processes, second reads the cache).
    if 'graph' in test_task:
        print('library agnostic')
        dataset = GraphPropPredDataset(dataset_name, meta_dict=meta_dict)
        dataset = GraphPropPredDataset(dataset_name, meta_dict=meta_dict)
        print(dataset[0])
        print(dataset.get_idx_split())
        print('Pytorch Geometric')
        dataset = PygGraphPropPredDataset(dataset_name, meta_dict=meta_dict)
        dataset = PygGraphPropPredDataset(dataset_name, meta_dict=meta_dict)
        print(dataset[0])
        print(dataset.get_idx_split())
        print('DGL')
        dataset = DglGraphPropPredDataset(dataset_name, meta_dict=meta_dict)
        dataset = DglGraphPropPredDataset(dataset_name, meta_dict=meta_dict)
        print(dataset[0])
        print(dataset.get_idx_split())
    elif 'node' in test_task:
        print('library agnostic')
        dataset = NodePropPredDataset(dataset_name, meta_dict=meta_dict)
        dataset = NodePropPredDataset(dataset_name, meta_dict=meta_dict)
        print(dataset[0])
        print(dataset.get_idx_split())
        print('Pytorch Geometric')
        dataset = PygNodePropPredDataset(dataset_name, meta_dict=meta_dict)
        dataset = PygNodePropPredDataset(dataset_name, meta_dict=meta_dict)
        print(dataset[0])
        print(dataset.get_idx_split())
        print('DGL')
        dataset = DglNodePropPredDataset(dataset_name, meta_dict=meta_dict)
        dataset = DglNodePropPredDataset(dataset_name, meta_dict=meta_dict)
        print(dataset[0])
        print(dataset.get_idx_split())
    elif 'link' in test_task:
        print('library agnostic')
        dataset = LinkPropPredDataset(dataset_name, meta_dict=meta_dict)
        dataset = LinkPropPredDataset(dataset_name, meta_dict=meta_dict)
        print(dataset[0])
        # print(dataset.get_edge_split())
        print('Pytorch Geometric')
        dataset = PygLinkPropPredDataset(dataset_name, meta_dict=meta_dict)
        dataset = PygLinkPropPredDataset(dataset_name, meta_dict=meta_dict)
        print(dataset[0])
        # print(dataset.get_edge_split())
        print('DGL')
        dataset = DglLinkPropPredDataset(dataset_name, meta_dict=meta_dict)
        dataset = DglLinkPropPredDataset(dataset_name, meta_dict=meta_dict)
        print(dataset[0])
        # print(dataset.get_edge_split())
    else:
        raise ValueError('Invalid task category')

    # zip
    saver.zip()
    print('Finished zipping!')

    saver.cleanup()
def main(config):
    """Train and evaluate the multi-hop ClassifierNetwork on an OGB
    graph-property dataset with PaddlePaddle, optionally data-parallel;
    rank 0 additionally writes TensorBoard scalars."""
    if dist.get_world_size() > 1:
        dist.init_parallel_env()
    if dist.get_rank() == 0:
        # only rank 0 owns the TensorBoard writer
        timestamp = datetime.now().strftime("%Hh%Mm%Ss")
        log_path = os.path.join(config.log_dir, "tensorboard_log_%s" % timestamp)
        writer = SummaryWriter(log_path)

    log.info("loading data")
    raw_dataset = GraphPropPredDataset(name=config.dataset_name)
    # propagate dataset metadata into the run config
    config.num_class = raw_dataset.num_tasks
    config.eval_metric = raw_dataset.eval_metric
    config.task_type = raw_dataset.task_type

    # transform adds multi-hop edge structures to each graph
    mol_dataset = MolDataset(config, raw_dataset, transform=make_multihop_edges)
    splitted_index = raw_dataset.get_idx_split()
    train_ds = Subset(mol_dataset, splitted_index['train'], mode='train')
    valid_ds = Subset(mol_dataset, splitted_index['valid'], mode="valid")
    test_ds = Subset(mol_dataset, splitted_index['test'], mode="test")

    log.info("Train Examples: %s" % len(train_ds))
    log.info("Val Examples: %s" % len(valid_ds))
    log.info("Test Examples: %s" % len(test_ds))

    fn = CollateFn(config)

    train_loader = Dataloader(train_ds, batch_size=config.batch_size, shuffle=True,
                              num_workers=config.num_workers, collate_fn=fn)
    valid_loader = Dataloader(valid_ds, batch_size=config.batch_size,
                              num_workers=config.num_workers, collate_fn=fn)
    test_loader = Dataloader(test_ds, batch_size=config.batch_size,
                             num_workers=config.num_workers, collate_fn=fn)

    model = ClassifierNetwork(config.hidden_size, config.out_dim,
                              config.num_layers, config.dropout_prob,
                              config.virt_node, config.K, config.conv_type,
                              config.appnp_hop, config.alpha)
    model = paddle.DataParallel(model)

    optim = Adam(learning_rate=config.lr, parameters=model.parameters())
    # multi-task binary classification -> BCE on logits
    criterion = nn.loss.BCEWithLogitsLoss()
    evaluator = Evaluator(config.dataset_name)

    best_valid = 0
    global_step = 0
    for epoch in range(1, config.epochs + 1):
        model.train()
        for idx, batch_data in enumerate(train_loader):
            g, mh_graphs, labels, unmask = batch_data
            g = g.tensor()
            multihop_graphs = []
            for item in mh_graphs:
                multihop_graphs.append(item.tensor())
            g.multi_hop_graphs = multihop_graphs
            labels = paddle.to_tensor(labels)
            unmask = paddle.to_tensor(unmask)

            pred = model(g)
            # train only on positions whose label is valid (unmask True)
            pred = paddle.masked_select(pred, unmask)
            labels = paddle.masked_select(labels, unmask)
            train_loss = criterion(pred, labels)
            train_loss.backward()
            optim.step()
            optim.clear_grad()

            if global_step % 80 == 0:
                message = "train: epoch %d | step %d | " % (epoch, global_step)
                message += "loss %.6f" % (train_loss.numpy())
                log.info(message)
                if dist.get_rank() == 0:
                    writer.add_scalar("loss", train_loss.numpy(), global_step)
            global_step += 1

        valid_result = evaluate(model, valid_loader, criterion, evaluator)
        message = "valid: epoch %d | step %d | " % (epoch, global_step)
        for key, value in valid_result.items():
            message += " | %s %.6f" % (key, value)
            if dist.get_rank() == 0:
                writer.add_scalar("valid_%s" % key, value, global_step)
        log.info(message)

        test_result = evaluate(model, test_loader, criterion, evaluator)
        message = "test: epoch %d | step %d | " % (epoch, global_step)
        for key, value in test_result.items():
            message += " | %s %.6f" % (key, value)
            if dist.get_rank() == 0:
                writer.add_scalar("test_%s" % key, value, global_step)
        log.info(message)

        # track the epoch with the best validation metric (higher is better)
        if best_valid < valid_result[config.metrics]:
            best_valid = valid_result[config.metrics]
            best_valid_result = valid_result
            best_test_result = test_result

        # NOTE(review): if the metric never exceeds 0 in epoch 1,
        # best_valid_result is unbound here — confirm the metric is positive.
        message = "best result: epoch %d | " % (epoch)
        message += "valid %s: %.6f | " % (config.metrics,
                                          best_valid_result[config.metrics])
        message += "test %s: %.6f | " % (config.metrics,
                                         best_test_result[config.metrics])
        log.info(message)

    message = "final eval best result:%.6f" % best_valid_result[config.metrics]
    log.info(message)
    message = "final test best result:%.6f" % best_test_result[config.metrics]
    log.info(message)
def train(args, pretrained_model_config=None):
    """Train MgfModel on an OGB dataset via propeller's train_and_eval loop,
    optionally warm-starting parameters from a checkpoint directory.

    :param args: run configuration (dataset_name, batch_size, epochs, ...);
        mutated in place with dataset metadata and eval_steps.
    :param pretrained_model_config: optional model params forwarded to
        propeller as the model configuration.
    """
    log.info("loading data")
    raw_dataset = GraphPropPredDataset(name=args.dataset_name)
    # propagate dataset metadata into the run config
    args.num_class = raw_dataset.num_tasks
    args.eval_metric = raw_dataset.eval_metric
    args.task_type = raw_dataset.task_type

    train_ds = MolDataset(args, raw_dataset)

    # evaluate once per epoch: eval_steps == number of train batches
    args.eval_steps = math.ceil(len(train_ds) / args.batch_size)
    log.info("Total %s steps (eval_steps) every epoch." % (args.eval_steps))

    fn = MgfCollateFn(args)

    train_loader = Dataloader(train_ds,
                              batch_size=args.batch_size,
                              num_workers=args.num_workers,
                              shuffle=args.shuffle,
                              stream_shuffle_size=args.shuffle_size,
                              collate_fn=fn)
    # for evaluating
    eval_train_loader = train_loader
    eval_train_loader = PDataset.from_generator_func(eval_train_loader)

    train_loader = multi_epoch_dataloader(train_loader, args.epochs)
    train_loader = PDataset.from_generator_func(train_loader)

    if args.warm_start_from is not None:
        # warm start setting: restore only parameters whose saved files
        # exist under the warm-start directory
        def _fn(v):
            if not isinstance(v, F.framework.Parameter):
                return False
            if os.path.exists(os.path.join(args.warm_start_from, v.name)):
                return True
            else:
                return False

        ws = propeller.WarmStartSetting(predicate_fn=_fn,
                                        from_dir=args.warm_start_from)
    else:
        ws = None

    def cmp_fn(old, new):
        # NOTE(review): "better" here means the metric DECREASED
        # (old - new > 0), i.e. a lower-is-better metric — confirm
        # args.metrics is indeed loss-like.
        if old['eval'][args.metrics] - new['eval'][args.metrics] > 0:
            log.info("best %s eval result: %s" % (args.metrics, new['eval']))
            return True
        else:
            return False

    # only run id 5 persists the best model when log_id is set
    if args.log_id is not None:
        save_best_model = int(args.log_id) == 5
    else:
        save_best_model = True
    best_exporter = propeller.exporter.BestResultExporter(
        args.output_dir, (cmp_fn, save_best_model))

    eval_datasets = {"eval": eval_train_loader}

    propeller.train.train_and_eval(
        model_class_or_model_fn=MgfModel,
        params=pretrained_model_config,
        run_config=args,
        train_dataset=train_loader,
        eval_dataset=eval_datasets,
        warm_start_setting=ws,
        exporters=[best_exporter],
    )
        # NOTE(review): tail of a collate function whose definition is above
        # this chunk (out of view).
        labels = np.array(labels)
        # NaN != NaN, so this marks exactly the non-missing label positions
        batch_valid = (labels == labels).astype("bool")
        labels = np.nan_to_num(labels).astype("float32")
        # merge the per-sample graphs into one batched disjoint-union graph
        g = pgl.Graph.batch(graph_list)
        multihop_graphs = []
        for g_list in multihop_graph_list:
            multihop_graphs.append(pgl.Graph.batch(g_list))
        return g, multihop_graphs, labels, batch_valid


if __name__ == "__main__":
    # Smoke test: build the molpcba pipeline and batch a few samples.
    config = prepare_config("pcba_config.yaml", isCreate=False, isSave=False)
    raw_dataset = GraphPropPredDataset(name=config.dataset_name)
    ds = MolDataset(config, raw_dataset, transform=make_multihop_edges)
    splitted_index = raw_dataset.get_idx_split()
    train_ds = Subset(ds, splitted_index['train'], mode='train')
    valid_ds = Subset(ds, splitted_index['valid'], mode="valid")
    test_ds = Subset(ds, splitted_index['test'], mode="test")
    Fn = CollateFn(config)
    loader = Dataloader(train_ds,
                        batch_size=3,
                        shuffle=False,
                        num_workers=4,
                        collate_fn=Fn)
    for batch_data in loader:
        print("batch", batch_data[0][0].node_feat)
        g = pgl.Graph.batch(batch_data[0])