def main(args): """tbd""" model_config = json.load(open(args.model_config, 'r')) model_config['context_pooling'] = args.context_pooling ### build model train_prog = fluid.Program() test_prog = fluid.Program() startup_prog = fluid.Program() with fluid.program_guard(train_prog, startup_prog): with fluid.unique_name.guard(): model = PreGNNContextpredModel(model_config) model.forward() opt = fluid.optimizer.Adam(learning_rate=args.lr) if args.distributed: opt = get_distributed_optimizer(opt) opt.minimize(model.loss) with fluid.program_guard(test_prog, fluid.Program()): with fluid.unique_name.guard(): model = PreGNNContextpredModel(model_config) model.forward(is_test=True) place = fluid.CUDAPlace(int(os.environ.get('FLAGS_selected_gpus', 0))) \ if args.use_cuda else fluid.CPUPlace() exe = fluid.Executor(place) exe.run(startup_prog) if not args.init_model is None and not args.init_model == "": load_partial_params(exe, args.init_model, train_prog) ### load data k = model_config['layer_num'] l1 = k - 1 l2 = l1 + args.context_size featurizer = PreGNNContextPredFeaturizer( model.substruct_graph_wrapper, model.context_graph_wrapper, k, l1, l2) dataset = load_zinc_dataset(args.data_path, featurizer=featurizer) splitter = RandomSplitter() train_dataset, _, test_dataset = splitter.split( dataset, frac_train=0.9, frac_valid=0, frac_test=0.1) if args.distributed: indices = list(range(fleet.worker_num(), len(train_dataset), fleet.worker_index())) train_dataset = train_dataset[indices] print("Train/Test num: %s/%s" % (len(train_dataset), len(test_dataset))) ### start train list_test_loss = [] for epoch_id in range(args.max_epoch): train_loss = train(args, exe, train_prog, model, train_dataset, featurizer) test_loss = evaluate(args, exe, test_prog, model, test_dataset, featurizer) if not args.distributed or fleet.worker_index() == 0: fluid.io.save_params(exe, '%s/epoch%s' % (args.model_dir, epoch_id), train_prog) list_test_loss.append(test_loss) print("epoch:%d train/loss:%s" % (epoch_id, train_loss)) print("epoch:%d test/loss:%s" % (epoch_id, test_loss)) if not args.distributed or fleet.worker_index() == 0: best_epoch_id = np.argmax(list_test_loss) fluid.io.load_params(exe, '%s/epoch%d' % (args.model_dir, best_epoch_id), train_prog) fluid.io.save_params(exe, '%s/epoch_best' % (args.model_dir), train_prog) return list_test_loss[best_epoch_id]
def main(args): """ Call the configuration function of the model, build the model and load data, then start training. model_config: a json file with the model configurations,such as dropout rate ,learning rate,num tasks and so on; lr: It means the learning rate of different optimizer; PreGNNAttrmaskModel: It is an unsupervised pretraining model which randomly masks the atom type of some node and then use the masked atom type as the prediction targets. """ model_config = json.load(open(args.model_config, 'r')) if not args.dropout_rate is None: model_config['dropout_rate'] = args.dropout_rate ### build model train_prog = fluid.Program() test_prog = fluid.Program() startup_prog = fluid.Program() with fluid.program_guard(train_prog, startup_prog): with fluid.unique_name.guard(): model = PreGNNAttrmaskModel(model_config) model.forward() opt = fluid.optimizer.Adam(learning_rate=args.lr) if args.distributed: opt = get_distributed_optimizer(opt) opt.minimize(model.loss) with fluid.program_guard(test_prog, fluid.Program()): with fluid.unique_name.guard(): model = PreGNNAttrmaskModel(model_config) model.forward(is_test=True) # Use CUDAPlace for GPU training, or use CPUPlace for CPU training. place = fluid.CUDAPlace(int(os.environ.get('FLAGS_selected_gpus', 0))) \ if args.use_cuda else fluid.CPUPlace() exe = fluid.Executor(place) exe.run(startup_prog) if not args.init_model is None and not args.init_model == "": load_partial_params(exe, args.init_model, train_prog) ### load data # PreGNNAttrMaskFeaturizer: # It is used along with `PreGNNAttrmaskModel`. It inherits from the super class `Featurizer` which is used for feature extractions. The `Featurizer` has two functions: `gen_features` for converting from a single raw smiles to a single graph data, `collate_fn` for aggregating a sublist of graph data into a big batch. # splitter: # split type of the dataset:random,scaffold,random with scaffold. Here is randomsplit. # `ScaffoldSplitter` will firstly order the compounds according to Bemis-Murcko scaffold, # then take the first `frac_train` proportion as the train set, the next `frac_valid` proportion as the valid set # and the rest as the test set. `ScaffoldSplitter` can better evaluate the generalization ability of the model on # out-of-distribution samples. Note that other splitters like `RandomSplitter`, `RandomScaffoldSplitter` # and `IndexSplitter` is also available." featurizer = PreGNNAttrMaskFeaturizer(model.graph_wrapper, atom_type_num=len( CompoundConstants.atom_num_list), mask_ratio=args.mask_ratio) dataset = load_zinc_dataset(args.data_path, featurizer=featurizer) splitter = RandomSplitter() train_dataset, _, test_dataset = splitter.split(dataset, frac_train=0.9, frac_valid=0, frac_test=0.1) if args.distributed: indices = list( range(fleet.worker_index(), len(train_dataset), fleet.worker_num())) train_dataset = train_dataset[indices] print("Train/Test num: %s/%s" % (len(train_dataset), len(test_dataset))) ### start train # Load the train function and calculate the train loss in each epoch. # Here we set the epoch is in range of max epoch,you can change it if you want. # Then we will calculate the train loss ,test loss and print them. # Finally we save the best epoch to the model according to the dataset. 
    list_test_loss = []
    for epoch_id in range(args.max_epoch):
        train_loss = train(args, exe, train_prog, model, train_dataset, featurizer)
        test_loss = evaluate(args, exe, test_prog, model, test_dataset, featurizer)
        if not args.distributed or fleet.worker_index() == 0:
            fluid.io.save_params(exe, '%s/epoch%s' % (args.model_dir, epoch_id), train_prog)
            list_test_loss.append(test_loss)
            print("epoch:%d train/loss:%s" % (epoch_id, train_loss))
            print("epoch:%d test/loss:%s" % (epoch_id, test_loss))

    if not args.distributed or fleet.worker_index() == 0:
        # The best epoch is the one with the lowest test loss.
        best_epoch_id = np.argmin(list_test_loss)
        fluid.io.load_params(exe, '%s/epoch%d' % (args.model_dir, best_epoch_id), train_prog)
        fluid.io.save_params(exe, '%s/epoch_best' % (args.model_dir), train_prog)
        return list_test_loss[best_epoch_id]
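# A minimal sketch of a command-line entrypoint for the attrmask pretraining
# `main` above. The flag names mirror the `args` attributes that `main` reads;
# the defaults are illustrative assumptions, not the repository's actual ones.
if __name__ == '__main__':
    import argparse
    parser = argparse.ArgumentParser(description='pretrain_attrmask')
    parser.add_argument("--use_cuda", action='store_true')
    parser.add_argument("--distributed", action='store_true')
    parser.add_argument("--data_path", type=str, required=True)
    parser.add_argument("--model_config", type=str, required=True)
    parser.add_argument("--init_model", type=str, default=None)
    parser.add_argument("--model_dir", type=str, required=True)
    parser.add_argument("--lr", type=float, default=0.001)
    parser.add_argument("--max_epoch", type=int, default=100)
    parser.add_argument("--dropout_rate", type=float, default=None)
    parser.add_argument("--mask_ratio", type=float, default=0.15)
    args = parser.parse_args()
    main(args)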