import os
import json

import numpy as np
import paddle.fluid as fluid
from paddle.fluid.incubate.fleet.collective import fleet
# Models, featurizers, dataset loaders, splitters and training helpers used
# below come from the surrounding project; their import paths are omitted.


def main(args):
    """Build the context-prediction pretraining model, load the ZINC dataset,
    train and evaluate it, and save the best checkpoint."""
    with open(args.model_config, 'r') as f:
        model_config = json.load(f)
    model_config['context_pooling'] = args.context_pooling

    ### build model
    train_prog = fluid.Program()
    test_prog = fluid.Program()
    startup_prog = fluid.Program()
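    # program_guard routes newly created ops and parameters into train_prog /
    # startup_prog; unique_name.guard resets parameter naming so the test
    # program built below creates identically named parameters and therefore
    # shares weights with the train program.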
    with fluid.program_guard(train_prog, startup_prog):
        with fluid.unique_name.guard():
            model = PreGNNContextpredModel(model_config)
            model.forward()
            opt = fluid.optimizer.Adam(learning_rate=args.lr)
            if args.distributed:
                opt = get_distributed_optimizer(opt)
            opt.minimize(model.loss)
    with fluid.program_guard(test_prog, fluid.Program()):
        with fluid.unique_name.guard():
            model = PreGNNContextpredModel(model_config)
            model.forward(is_test=True)

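    # FLAGS_selected_gpus is set by the PaddlePaddle distributed launcher;
    # single-card runs fall back to GPU 0.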
    place = fluid.CUDAPlace(int(os.environ.get('FLAGS_selected_gpus', 0))) \
            if args.use_cuda else fluid.CPUPlace()
    exe = fluid.Executor(place)
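    # Running startup_prog once initializes all parameters created above.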
    exe.run(startup_prog)

    if args.init_model is not None and args.init_model != "":
        load_partial_params(exe, args.init_model, train_prog)

    ### load data
    k = model_config['layer_num']
    l1 = k - 1
    l2 = l1 + args.context_size
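    # Context prediction: with a GNN of depth k, the substructure is the k-hop
    # neighborhood of a center atom, and the context is the ring of atoms
    # between l1 and l2 hops; the model learns to match their embeddings.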
    featurizer = PreGNNContextPredFeaturizer(
            model.substruct_graph_wrapper, 
            model.context_graph_wrapper, 
            k, l1, l2)
    dataset = load_zinc_dataset(args.data_path, featurizer=featurizer)

    splitter = RandomSplitter()
    train_dataset, _, test_dataset = splitter.split(
            dataset, frac_train=0.9, frac_valid=0, frac_test=0.1)
    if args.distributed:
        # Round-robin shard: worker i keeps samples i, i + worker_num,
        # i + 2 * worker_num, ... (start at worker_index, step by worker_num).
        indices = list(range(fleet.worker_index(), len(train_dataset), fleet.worker_num()))
        train_dataset = train_dataset[indices]
    print("Train/Test num: %s/%s" % (len(train_dataset), len(test_dataset)))

    ### start train
    list_test_loss = []
    for epoch_id in range(args.max_epoch):
        train_loss = train(args, exe, train_prog, model, train_dataset, featurizer)
        test_loss = evaluate(args, exe, test_prog, model, test_dataset, featurizer)
        if not args.distributed or fleet.worker_index() == 0:
            fluid.io.save_params(exe, '%s/epoch%s' % (args.model_dir, epoch_id), train_prog)
            list_test_loss.append(test_loss)
            print("epoch:%d train/loss:%s" % (epoch_id, train_loss))
            print("epoch:%d test/loss:%s" % (epoch_id, test_loss))

    if not args.distributed or fleet.worker_index() == 0:
        best_epoch_id = np.argmin(list_test_loss)  # lowest test loss wins
        fluid.io.load_params(exe, '%s/epoch%d' % (args.model_dir, best_epoch_id), train_prog)
        fluid.io.save_params(exe, '%s/epoch_best' % (args.model_dir), train_prog)
        return list_test_loss[best_epoch_id]
Example #2
def main(args):
    """
    Call the configuration function of the model, build the model and load data, then start training.

    model_config:
        a json file  with the  model configurations,such as dropout rate ,learning rate,num tasks and so on;

    lr:
        It means the learning rate of different optimizer;
    
    PreGNNAttrmaskModel:
        It is an unsupervised pretraining model which randomly masks the atom type of some node and then use the masked atom type as the prediction targets.
       
    """
    with open(args.model_config, 'r') as f:
        model_config = json.load(f)
    if args.dropout_rate is not None:
        model_config['dropout_rate'] = args.dropout_rate

    ### build model
    train_prog = fluid.Program()
    test_prog = fluid.Program()
    startup_prog = fluid.Program()
    with fluid.program_guard(train_prog, startup_prog):
        with fluid.unique_name.guard():
            model = PreGNNAttrmaskModel(model_config)
            model.forward()
            opt = fluid.optimizer.Adam(learning_rate=args.lr)
            if args.distributed:
                opt = get_distributed_optimizer(opt)
            opt.minimize(model.loss)
    with fluid.program_guard(test_prog, fluid.Program()):
        with fluid.unique_name.guard():
            model = PreGNNAttrmaskModel(model_config)
            model.forward(is_test=True)

    # Use CUDAPlace for GPU training, or use CPUPlace for CPU training.
    place = fluid.CUDAPlace(int(os.environ.get('FLAGS_selected_gpus', 0))) \
            if args.use_cuda else fluid.CPUPlace()
    exe = fluid.Executor(place)
    exe.run(startup_prog)

    if args.init_model is not None and args.init_model != "":
        load_partial_params(exe, args.init_model, train_prog)

    ### load data
    # PreGNNAttrMaskFeaturizer:
    #     used along with `PreGNNAttrmaskModel`. It inherits from the super
    #     class `Featurizer`, which is used for feature extraction. The
    #     `Featurizer` has two functions: `gen_features` converts a single raw
    #     SMILES string into a single graph datum, and `collate_fn` aggregates
    #     a sublist of graph data into a big batch.

    # splitter:
    #     split type of the dataset: random, scaffold, or random-with-scaffold;
    #     a random split is used here. `ScaffoldSplitter` first orders the
    #     compounds by Bemis-Murcko scaffold, then takes the first `frac_train`
    #     proportion as the train set, the next `frac_valid` proportion as the
    #     valid set and the rest as the test set. `ScaffoldSplitter` better
    #     evaluates the generalization ability of the model on
    #     out-of-distribution samples. Other splitters such as
    #     `RandomSplitter`, `RandomScaffoldSplitter` and `IndexSplitter` are
    #     also available; a scaffold-split sketch follows below.
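    # To swap in a scaffold split for out-of-distribution evaluation (a
    # sketch; assumes `ScaffoldSplitter` ships alongside `RandomSplitter`):
    #     splitter = ScaffoldSplitter()
    #     train_dataset, valid_dataset, test_dataset = splitter.split(
    #             dataset, frac_train=0.8, frac_valid=0.1, frac_test=0.1)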
    featurizer = PreGNNAttrMaskFeaturizer(model.graph_wrapper,
                                          atom_type_num=len(
                                              CompoundConstants.atom_num_list),
                                          mask_ratio=args.mask_ratio)
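    # atom_type_num is the size of the atom-type vocabulary used as prediction
    # targets; mask_ratio is the fraction of atoms whose types are masked out.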
    dataset = load_zinc_dataset(args.data_path, featurizer=featurizer)

    splitter = RandomSplitter()
    train_dataset, _, test_dataset = splitter.split(dataset,
                                                    frac_train=0.9,
                                                    frac_valid=0,
                                                    frac_test=0.1)
    if args.distributed:
        indices = list(
            range(fleet.worker_index(), len(train_dataset),
                  fleet.worker_num()))
        train_dataset = train_dataset[indices]
    print("Train/Test num: %s/%s" % (len(train_dataset), len(test_dataset)))

    ### start train
    # Train for `max_epoch` epochs; after each epoch compute and print the
    # train and test losses, and save the parameters of the epoch. Finally,
    # reload the epoch with the lowest test loss and save it as `epoch_best`.
    list_test_loss = []
    for epoch_id in range(args.max_epoch):
        train_loss = train(args, exe, train_prog, model, train_dataset,
                           featurizer)
        test_loss = evaluate(args, exe, test_prog, model, test_dataset,
                             featurizer)
        if not args.distributed or fleet.worker_index() == 0:
            fluid.io.save_params(exe,
                                 '%s/epoch%s' % (args.model_dir, epoch_id),
                                 train_prog)
            list_test_loss.append(test_loss)
            print("epoch:%d train/loss:%s" % (epoch_id, train_loss))
            print("epoch:%d test/loss:%s" % (epoch_id, test_loss))

    if not args.distributed or fleet.worker_index() == 0:
        best_epoch_id = np.argmin(list_test_loss)
        fluid.io.load_params(exe,
                             '%s/epoch%d' % (args.model_dir, best_epoch_id),
                             train_prog)
        fluid.io.save_params(exe, '%s/epoch_best' % (args.model_dir),
                             train_prog)
        return list_test_loss[best_epoch_id]
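

# A minimal driver sketch (not part of the original examples): flag names are
# inferred from the `args.*` attributes read above; defaults are illustrative.
if __name__ == '__main__':
    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument('--model_config', required=True)
    parser.add_argument('--data_path', required=True)
    parser.add_argument('--model_dir', required=True)
    parser.add_argument('--init_model', default=None)
    parser.add_argument('--lr', type=float, default=0.001)
    parser.add_argument('--dropout_rate', type=float, default=None)
    parser.add_argument('--mask_ratio', type=float, default=0.15)
    parser.add_argument('--max_epoch', type=int, default=100)
    parser.add_argument('--use_cuda', action='store_true')
    parser.add_argument('--distributed', action='store_true')
    main(parser.parse_args())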