Example #1
def main():
    label_names = D.get_tox21_label_names()

    parser = argparse.ArgumentParser(
        description='Inference with a trained model.')
    parser.add_argument('--in-dir',
                        '-i',
                        type=str,
                        default='result',
                        help='path to the result directory of training')
    parser.add_argument('--batchsize',
                        '-b',
                        type=int,
                        default=128,
                        help='batch size')
    parser.add_argument('--gpu',
                        '-g',
                        type=int,
                        default=-1,
                        help='GPU ID to use. Negative value indicates '
                        'not to use GPU and to run the code on CPU.')
    args = parser.parse_args()

    with open(os.path.join(args.in_dir, 'config.json'), 'r') as i:
        config = json.loads(i.read())

    method = config['method']
    if method == 'schnet':
        raise ValueError('Currently SchNet does not support prediction.')

    labels = config['labels']
    if labels:
        class_num = len(labels) if isinstance(labels, list) else 1
    else:
        class_num = len(label_names)

    _, _, test = data.load_dataset(method, labels)

    predictor_ = predictor.build_predictor(method, config['unit_num'],
                                           config['conv_layers'], class_num)

    if args.gpu >= 0:
        chainer.cuda.get_device_from_id(args.gpu).use()
        predictor_.to_gpu()

    inference_loop = predictor.InferenceLoop(predictor_)

    y_pred = inference_loop.inference(test)
    numpy.save('prediction.npy', y_pred)
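
Each example shows only `main()`; the snippets assume module-level imports along the following lines. This is a sketch: `data` and `predictor` are modules local to the original Tox21 example directory, and the chainer_chemistry import paths are assumptions that may vary between library versions (later examples also use `np`, `cuda`, `NumpyTupleDataset`, `Classifier`, and related names following the same pattern).

# A sketch of the imports assumed by these snippets. `data` and `predictor`
# are local modules of the original Tox21 example; the chainer_chemistry
# paths are assumptions and may differ across library versions.
import argparse
import json
import os

import numpy
import six

import chainer
from chainer import cuda
from chainer import functions as F
from chainer import iterators as I
from chainer import links as L
from chainer import optimizers as O
from chainer import training
from chainer.training import extensions as E

from chainer_chemistry import datasets as D
from chainer_chemistry.dataset.converters import concat_mols

import data       # local module: Tox21 dataset loading helpers
import predictor  # local module: build_predictor, InferenceLoop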
Example #2
def main():
    label_names = D.get_tox21_label_names()

    parser = argparse.ArgumentParser(
        description='Inference with a trained model.')
    parser.add_argument('--in-dir', '-i', type=str, default='result',
                        help='Path to the result directory of the training '
                        'script.')
    parser.add_argument('--trainer-snapshot', '-s', type=str, default='',
                        help='Path to the snapshot file of the Chainer '
                        'trainer from which serialized model parameters '
                        'are extracted. If it is not specified, this '
                        'script searches the training result directory '
                        'for the latest snapshot, assuming that '
                        'the naming convention of snapshot files is '
                        '`snapshot_iter_N` where N is the number of '
                        'iterations, which is the default configuration '
                        'of Chainer.')
    parser.add_argument('--batchsize', '-b', type=int, default=128,
                        help='batch size')
    parser.add_argument('--gpu', '-g', type=int, default=-1,
                        help='GPU ID to use. Negative value indicates '
                        'not to use GPU and to run the code on CPU.')
    args = parser.parse_args()

    with open(os.path.join(args.in_dir, 'config.json'), 'r') as i:
        config = json.loads(i.read())

    method = config['method']
    if method == 'schnet':
        raise ValueError('Currently SchNet does not support prediction.')

    labels = config['labels']
    if labels:
        class_num = len(labels) if isinstance(labels, list) else 1
    else:
        class_num = len(label_names)

    _, test, _ = data.load_dataset(method, labels)
    test = test.get_datasets()
    X_test = D.NumpyTupleDataset(*test[:-1])
    y_test = test[-1]

    # Load pretrained model
    predictor_ = predictor.build_predictor(
        method, config['unit_num'], config['conv_layers'], class_num)
    snapshot_file = args.trainer_snapshot
    if not snapshot_file:
        snapshot_file = _find_latest_snapshot(args.in_dir)
    print('Loading pretrained model parameters from {}'.format(snapshot_file))
    chainer.serializers.load_npz(snapshot_file,
                                 predictor_, 'updater/model:main/predictor/')

    if args.gpu >= 0:
        chainer.cuda.get_device_from_id(args.gpu).use()
        predictor_.to_gpu()

    inference_loop = predictor.InferenceLoop(predictor_)
    y_pred = inference_loop.inference(X_test)
    if y_pred.shape != y_test.shape:
        raise RuntimeError('The shape of the prediction result array and '
                           'that of the ground truth array do not match. '
                           'Contents of the input directory may be corrupted '
                           'or modified.')

    if y_pred.ndim == 1:
        y_pred = y_pred[:, None]
        y_test = y_test[:, None]

    statistics = []
    for t, p in six.moves.zip(y_test.T, y_pred.T):
        idx = t != -1
        n_correct = (t[idx] == p[idx]).sum()
        n_total = len(t[idx])
        accuracy = float(n_correct) / n_total
        statistics.append([n_correct, n_total, accuracy])

    print('{:>6} {:>8} {:>8} {:>8}'
          .format('TaskID', 'Correct', 'Total', 'Accuracy'))
    for idx, (n_correct, n_total, accuracy) in enumerate(statistics):
        print('task{:>2} {:>8} {:>8} {:>8.4f}'
              .format(idx, n_correct, n_total, accuracy))

    prediction_result_file = 'prediction.npz'
    print('Save prediction result to {}'.format(prediction_result_file))
    numpy.savez_compressed(prediction_result_file, y_pred)
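
Examples #2 and #7 call a `_find_latest_snapshot` helper that is not shown. A minimal sketch, assuming the `snapshot_iter_N` naming convention described in the `--trainer-snapshot` help text (the helper in the original script may differ):

import os
import re


def _find_latest_snapshot(result_dir):
    # Return the path of the `snapshot_iter_N` file with the largest N.
    pattern = re.compile(r'^snapshot_iter_(\d+)$')
    candidates = [(int(m.group(1)), name)
                  for name in os.listdir(result_dir)
                  for m in [pattern.match(name)] if m]
    if not candidates:
        raise ValueError('No snapshot file found in {}'.format(result_dir))
    return os.path.join(result_dir, max(candidates)[1])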
Example #3
def main():
    # Supported preprocessing/network list
    method_list = ['nfp', 'ggnn', 'schnet', 'weavenet', 'rsgcn', 'attention']
    label_names = D.get_tox21_label_names()
    iterator_type = ['serial', 'balanced']

    parser = argparse.ArgumentParser(
        description='Multitask Learning with Tox21.')
    parser.add_argument('--method',
                        '-m',
                        type=str,
                        choices=method_list,
                        default='nfp',
                        help='graph convolution model to use '
                        'as a predictor.')
    parser.add_argument('--label',
                        '-l',
                        type=str,
                        choices=label_names,
                        default='',
                        help='target label for logistic '
                        'regression. Use all labels if this option '
                        'is not specified.')
    parser.add_argument('--iterator-type',
                        type=str,
                        choices=iterator_type,
                        default='serial',
                        help='iterator type. If `balanced` '
                        'is specified, data is sampled to take the same '
                        'number of positive/negative labels during training.')
    parser.add_argument('--eval-mode',
                        type=int,
                        default=1,
                        help='Evaluation mode. '
                        '0: only binary_accuracy is calculated. '
                        '1: both binary_accuracy and the ROC-AUC score '
                        'are calculated.')
    parser.add_argument('--conv-layers',
                        '-c',
                        type=int,
                        default=4,
                        help='number of convolution layers')
    parser.add_argument('--batchsize',
                        '-b',
                        type=int,
                        default=32,
                        help='batch size')
    parser.add_argument('--gpu',
                        '-g',
                        type=int,
                        default=-1,
                        help='GPU ID to use. Negative value indicates '
                        'not to use GPU and to run the code on CPU.')
    parser.add_argument('--out',
                        '-o',
                        type=str,
                        default='result',
                        help='path to output directory')
    parser.add_argument('--epoch',
                        '-e',
                        type=int,
                        default=10,
                        help='number of epochs')
    parser.add_argument('--unit-num',
                        '-u',
                        type=int,
                        default=16,
                        help='number of units in one layer of the model')
    parser.add_argument('--resume',
                        '-r',
                        type=str,
                        default='',
                        help='path to a trainer snapshot')
    parser.add_argument('--frequency',
                        '-f',
                        type=int,
                        default=-1,
                        help='Frequency of taking a snapshot')
    parser.add_argument('--protocol',
                        type=int,
                        default=2,
                        help='protocol version for pickle')
    parser.add_argument('--model-filename',
                        type=str,
                        default='classifier.pkl',
                        help='file name for pickled model')
    parser.add_argument('--num-data',
                        type=int,
                        default=-1,
                        help='Number of data points to parse from the '
                        'dataset. -1 indicates parsing all data.')
    args = parser.parse_args()

    method = args.method
    if args.label:
        labels = args.label
        class_num = len(labels) if isinstance(labels, list) else 1
    else:
        labels = None
        class_num = len(label_names)

    # Dataset preparation
    print("labels : ", labels)
    print("num_data : ", args.num_data)
    train, val, _ = data.load_dataset(method, labels, num_data=args.num_data)
    # Shape notes from the original author (Tox21 train split):
    #   len(train) == 11757 (number of molecules)
    #   len(train[0]) == 3 (atom array, adjacency array, label array)
    #   len(train[0][0]) == 34, len(train[0][1]) == 34,
    #   len(train[0][1][0]) == 34 (padded per-molecule atom counts)
    #   (the label array holds the 12 Tox21 task labels)

    # Network
    predictor_ = predictor.build_predictor(method, args.unit_num,
                                           args.conv_layers, class_num)

    iterator_type = args.iterator_type
    if iterator_type == 'serial':
        train_iter = I.SerialIterator(train, args.batchsize)
    elif iterator_type == 'balanced':
        if class_num > 1:
            raise ValueError('BalancedSerialIterator can only be used for '
                             'single-label classification. Please specify '
                             'the label to predict with the --label option.')
        train_iter = BalancedSerialIterator(train,
                                            args.batchsize,
                                            train.features[:, -1],
                                            ignore_labels=-1)
        train_iter.show_label_stats()
    else:
        raise ValueError('Invalid iterator type {}'.format(iterator_type))
    val_iter = I.SerialIterator(val,
                                args.batchsize,
                                repeat=False,
                                shuffle=False)

    classifier = Classifier(predictor_,
                            lossfun=F.sigmoid_cross_entropy,
                            metrics_fun=F.binary_accuracy,
                            device=args.gpu)

    optimizer = O.Adam()
    optimizer.setup(classifier)

    updater = training.StandardUpdater(train_iter,
                                       optimizer,
                                       device=args.gpu,
                                       converter=concat_mols)
    trainer = training.Trainer(updater, (args.epoch, 'epoch'), out=args.out)

    trainer.extend(
        E.Evaluator(val_iter,
                    classifier,
                    device=args.gpu,
                    converter=concat_mols))
    trainer.extend(E.LogReport())

    eval_mode = args.eval_mode
    if eval_mode == 0:
        trainer.extend(
            E.PrintReport([
                'epoch', 'main/loss', 'main/accuracy', 'validation/main/loss',
                'validation/main/accuracy', 'elapsed_time'
            ]))
    elif eval_mode == 1:
        train_eval_iter = I.SerialIterator(train,
                                           args.batchsize,
                                           repeat=False,
                                           shuffle=False)
        trainer.extend(
            ROCAUCEvaluator(train_eval_iter,
                            classifier,
                            eval_func=predictor_,
                            device=args.gpu,
                            converter=concat_mols,
                            name='train',
                            pos_labels=1,
                            ignore_labels=-1,
                            raise_value_error=False))
        # extension name='validation' is already used by `Evaluator`,
        # instead extension name `val` is used.
        trainer.extend(
            ROCAUCEvaluator(val_iter,
                            classifier,
                            eval_func=predictor_,
                            device=args.gpu,
                            converter=concat_mols,
                            name='val',
                            pos_labels=1,
                            ignore_labels=-1))
        trainer.extend(
            E.PrintReport([
                'epoch', 'main/loss', 'main/accuracy', 'train/main/roc_auc',
                'validation/main/loss', 'validation/main/accuracy',
                'val/main/roc_auc', 'elapsed_time'
            ]))
    else:
        raise ValueError('Invalid eval_mode {}'.format(eval_mode))
    trainer.extend(E.ProgressBar(update_interval=10))
    frequency = args.epoch if args.frequency == -1 else max(1, args.frequency)
    trainer.extend(E.snapshot(), trigger=(frequency, 'epoch'))

    if args.resume:
        chainer.serializers.load_npz(args.resume, trainer)

    trainer.run()

    config = {
        'method': args.method,
        'conv_layers': args.conv_layers,
        'unit_num': args.unit_num,
        'labels': args.label
    }

    with open(os.path.join(args.out, 'config.json'), 'w') as o:
        o.write(json.dumps(config))

    classifier.save_pickle(os.path.join(args.out, args.model_filename),
                           protocol=args.protocol)
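
A hypothetical invocation of Example #3 (the script name is an assumption; `NR-AR` is one of the twelve Tox21 task names, and the balanced iterator requires a single label):

$ python train_tox21.py --method nfp --label NR-AR --iterator-type balanced --eval-mode 1 --out result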
Example #4
def main():
    # Supported preprocessing/network list
    method_list = ['nfp', 'ggnn', 'schnet', 'weavenet', 'rsgcn']
    label_names = D.get_tox21_label_names()
    iterator_type = ['serial', 'balanced']

    parser = argparse.ArgumentParser(
        description='Multitask Learning with Tox21.')
    parser.add_argument('--method',
                        '-m',
                        type=str,
                        choices=method_list,
                        default='nfp',
                        help='graph convolution model to use '
                        'as a predictor.')
    parser.add_argument('--label',
                        '-l',
                        type=str,
                        choices=label_names,
                        default='',
                        help='target label for logistic '
                        'regression. Use all labels if this option '
                        'is not specified.')
    parser.add_argument('--iterator-type',
                        type=str,
                        choices=iterator_type,
                        default='serial',
                        help='iterator type. If `balanced` '
                        'is specified, data is sampled to take the same '
                        'number of positive/negative labels during training.')
    parser.add_argument('--conv-layers',
                        '-c',
                        type=int,
                        default=4,
                        help='number of convolution layers')
    parser.add_argument('--batchsize',
                        '-b',
                        type=int,
                        default=128,
                        help='batch size')
    parser.add_argument('--gpu',
                        '-g',
                        type=int,
                        default=-1,
                        help='GPU ID to use. Negative value indicates '
                        'not to use GPU and to run the code on CPU.')
    parser.add_argument('--out',
                        '-o',
                        type=str,
                        default='result',
                        help='path to output directory')
    parser.add_argument('--epoch',
                        '-e',
                        type=int,
                        default=10,
                        help='number of epochs')
    parser.add_argument('--unit-num',
                        '-u',
                        type=int,
                        default=16,
                        help='number of units in one layer of the model')
    parser.add_argument('--resume',
                        '-r',
                        type=str,
                        default='',
                        help='path to a trainer snapshot')
    parser.add_argument('--frequency',
                        '-f',
                        type=int,
                        default=-1,
                        help='Frequency of taking a snapshot')
    args = parser.parse_args()

    method = args.method
    if args.label:
        labels = args.label
        class_num = len(labels) if isinstance(labels, list) else 1
    else:
        labels = None
        class_num = len(label_names)

    # Dataset preparation
    train, val, _ = data.load_dataset(method, labels)

    # Network
    predictor_ = predictor.build_predictor(method, args.unit_num,
                                           args.conv_layers, class_num)

    iterator_type = args.iterator_type
    if iterator_type == 'serial':
        train_iter = I.SerialIterator(train, args.batchsize)
    elif iterator_type == 'balanced':
        if class_num > 1:
            raise ValueError('BalancedSerialIterator can only be used for '
                             'single-label classification. Please specify '
                             'the label to predict with the --label option.')
        train_iter = BalancedSerialIterator(train,
                                            args.batchsize,
                                            train.features[:, -1],
                                            ignore_labels=-1)
        train_iter.show_label_stats()
    else:
        raise ValueError('Invalid iterator type {}'.format(iterator_type))
    val_iter = I.SerialIterator(val,
                                args.batchsize,
                                repeat=False,
                                shuffle=False)
    classifier = L.Classifier(predictor_,
                              lossfun=F.sigmoid_cross_entropy,
                              accfun=F.binary_accuracy)
    if args.gpu >= 0:
        chainer.cuda.get_device_from_id(args.gpu).use()
        classifier.to_gpu()

    optimizer = O.Adam()
    optimizer.setup(classifier)

    updater = training.StandardUpdater(train_iter,
                                       optimizer,
                                       device=args.gpu,
                                       converter=concat_mols)
    trainer = training.Trainer(updater, (args.epoch, 'epoch'), out=args.out)

    trainer.extend(
        E.Evaluator(val_iter,
                    classifier,
                    device=args.gpu,
                    converter=concat_mols))
    trainer.extend(E.LogReport())
    trainer.extend(
        E.PrintReport([
            'epoch', 'main/loss', 'main/accuracy', 'validation/main/loss',
            'validation/main/accuracy', 'elapsed_time'
        ]))
    trainer.extend(E.ProgressBar(update_interval=10))
    frequency = args.epoch if args.frequency == -1 else max(1, args.frequency)
    trainer.extend(E.snapshot(), trigger=(frequency, 'epoch'))

    if args.resume:
        chainer.serializers.load_npz(args.resume, trainer)

    trainer.run()

    config = {
        'method': args.method,
        'conv_layers': args.conv_layers,
        'unit_num': args.unit_num,
        'labels': args.label
    }

    with open(os.path.join(args.out, 'config.json'), 'w') as o:
        o.write(json.dumps(config))
Example #5
def main():
    # Supported preprocessing/network list
    method_list = ['nfp', 'ggnn', 'schnet', 'weavenet', 'nfpdrop', 'ggnndrop']
    label_names = D.get_tox21_label_names() + ['pyridine']
    iterator_type = ['serial', 'balanced']

    parser = argparse.ArgumentParser(
        description='Multitask Learning with Tox21.')
    parser.add_argument('--method',
                        '-m',
                        type=str,
                        choices=method_list,
                        default='nfp',
                        help='graph convolution model to use '
                        'as a predictor.')
    parser.add_argument('--label',
                        '-l',
                        type=str,
                        choices=label_names,
                        default='',
                        help='target label for logistic '
                        'regression. Use all labels if this option '
                        'is not specified.')
    parser.add_argument('--iterator-type',
                        type=str,
                        choices=iterator_type,
                        default='serial',
                        help='iterator type. If `balanced` '
                        'is specified, data is sampled to take the same '
                        'number of positive/negative labels during training.')
    parser.add_argument('--conv-layers',
                        '-c',
                        type=int,
                        default=4,
                        help='number of convolution layers')
    parser.add_argument('--n-layers',
                        type=int,
                        default=1,
                        help='number of mlp layers')
    parser.add_argument('--batchsize',
                        '-b',
                        type=int,
                        default=32,
                        help='batch size')
    parser.add_argument('--gpu',
                        '-g',
                        type=int,
                        default=-1,
                        help='GPU ID to use. Negative value indicates '
                        'not to use GPU and to run the code on CPU.')
    parser.add_argument('--out',
                        '-o',
                        type=str,
                        default='result',
                        help='path to output directory')
    parser.add_argument('--epoch',
                        '-e',
                        type=int,
                        default=10,
                        help='number of epochs')
    parser.add_argument('--unit-num',
                        '-u',
                        type=int,
                        default=16,
                        help='number of units in one layer of the model')
    parser.add_argument('--resume',
                        '-r',
                        type=str,
                        default='',
                        help='path to a trainer snapshot')
    parser.add_argument('--frequency',
                        '-f',
                        type=int,
                        default=-1,
                        help='Frequency of taking a snapshot')
    parser.add_argument('--dropout-ratio',
                        '-d',
                        type=float,
                        default=0.25,
                        help='dropout_ratio')
    parser.add_argument('--seed', type=int, default=0, help='random seed')
    parser.add_argument('--num-train',
                        type=int,
                        default=-1,
                        help='number of training data to be used, '
                        'negative value indicates use all train data')
    args = parser.parse_args()

    method = args.method
    if args.label:
        labels = args.label
        class_num = len(labels) if isinstance(labels, list) else 1
    else:
        labels = None
        class_num = len(label_names)

    # Dataset preparation
    train, val, test, train_smiles, val_smiles, test_smiles = data.load_dataset(
        method, labels)

    num_train = args.num_train
    if num_train > 0:
        # reduce size of train data
        seed = args.seed
        np.random.seed(seed)
        train_selected_label = np.random.permutation(np.arange(
            len(train)))[:num_train]
        print('num_train', num_train, len(train_selected_label),
              train_selected_label)
        train = NumpyTupleDataset(*train.features[train_selected_label, :])
    # Network
    predictor_ = predictor.build_predictor(method, args.unit_num,
                                           args.conv_layers, class_num,
                                           args.dropout_ratio, args.n_layers)

    iterator_type = args.iterator_type
    if iterator_type == 'serial':
        train_iter = I.SerialIterator(train, args.batchsize)
    elif iterator_type == 'balanced':
        if class_num > 1:
            raise ValueError('BalancedSerialIterator can only be used for '
                             'single-label classification. Please specify '
                             'the label to predict with the --label option.')
        train_iter = BalancedSerialIterator(train,
                                            args.batchsize,
                                            train.features[:, -1],
                                            ignore_labels=-1)
        train_iter.show_label_stats()
    else:
        raise ValueError('Invalid iterator type {}'.format(iterator_type))
    val_iter = I.SerialIterator(val,
                                args.batchsize,
                                repeat=False,
                                shuffle=False)
    classifier = L.Classifier(predictor_,
                              lossfun=F.sigmoid_cross_entropy,
                              accfun=F.binary_accuracy)
    if args.gpu >= 0:
        chainer.cuda.get_device_from_id(args.gpu).use()
        classifier.to_gpu()

    optimizer = O.Adam()
    optimizer.setup(classifier)

    updater = training.StandardUpdater(train_iter,
                                       optimizer,
                                       device=args.gpu,
                                       converter=concat_mols)
    trainer = training.Trainer(updater, (args.epoch, 'epoch'), out=args.out)

    trainer.extend(
        E.Evaluator(val_iter,
                    classifier,
                    device=args.gpu,
                    converter=concat_mols))
    trainer.extend(E.LogReport())

    # --- ROCAUC Evaluator ---
    train_eval_iter = I.SerialIterator(train,
                                       args.batchsize,
                                       repeat=False,
                                       shuffle=False)
    trainer.extend(
        ROCAUCEvaluator(train_eval_iter,
                        classifier,
                        eval_func=predictor_,
                        device=args.gpu,
                        converter=concat_mols,
                        name='train'))
    trainer.extend(
        ROCAUCEvaluator(val_iter,
                        classifier,
                        eval_func=predictor_,
                        device=args.gpu,
                        converter=concat_mols,
                        name='val'))
    trainer.extend(
        E.PrintReport([
            'epoch', 'main/loss', 'main/accuracy', 'train/main/roc_auc',
            'validation/main/loss', 'validation/main/accuracy',
            'val/main/roc_auc', 'elapsed_time'
        ]))

    trainer.extend(E.ProgressBar(update_interval=10))
    if args.resume:
        chainer.serializers.load_npz(args.resume, trainer)

    trainer.run()

    with open(os.path.join(args.out, 'args.json'), 'w') as f:
        json.dump(vars(args), f, indent=4)
    chainer.serializers.save_npz(os.path.join(args.out, 'predictor.npz'),
                                 predictor_)
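
To reuse the weights that Example #5 saves with `save_npz`, the network must be rebuilt with the same hyperparameters before loading. A minimal sketch, assuming the argparse defaults above:

# Rebuild the predictor with the same arguments used for training
# (method, unit_num, conv_layers, class_num, dropout_ratio, n_layers);
# the values below are the defaults of the script above.
predictor_ = predictor.build_predictor('nfp', 16, 4, class_num, 0.25, 1)
chainer.serializers.load_npz('result/predictor.npz', predictor_)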
Example #6
def main():
    # Supported preprocessing/network list
    method_list = ['nfp', 'ggnn', 'schnet', 'weavenet', 'rsgcn', 'relgcn',
                   'relgat']
    label_names = D.get_tox21_label_names()
    iterator_type = ['serial', 'balanced']

    parser = argparse.ArgumentParser(
        description='Multitask Learning with Tox21.')
    parser.add_argument('--method', '-m', type=str, choices=method_list,
                        default='nfp', help='graph convolution model to use '
                        'as a predictor.')
    parser.add_argument('--label', '-l', type=str, choices=label_names,
                        default='', help='target label for logistic '
                        'regression. Use all labels if this option '
                        'is not specified.')
    parser.add_argument('--iterator-type', type=str, choices=iterator_type,
                        default='serial', help='iterator type. If `balanced` '
                        'is specified, data is sampled to take the same '
                        'number of positive/negative labels during training.')
    parser.add_argument('--eval-mode', type=int, default=1,
                        help='Evaluation mode. '
                        '0: only binary_accuracy is calculated. '
                        '1: both binary_accuracy and the ROC-AUC score '
                        'are calculated.')
    parser.add_argument('--conv-layers', '-c', type=int, default=4,
                        help='number of convolution layers')
    parser.add_argument('--batchsize', '-b', type=int, default=32,
                        help='batch size')
    parser.add_argument('--gpu', '-g', type=int, default=-1,
                        help='GPU ID to use. Negative value indicates '
                        'not to use GPU and to run the code on CPU.')
    parser.add_argument('--out', '-o', type=str, default='result',
                        help='path to output directory')
    parser.add_argument('--epoch', '-e', type=int, default=10,
                        help='number of epochs')
    parser.add_argument('--unit-num', '-u', type=int, default=16,
                        help='number of units in one layer of the model')
    parser.add_argument('--resume', '-r', type=str, default='',
                        help='path to a trainer snapshot')
    parser.add_argument('--frequency', '-f', type=int, default=-1,
                        help='Frequency of taking a snapshot')
    parser.add_argument('--protocol', type=int, default=2,
                        help='protocol version for pickle')
    parser.add_argument('--model-filename', type=str, default='classifier.pkl',
                        help='file name for pickled model')
    parser.add_argument('--num-data', type=int, default=-1,
                        help='Number of data points to parse from the '
                             'dataset. -1 indicates parsing all data.')
    args = parser.parse_args()

    method = args.method
    if args.label:
        labels = args.label
        class_num = len(labels) if isinstance(labels, list) else 1
    else:
        labels = None
        class_num = len(label_names)

    # Dataset preparation
    train, val, _ = data.load_dataset(method, labels, num_data=args.num_data)

    # Network
    predictor_ = predictor.build_predictor(
        method, args.unit_num, args.conv_layers, class_num)

    iterator_type = args.iterator_type
    if iterator_type == 'serial':
        train_iter = I.SerialIterator(train, args.batchsize)
    elif iterator_type == 'balanced':
        if class_num > 1:
            raise ValueError('BalancedSerialIterator can only be used for '
                             'single-label classification. Please specify '
                             'the label to predict with the --label option.')
        train_iter = BalancedSerialIterator(
            train, args.batchsize, train.features[:, -1], ignore_labels=-1)
        train_iter.show_label_stats()
    else:
        raise ValueError('Invalid iterator type {}'.format(iterator_type))
    val_iter = I.SerialIterator(val, args.batchsize,
                                repeat=False, shuffle=False)

    classifier = Classifier(predictor_,
                            lossfun=F.sigmoid_cross_entropy,
                            metrics_fun=F.binary_accuracy,
                            device=args.gpu)

    optimizer = O.Adam()
    optimizer.setup(classifier)

    updater = training.StandardUpdater(
        train_iter, optimizer, device=args.gpu, converter=concat_mols)
    trainer = training.Trainer(updater, (args.epoch, 'epoch'), out=args.out)

    trainer.extend(E.Evaluator(val_iter, classifier,
                               device=args.gpu, converter=concat_mols))
    trainer.extend(E.LogReport())

    eval_mode = args.eval_mode
    if eval_mode == 0:
        trainer.extend(E.PrintReport([
            'epoch', 'main/loss', 'main/accuracy', 'validation/main/loss',
            'validation/main/accuracy', 'elapsed_time']))
    elif eval_mode == 1:
        train_eval_iter = I.SerialIterator(train, args.batchsize,
                                           repeat=False, shuffle=False)
        trainer.extend(ROCAUCEvaluator(
            train_eval_iter, classifier, eval_func=predictor_,
            device=args.gpu, converter=concat_mols, name='train',
            pos_labels=1, ignore_labels=-1, raise_value_error=False))
        # extension name='validation' is already used by `Evaluator`,
        # instead extension name `val` is used.
        trainer.extend(ROCAUCEvaluator(
            val_iter, classifier, eval_func=predictor_,
            device=args.gpu, converter=concat_mols, name='val',
            pos_labels=1, ignore_labels=-1))
        trainer.extend(E.PrintReport([
            'epoch', 'main/loss', 'main/accuracy', 'train/main/roc_auc',
            'validation/main/loss', 'validation/main/accuracy',
            'val/main/roc_auc', 'elapsed_time']))
    else:
        raise ValueError('Invalid eval_mode {}'.format(eval_mode))
    trainer.extend(E.ProgressBar(update_interval=10))
    frequency = args.epoch if args.frequency == -1 else max(1, args.frequency)
    trainer.extend(E.snapshot(), trigger=(frequency, 'epoch'))

    if args.resume:
        chainer.serializers.load_npz(args.resume, trainer)

    trainer.run()

    config = {'method': args.method,
              'conv_layers': args.conv_layers,
              'unit_num': args.unit_num,
              'labels': args.label}

    with open(os.path.join(args.out, 'config.json'), 'w') as o:
        o.write(json.dumps(config))

    classifier.save_pickle(os.path.join(args.out, args.model_filename),
                           protocol=args.protocol)
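
The classifier pickled at the end of Example #6 can be restored in one step, assuming `Classifier` here is chainer_chemistry's `models.prediction.Classifier`, which pairs `save_pickle` with a `load_pickle` classmethod:

# A sketch of restoring the pickled classifier for inference
# (device=-1 selects the CPU; the import path is an assumption).
from chainer_chemistry.models.prediction import Classifier

clf = Classifier.load_pickle('result/classifier.pkl', device=-1)
# proba = clf.predict_proba(test, converter=concat_mols)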
Example #7
def main():
    label_names = D.get_tox21_label_names()

    parser = argparse.ArgumentParser(
        description='Predict with a trained model.')
    parser.add_argument('--in-dir',
                        '-i',
                        type=str,
                        default='result',
                        help='Path to the result directory of the training '
                        'script.')
    parser.add_argument('--trainer-snapshot',
                        '-s',
                        type=str,
                        default='',
                        help='Path to the snapshot file of the Chainer '
                        'trainer from which serialized model parameters '
                        'are extracted. If it is not specified, this '
                        'script searches the training result directory '
                        'for the latest snapshot, assuming that '
                        'the naming convention of snapshot files is '
                        '`snapshot_iter_N` where N is the number of '
                        'iterations, which is the default configuration '
                        'of Chainer.')
    parser.add_argument('--batchsize',
                        '-b',
                        type=int,
                        default=128,
                        help='batch size')
    parser.add_argument('--gpu',
                        '-g',
                        type=int,
                        default=-1,
                        help='GPU ID to use. Negative value indicates '
                        'not to use GPU and to run the code on CPU.')
    args = parser.parse_args()

    with open(os.path.join(args.in_dir, 'config.json'), 'r') as i:
        config = json.loads(i.read())

    method = config['method']
    labels = config['labels']
    if labels:
        class_num = len(labels) if isinstance(labels, list) else 1
    else:
        class_num = len(label_names)

    _, test, _ = data.load_dataset(method, labels)
    y_test = test.get_datasets()[-1]

    # Load pretrained model
    predictor_ = predictor.build_predictor(method, config['unit_num'],
                                           config['conv_layers'], class_num)
    snapshot_file = args.trainer_snapshot
    if not snapshot_file:
        snapshot_file = _find_latest_snapshot(args.in_dir)
    print('Loading pretrained model parameters from {}'.format(snapshot_file))
    chainer.serializers.load_npz(snapshot_file, predictor_,
                                 'updater/model:main/predictor/')

    clf = Classifier(predictor=predictor_,
                     device=args.gpu,
                     lossfun=F.sigmoid_cross_entropy,
                     metrics_fun=F.binary_accuracy)

    # ---- predict ---
    print('Predicting...')

    # We need to feed only input features `x` to `predict`/`predict_proba`.
    # This converter extracts only inputs (x1, x2, ...) from the features which
    # consist of input `x` and label `t` (x1, x2, ..., t).
    def extract_inputs(batch, device=None):
        return concat_mols(batch, device=device)[:-1]

    def postprocess_pred(x):
        x_array = cuda.to_cpu(x.data)
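        # A raw score above 0 corresponds to sigmoid(score) > 0.5, so this
        # thresholds the predicted probability of toxicity at 0.5.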
        return numpy.where(x_array > 0, 1, 0)

    y_pred = clf.predict(test,
                         converter=extract_inputs,
                         postprocess_fn=postprocess_pred)
    y_proba = clf.predict_proba(test,
                                converter=extract_inputs,
                                postprocess_fn=F.sigmoid)

    # `predict` method returns the prediction label (0: non-toxic, 1:toxic)
    print('y_pred.shape = {}, y_pred[:5, 0] = {}'.format(
        y_pred.shape, y_pred[:5, 0]))
    # `predict_proba` method returns the probability to be toxic
    print('y_proba.shape = {}, y_proba[:5, 0] = {}'.format(
        y_proba.shape, y_proba[:5, 0]))
    # --- predict end ---

    if y_pred.ndim == 1:
        y_pred = y_pred[:, None]

    if y_pred.shape != y_test.shape:
        raise RuntimeError('The shape of the prediction result array and '
                           'that of the ground truth array do not match. '
                           'Contents of the input directory may be corrupted '
                           'or modified.')

    statistics = []
    for t, p in six.moves.zip(y_test.T, y_pred.T):
        idx = t != -1
        n_correct = (t[idx] == p[idx]).sum()
        n_total = len(t[idx])
        accuracy = float(n_correct) / n_total
        statistics.append([n_correct, n_total, accuracy])

    print('{:>6} {:>8} {:>8} {:>8}'.format('TaskID', 'Correct', 'Total',
                                           'Accuracy'))
    for idx, (n_correct, n_total, accuracy) in enumerate(statistics):
        print('task{:>2} {:>8} {:>8} {:>8.4f}'.format(idx, n_correct, n_total,
                                                      accuracy))

    prediction_result_file = 'prediction.npz'
    print('Save prediction result to {}'.format(prediction_result_file))
    numpy.savez_compressed(prediction_result_file, y_pred)

    # --- evaluate ---
    # To calc loss/accuracy, we can use `Evaluator`, `ROCAUCEvaluator`
    print('Evaluating...')
    test_iterator = SerialIterator(test, 16, repeat=False, shuffle=False)
    eval_result = Evaluator(test_iterator,
                            clf,
                            converter=concat_mols,
                            device=args.gpu)()
    print('Evaluation result: ', eval_result)
    rocauc_result = ROCAUCEvaluator(test_iterator,
                                    clf,
                                    converter=concat_mols,
                                    device=args.gpu,
                                    eval_func=predictor_,
                                    name='test',
                                    ignore_labels=-1)()
    print('ROCAUC Evaluation result: ', rocauc_result)
Example #8
def main():
    # Supported preprocessing/network list
    method_list = [
        'nfp', 'ggnn', 'schnet', 'weavenet', 'rsgcn', 'relgcn', 'relgat'
    ]
    label_names = D.get_tox21_label_names()
    iterator_type = ['serial', 'balanced']

    parser = argparse.ArgumentParser(
        description='Multitask Learning with Tox21.')
    parser.add_argument('--method',
                        '-m',
                        type=str,
                        choices=method_list,
                        default='nfp',
                        help='graph convolution model to use '
                        'as a predictor.')
    parser.add_argument('--label',
                        '-l',
                        type=str,
                        choices=label_names,
                        default='',
                        help='target label for logistic '
                        'regression. Use all labels if this option '
                        'is not specified.')
    parser.add_argument('--iterator-type',
                        type=str,
                        choices=iterator_type,
                        default='serial',
                        help='iterator type. If `balanced` '
                        'is specified, data is sampled to take the same '
                        'number of positive/negative labels during training.')
    parser.add_argument('--eval-mode',
                        type=int,
                        default=1,
                        help='Evaluation mode. '
                        '0: only binary_accuracy is calculated. '
                        '1: both binary_accuracy and the ROC-AUC score '
                        'are calculated.')
    parser.add_argument('--conv-layers',
                        '-c',
                        type=int,
                        default=4,
                        help='number of convolution layers')
    parser.add_argument('--batchsize',
                        '-b',
                        type=int,
                        default=32,
                        help='batch size')
    parser.add_argument(
        '--device',
        type=str,
        default='-1',
        help='Device specifier. Either ChainerX device specifier or an '
        'integer. If non-negative integer, CuPy arrays with specified '
        'device id are used. If negative integer, NumPy arrays are used')
    parser.add_argument('--out',
                        '-o',
                        type=str,
                        default='result',
                        help='path to output directory')
    parser.add_argument('--epoch',
                        '-e',
                        type=int,
                        default=10,
                        help='number of epochs')
    parser.add_argument('--unit-num',
                        '-u',
                        type=int,
                        default=16,
                        help='number of units in one layer of the model')
    parser.add_argument('--resume',
                        '-r',
                        type=str,
                        default='',
                        help='path to a trainer snapshot')
    parser.add_argument('--frequency',
                        '-f',
                        type=int,
                        default=-1,
                        help='Frequency of taking a snapshot')
    parser.add_argument('--protocol',
                        type=int,
                        default=2,
                        help='protocol version for pickle')
    parser.add_argument('--model-filename',
                        type=str,
                        default='classifier.pkl',
                        help='file name for pickled model')
    parser.add_argument('--num-data',
                        type=int,
                        default=-1,
                        help='Number of data points to parse from the '
                        'dataset. -1 indicates parsing all data.')
    args = parser.parse_args()

    method = args.method
    if args.label:
        labels = args.label
        class_num = len(labels) if isinstance(labels, list) else 1
    else:
        labels = None
        class_num = len(label_names)

    # Dataset preparation
    train, val, _ = data.load_dataset(method, labels, num_data=args.num_data)

    # Network
    predictor_ = set_up_predictor(method, args.unit_num, args.conv_layers,
                                  class_num)

    iterator_type = args.iterator_type
    if iterator_type == 'serial':
        train_iter = I.SerialIterator(train, args.batchsize)
    elif iterator_type == 'balanced':
        if class_num > 1:
            raise ValueError('BalancedSerialIterator can only be used for '
                             'single-label classification. Please specify '
                             'the label to predict with the --label option.')
        train_iter = BalancedSerialIterator(train,
                                            args.batchsize,
                                            train.features[:, -1],
                                            ignore_labels=-1)
        train_iter.show_label_stats()
    else:
        raise ValueError('Invalid iterator type {}'.format(iterator_type))

    device = chainer.get_device(args.device)
    classifier = Classifier(predictor_,
                            lossfun=F.sigmoid_cross_entropy,
                            metrics_fun=F.binary_accuracy,
                            device=device)

    extensions_list = []
    eval_mode = args.eval_mode
    if eval_mode == 1:
        train_eval_iter = I.SerialIterator(train,
                                           args.batchsize,
                                           repeat=False,
                                           shuffle=False)

        extensions_list.append(
            ROCAUCEvaluator(train_eval_iter,
                            classifier,
                            eval_func=predictor_,
                            device=device,
                            converter=concat_mols,
                            name='train',
                            pos_labels=1,
                            ignore_labels=-1,
                            raise_value_error=False))
        # extension name='validation' is already used by `Evaluator`,
        # instead extension name `val` is used.
        val_iter = I.SerialIterator(val,
                                    args.batchsize,
                                    repeat=False,
                                    shuffle=False)
        extensions_list.append(
            ROCAUCEvaluator(val_iter,
                            classifier,
                            eval_func=predictor_,
                            device=device,
                            converter=concat_mols,
                            name='val',
                            pos_labels=1,
                            ignore_labels=-1))

    run_train(classifier,
              train_iter,
              valid=val,
              batch_size=args.batchsize,
              epoch=args.epoch,
              out=args.out,
              device=device,
              converter=concat_mols,
              extensions_list=extensions_list,
              resume_path=args.resume)

    # frequency = args.epoch if args.frequency == -1 else max(1, args.frequency)
    # trainer.extend(E.snapshot(), trigger=(frequency, 'epoch'))
    # trainer.run()

    config = {
        'method': args.method,
        'conv_layers': args.conv_layers,
        'unit_num': args.unit_num,
        'labels': args.label
    }
    save_json(os.path.join(args.out, 'config.json'), config)

    classifier.save_pickle(os.path.join(args.out, args.model_filename),
                           protocol=args.protocol)
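
Example #8 additionally relies on `set_up_predictor`, `run_train`, and `save_json`. A sketch of the extra imports it assumes; the exact paths are assumptions and may differ across chainer_chemistry versions:

# Extra imports assumed by Example #8 (paths are assumptions).
from chainer_chemistry.models.prediction import Classifier, set_up_predictor
from chainer_chemistry.training.extensions import ROCAUCEvaluator
from chainer_chemistry.utils import run_train, save_json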
Example #9
def main():
    # Supported preprocessing/network list
    method_list = ['nfp', 'ggnn', 'schnet', 'weavenet']
    label_names = D.get_tox21_label_names()

    parser = argparse.ArgumentParser(
        description='Multitask Learning with Tox21.')
    parser.add_argument('--method',
                        '-m',
                        type=str,
                        choices=method_list,
                        default='nfp',
                        help='graph convolution model to use '
                        'as a predictor.')
    parser.add_argument('--label',
                        '-l',
                        type=str,
                        choices=label_names,
                        default='',
                        help='target label for logistic '
                        'regression. Use all labels if this option '
                        'is not specified.')
    parser.add_argument('--conv-layers',
                        '-c',
                        type=int,
                        default=4,
                        help='number of convolution layers')
    parser.add_argument('--batchsize',
                        '-b',
                        type=int,
                        default=128,
                        help='batch size')
    parser.add_argument('--gpu',
                        '-g',
                        type=int,
                        default=-1,
                        help='GPU ID to use. Negative value indicates '
                        'not to use GPU and to run the code on CPU.')
    parser.add_argument('--out',
                        '-o',
                        type=str,
                        default='result',
                        help='path to output directory')
    parser.add_argument('--epoch',
                        '-e',
                        type=int,
                        default=10,
                        help='number of epochs')
    parser.add_argument('--unit-num',
                        '-u',
                        type=int,
                        default=16,
                        help='number of units in one layer of the model')
    args = parser.parse_args()

    method = args.method
    if args.label:
        labels = args.label
        class_num = len(labels) if isinstance(labels, list) else 1
    else:
        labels = None
        class_num = len(label_names)

    # Dataset preparation
    train, val, _ = data.load_dataset(method, labels)

    # Network
    predictor_ = predictor.build_predictor(method, args.unit_num,
                                           args.conv_layers, class_num)

    train_iter = I.SerialIterator(train, args.batchsize)
    val_iter = I.SerialIterator(val,
                                args.batchsize,
                                repeat=False,
                                shuffle=False)
    classifier = L.Classifier(predictor_,
                              lossfun=F.sigmoid_cross_entropy,
                              accfun=F.binary_accuracy)
    if args.gpu >= 0:
        chainer.cuda.get_device_from_id(args.gpu).use()
        classifier.to_gpu()

    optimizer = O.Adam()
    optimizer.setup(classifier)

    updater = training.StandardUpdater(train_iter,
                                       optimizer,
                                       device=args.gpu,
                                       converter=concat_mols)
    trainer = training.Trainer(updater, (args.epoch, 'epoch'), out=args.out)

    trainer.extend(
        E.Evaluator(val_iter,
                    classifier,
                    device=args.gpu,
                    converter=concat_mols))
    trainer.extend(E.snapshot(), trigger=(args.epoch, 'epoch'))
    trainer.extend(E.LogReport())
    trainer.extend(
        E.PrintReport([
            'epoch', 'main/loss', 'main/accuracy', 'validation/main/loss',
            'validation/main/accuracy', 'elapsed_time'
        ]))
    trainer.extend(E.ProgressBar(update_interval=10))
    trainer.run()

    config = {
        'method': args.method,
        'conv_layers': args.conv_layers,
        'unit_num': args.unit_num,
        'labels': args.label
    }

    with open(os.path.join(args.out, 'config.json'), 'w') as o:
        o.write(json.dumps(config))