def add_evaluator_if_needed(name,
                            evaluation_chain,
                            trainer,
                            args,
                            name_suffix=''):
    dataset_location = vars(args).get(name)
    if dataset_location is not None:
        data = load(dataset_location,
                    cache_path=args.cache,
                    normalize=True,
                    filter=True)
        data_iterator = MultithreadIterator(data,
                                            batch_size=args.eval_batch_size,
                                            shuffle=False,
                                            repeat=False,
                                            n_threads=2)
        evaluator = Evaluator(data_iterator,
                              evaluation_chain,
                              converter=ZeropadAsync(),
                              device=args.device)
        if args.only_eval_end:
            evaluator.trigger = (args.iterations, 'iteration')
        else:
            evaluator.trigger = LogscaleTrigger()
        if name_suffix != '':
            name = f'{name}/{name_suffix}'
        trainer.extend(evaluator, name=name)
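
A minimal call sketch for the helper above; eval_chain, trainer and args are
hypothetical names, and load, ZeropadAsync and LogscaleTrigger are
project-specific helpers rather than Chainer built-ins:

# Registers an Evaluator only when args carries a dataset path under the
# (hypothetical) attribute name 'valid_set'.
add_evaluator_if_needed('valid_set', eval_chain, trainer, args)
add_evaluator_if_needed('valid_set', eval_chain, trainer, args,
                        name_suffix='ema')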
Example #2
def run():
    config = get_config()
    print(yaml_dump(config))
    s = ""
    while not (s == "y" or s == "n"):
        s = input("ok? (y/n): ")
        if s == "n":
            destroy_config(config)

    device = config["device"][0] if isinstance(config["device"],
                                               list) else config["device"]

    Model = getattr(import_module(config["model"]["module"]),
                    config["model"]["class"])
    model = Model(**config["model"]["params"])

    Dataset = getattr(import_module(config["dataset"]["module"]),
                      config["dataset"]["class"])
    train_dataset = Dataset(**config["dataset"]["train"]["params"])
    valid_dataset = Dataset(**config["dataset"]["valid"]["params"])

    # The two positional flags are (repeat, shuffle), assuming a standard
    # Chainer iterator signature.
    train_iterator = Iterator(train_dataset, config["batch"]["train"], True,
                              True)
    valid_iterator = Iterator(valid_dataset, config["batch"]["valid"], False,
                              False)

    Optimizer = getattr(import_module(config["optimizer"]["module"]),
                        config["optimizer"]["class"])
    optimizer = Optimizer(**config["optimizer"]["params"])
    optimizer.setup(model)
    for hook_config in config["optimizer"]["hook"]:
        Hook = getattr(import_module(hook_config["module"]),
                       hook_config["class"])
        hook = Hook(**hook_config["params"])
        optimizer.add_hook(hook)

    updater = Updater(train_iterator, optimizer, device=device)

    trainer = Trainer(updater, **config["trainer"]["params"])
    trainer.extend(snapshot_object(model, "model_iter_{.updater.iteration}"),
                   trigger=config["trainer"]["model_interval"])
    trainer.extend(observe_lr(), trigger=config["trainer"]["log_interval"])
    trainer.extend(
        LogReport([
            "epoch", "iteration", "main/loss", "validation/main/loss", "lr",
            "elapsed_time"
        ],
                  trigger=config["trainer"]["log_interval"]))
    trainer.extend(PrintReport([
        "epoch", "iteration", "main/loss", "validation/main/loss", "lr",
        "elapsed_time"
    ]),
                   trigger=config["trainer"]["log_interval"])
    trainer.extend(Evaluator(valid_iterator, model, device=device),
                   trigger=config["trainer"]["eval_interval"])
    trainer.extend(ProgressBar(update_interval=10))

    print("start training")
    trainer.run()
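
For reference, a hypothetical shape of the dict returned by get_config()
above; every key is one the script actually reads, but the module/class
names and parameter values are placeholders:

config = {
    "device": [0],  # an int also works; the script takes the first element
    "model": {"module": "mylib.models", "class": "AutoEncoder",
              "params": {"n_units": 128}},
    "dataset": {"module": "mylib.datasets", "class": "FMRIDataset",
                "train": {"params": {"split": "train"}},
                "valid": {"params": {"split": "valid"}}},
    "batch": {"train": 32, "valid": 32},
    "optimizer": {"module": "chainer.optimizers", "class": "Adam",
                  "params": {"alpha": 1e-3},
                  "hook": [{"module": "chainer.optimizer_hooks",
                            "class": "WeightDecay",
                            "params": {"rate": 1e-4}}]},
    "trainer": {"params": {"stop_trigger": (100, "epoch"), "out": "result"},
                "model_interval": (1, "epoch"),
                "log_interval": (100, "iteration"),
                "eval_interval": (1, "epoch")},
}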
Example #3
def run_train():
    parser = ArgumentParser()
    parser.add_argument('--settings', type=str, required=True,
                        help='Path to the training settings ini file')

    settings = configparser.ConfigParser()
    settings.read(parser.parse_args().settings)

    # create model
    predictor = ResNet50Layers(None)
    model = Classifier(predictor)

    # Use the GPU selected by its ID.
    gpu = settings.getint('hardware', 'gpu')
    if gpu >= 0:
        chainer.cuda.get_device_from_id(gpu).use()
        model.to_gpu()

    label_handler, train_dataset, val_dataset = _create_datasets(settings['input_data'])

    train_iter = chainer.iterators.SerialIterator(
        train_dataset, settings.getint('trainer', 'batchsize'))
    val_iter = chainer.iterators.SerialIterator(
        val_dataset, settings.getint('trainer', 'batchsize'), repeat=False)

    output_dir = '{}/training_{}_{}'.format(
        settings.get('output_data', 'path'),
        settings.get('trainer', 'epochs'),
        settings.get('optimizer', 'optimizer'))

    # optimizer
    optimizer = _create_optimizer(settings['optimizer'])
    optimizer.setup(model)

    # trainer
    updater = chainer.training.updater.StandardUpdater(train_iter, optimizer, device=gpu)
    trainer = chainer.training.Trainer(updater, (settings.getint('trainer', 'epochs'), 'epoch'), output_dir)

    trainer.extend(extensions.LogReport())
    trainer.extend(chainer.training.extensions.ProgressBar(update_interval=1))
    evaluator = Evaluator(val_iter, model, device=gpu)
    trainer.extend(evaluator)
    trainer.extend(extensions.PlotReport(['main/loss', 'validation/main/loss'], x_key='epoch', file_name='loss.png'))
    trainer.extend(extensions.PlotReport(['main/accuracy', 'validation/main/accuracy'], x_key='epoch', file_name='accuracy.png'))

    trainer.run()

    # save model
    output_file_path = '{0}/resnet.model'.format(output_dir)
    chainer.serializers.save_npz(output_file_path, predictor)

    meta_output = {
        'trainer': settings._sections['trainer'],
        'optimizer': settings._sections['optimizer'],
        'train_data': train_dataset.get_meta_info(label_handler),
        'validation_data': val_dataset.get_meta_info(label_handler),
    }

    with open('{0}/meta.json'.format(output_dir), 'w') as f:
        json.dump(meta_output, f, indent=4)
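
A hypothetical settings ini for the script above; only the sections and keys
the listing itself reads are shown, and the [input_data] keys are left out
because _create_datasets is not part of the listing:

SETTINGS_EXAMPLE = """
[hardware]
gpu = 0

[input_data]
; keys consumed by _create_datasets (not shown in this listing)

[trainer]
batchsize = 32
epochs = 10

[optimizer]
optimizer = Adam
; further keys consumed by _create_optimizer (not shown)

[output_data]
path = ./output
"""

Example #4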
def main():
    # Parse the arguments.
    args = parse_arguments()

    if args.label:
        labels = args.label
    else:
        raise ValueError('No target label was specified.')

    # Dataset preparation.
    def postprocess_label(label_list):
        return numpy.asarray(label_list, dtype=numpy.float32)

    print('Preprocessing dataset...')
    preprocessor = preprocess_method_dict[args.method]()
    parser = CSVFileParser(preprocessor,
                           postprocess_label=postprocess_label,
                           labels=labels,
                           smiles_col='SMILES')
    dataset = parser.parse(args.datafile)['dataset']

    # Load the standard scaler parameters, if necessary.
    if args.scale == 'standardize':
        with open(os.path.join(args.in_dir, 'scaler.pkl'), mode='rb') as f:
            scaler = pickle.load(f)
    else:
        scaler = None
    test = dataset

    print('Predicting...')
    # Set up the regressor.
    model_path = os.path.join(args.in_dir, args.model_filename)
    regressor = Regressor.load_pickle(model_path, device=args.gpu)
    scaled_predictor = ScaledGraphConvPredictor(regressor.predictor)
    scaled_predictor.scaler = scaler
    regressor.predictor = scaled_predictor

    # Perform the prediction.
    print('Evaluating...')
    test_iterator = SerialIterator(test, 16, repeat=False, shuffle=False)
    eval_result = Evaluator(test_iterator,
                            regressor,
                            converter=concat_mols,
                            device=args.gpu)()

    # Prevents the loss function from becoming a cupy.core.core.ndarray object
    # when using the GPU. This hack will be removed as soon as the cause of
    # the issue is found and properly fixed.
    loss = numpy.asscalar(cuda.to_cpu(eval_result['main/loss']))
    eval_result['main/loss'] = loss
    print('Evaluation result: ', eval_result)

    with open(os.path.join(args.in_dir, 'eval_result.json'), 'w') as f:
        json.dump(eval_result, f)
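Example #5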
    def chainer_model_pipe(self, nn, train, valid, params):
        epoch = params['epoch']
        batch_size = params['batch_size']
        use_gpu = params['use_gpu']

        # Default: do not freeze the base network's weights.
        fixed_base_w = params.get('fixed_base_w', False)

        # Model Instance
        model = L.Classifier(nn)

        if use_gpu:
            device = 0
            model.to_gpu(device)
        else:
            device = -1

        # Create the minibatch iterators
        train_iter = SerialIterator(train, batch_size)
        valid_iter = SerialIterator(valid,
                                    batch_size,
                                    repeat=False,
                                    shuffle=False)

        # Set up learning
        optimizer = Adam()
        optimizer.setup(model)

        if fixed_base_w:
            model.predictor.base.disable_update()

        updater = StandardUpdater(train_iter, optimizer, device=device)

        trainer = Trainer(updater, (epoch, 'epoch'), out='result/cat_dog')
        trainer.extend(Evaluator(valid_iter, model, device=device))
        trainer.extend(LogReport(trigger=(1, 'epoch')))
        trainer.extend(PrintReport([
            'epoch', 'main/accuracy', 'validation/main/accuracy', 'main/loss',
            'validation/main/loss', 'elapsed_time'
        ]),
                       trigger=(1, 'epoch'))

        trainer.run()

        if use_gpu:
            model.to_cpu()

        return model
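
A hypothetical call to the pipeline method above (assuming the enclosing
object is named pipe; nn, train_set and valid_set stand for a Link and two
datasets, and the params keys are exactly the ones the method reads):

params = {'epoch': 10, 'batch_size': 32, 'use_gpu': True,
          'fixed_base_w': True}  # freeze the base network's weights
model = pipe.chainer_model_pipe(nn, train_set, valid_set, params)

Example #6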
def main():
    # Parse the arguments.
    args = parse_arguments()

    if args.label:
        labels = args.label
    else:
        raise ValueError('No target label was specified.')

    # Dataset preparation.
    def postprocess_label(label_list):
        return numpy.asarray(label_list, dtype=numpy.float32)

    print('Preprocessing dataset...')
    preprocessor = preprocess_method_dict[args.method]()
    parser = CSVFileParser(preprocessor,
                           postprocess_label=postprocess_label,
                           labels=labels,
                           smiles_col='SMILES')
    dataset = parser.parse(args.datafile)['dataset']

    test = dataset

    print('Predicting...')
    # Set up the regressor.
    device = chainer.get_device(args.device)
    model_path = os.path.join(args.in_dir, args.model_filename)
    regressor = Regressor.load_pickle(model_path, device=device)

    # Perform the prediction.
    print('Evaluating...')
    converter = converter_method_dict[args.method]
    test_iterator = SerialIterator(test, 16, repeat=False, shuffle=False)
    eval_result = Evaluator(test_iterator,
                            regressor,
                            converter=converter,
                            device=device)()
    print('Evaluation result: ', eval_result)

    save_json(os.path.join(args.in_dir, 'eval_result.json'), eval_result)
Example #7
def main():
    # Parse the arguments.
    args = parse_arguments()
    device = args.gpu

    # Set up some useful variables that will be used later on.
    method = args.method
    if args.label != 'all':
        label = args.label
        cache_dir = os.path.join('input', '{}_{}'.format(method, label))
        labels = [label]
    else:
        labels = D.get_qm9_label_names()
        cache_dir = os.path.join('input', '{}_all'.format(method))

    # Get the filename corresponding to the cached dataset, based on the
    # number of data samples that need to be parsed from the original dataset.
    num_data = args.num_data
    if num_data >= 0:
        dataset_filename = 'data_{}.npz'.format(num_data)
    else:
        dataset_filename = 'data.npz'

    # Load the cached dataset.
    dataset_cache_path = os.path.join(cache_dir, dataset_filename)

    dataset = None
    if os.path.exists(dataset_cache_path):
        print('Loading cached data from {}.'.format(dataset_cache_path))
        dataset = NumpyTupleDataset.load(dataset_cache_path)
    if dataset is None:
        print('Preprocessing dataset...')
        preprocessor = preprocess_method_dict[method]()
        dataset = D.get_qm9(preprocessor, labels=labels)

        # Cache the newly preprocessed dataset.
        if not os.path.exists(cache_dir):
            os.mkdir(cache_dir)
        NumpyTupleDataset.save(dataset_cache_path, dataset)

    # Use a predictor with scaled output labels.
    model_path = os.path.join(args.in_dir, args.model_filename)
    regressor = Regressor.load_pickle(model_path, device=device)
    scaler = regressor.predictor.scaler

    if scaler is not None:
        original_t = dataset.get_datasets()[-1]
        if args.gpu >= 0:
            scaled_t = cuda.to_cpu(scaler.transform(
                cuda.to_gpu(original_t)))
        else:
            scaled_t = scaler.transform(original_t)

        dataset = NumpyTupleDataset(*(dataset.get_datasets()[:-1] +
                                      (scaled_t,)))

    # Split the dataset into training and testing.
    train_data_size = int(len(dataset) * args.train_data_ratio)
    _, test = split_dataset_random(dataset, train_data_size, args.seed)

    # This callback function extracts only the inputs and discards the labels.
    def extract_inputs(batch, device=None):
        return concat_mols(batch, device=device)[:-1]

    def postprocess_fn(x):
        if scaler is not None:
            scaled_x = scaler.inverse_transform(x)
            return scaled_x
        else:
            return x

    # Predict the output labels.
    print('Predicting...')
    y_pred = regressor.predict(
        test, converter=extract_inputs,
        postprocess_fn=postprocess_fn)

    # Extract the ground-truth labels.
    t = concat_mols(test, device=device)[-1]
    # Guard the no-scaler case; inverse_transform is undefined on None.
    if scaler is not None:
        original_t = cuda.to_cpu(scaler.inverse_transform(t))
    else:
        original_t = cuda.to_cpu(t)

    # Construct dataframe.
    df_dict = {}
    for i, l in enumerate(labels):
        df_dict.update({'y_pred_{}'.format(l): y_pred[:, i],
                        't_{}'.format(l): original_t[:, i], })
    df = pandas.DataFrame(df_dict)

    # Show a prediction/ground truth table with 5 random examples.
    print(df.sample(5))

    n_eval = 10
    for target_label in range(y_pred.shape[1]):
        label_name = labels[target_label]
        diff = y_pred[:n_eval, target_label] - original_t[:n_eval,
                                                          target_label]
        print('label_name = {}, y_pred = {}, t = {}, diff = {}'
              .format(label_name, y_pred[:n_eval, target_label],
                      original_t[:n_eval, target_label], diff))

    # Run an evaluator on the test dataset.
    print('Evaluating...')
    test_iterator = SerialIterator(test, 16, repeat=False, shuffle=False)
    eval_result = Evaluator(test_iterator, regressor, converter=concat_mols,
                            device=device)()
    print('Evaluation result: ', eval_result)
    # Save the evaluation results.
    save_json(os.path.join(args.in_dir, 'eval_result.json'), eval_result)

    # Calculate mean abs error for each label
    mae = numpy.mean(numpy.abs(y_pred - original_t), axis=0)
    eval_result = {}
    for i, l in enumerate(labels):
        eval_result.update({l: mae[i]})
    save_json(os.path.join(args.in_dir, 'eval_result_mae.json'), eval_result)
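
The label-scaling round-trip used above, as a minimal numpy sketch; it
assumes the scaler performs plain standardization, which is what transform
and inverse_transform undo in the code:

import numpy

t = numpy.array([[1.0], [2.0], [3.0]], dtype=numpy.float32)
mean, std = t.mean(axis=0), t.std(axis=0)
scaled_t = (t - mean) / std        # what scaler.transform produces
recovered = scaled_t * std + mean  # what scaler.inverse_transform recovers
assert numpy.allclose(recovered, t)

Example #8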
def main():
    args = parse_arguments()

    # Set up some useful variables that will be used later on.
    dataset_name = args.dataset
    method = args.method
    num_data = args.num_data

    if args.label:
        labels = args.label
        cache_dir = os.path.join(
            'input', '{}_{}_{}'.format(dataset_name, method, labels))
    else:
        labels = None
        cache_dir = os.path.join('input',
                                 '{}_{}_all'.format(dataset_name, method))

    # Load the cached dataset.
    filename = dataset_part_filename('test', num_data)
    path = os.path.join(cache_dir, filename)
    if os.path.exists(path):
        print('Loading cached dataset from {}.'.format(path))
        test = NumpyTupleDataset.load(path)
    else:
        _, _, test = download_entire_dataset(dataset_name, num_data, labels,
                                             method, cache_dir)


#    # Load the standard scaler parameters, if necessary.
#    if args.scale == 'standardize':
#        scaler_path = os.path.join(args.in_dir, 'scaler.pkl')
#        print('Loading scaler parameters from {}.'.format(scaler_path))
#        with open(scaler_path, mode='rb') as f:
#            scaler = pickle.load(f)
#    else:
#        print('No standard scaling was selected.')
#        scaler = None

    # Model-related data is stored in this directory.
    model_dir = os.path.join(args.in_dir, os.path.basename(cache_dir))

    model_filename = {
        'classification': 'classifier.pkl',
        'regression': 'regressor.pkl'
    }
    task_type = molnet_default_config[dataset_name]['task_type']
    model_path = os.path.join(model_dir, model_filename[task_type])
    print("model_path=" + model_path)
    print('Loading model weights from {}...'.format(model_path))

    if task_type == 'classification':
        model = Classifier.load_pickle(model_path, device=args.gpu)
    elif task_type == 'regression':
        model = Regressor.load_pickle(model_path, device=args.gpu)
    else:
        raise ValueError('Invalid task type ({}) encountered when processing '
                         'dataset ({}).'.format(task_type, dataset_name))

    # Proposed by Ishiguro
    # ToDo: consider go/no-go with the following modification
    # Re-load the best-validation score snapshot
    serializers.load_npz(
        os.path.join(model_dir, "best_val_" + model_filename[task_type]),
        model)

    #    # Replace the default predictor with one that scales the output labels.
    #    scaled_predictor = ScaledGraphConvPredictor(model.predictor)
    #    scaled_predictor.scaler = scaler
    #    model.predictor = scaled_predictor

    # Run an evaluator on the test dataset.
    print('Evaluating...')
    test_iterator = SerialIterator(test, 16, repeat=False, shuffle=False)
    eval_result = Evaluator(test_iterator,
                            model,
                            converter=concat_mols,
                            device=args.gpu)()
    print('Evaluation result: ', eval_result)

    # Proposed by Ishiguro: add more stats
    # ToDo: consider go/no-go with the following modification

    if task_type == 'regression':
        # loss = cuda.to_cpu(numpy.array(eval_result['main/loss']))
        # eval_result['main/loss'] = loss

        # convert to native values..
        for k, v in eval_result.items():
            eval_result[k] = float(v)

        save_json(os.path.join(args.in_dir, 'eval_result.json'), eval_result)
    elif task_type == "classification":
        # The Classifier is not equipped with a ROC-AUC evaluation function,
        # so use a separate ROC-AUC Evaluator here.
        rocauc_result = ROCAUCEvaluator(test_iterator,
                                        model,
                                        converter=concat_mols,
                                        device=args.gpu,
                                        eval_func=model.predictor,
                                        name='test',
                                        ignore_labels=-1)()
        print('ROCAUC Evaluation result: ', rocauc_result)
        save_json(os.path.join(args.in_dir, 'eval_result.json'), rocauc_result)
    else:
        pass

    # Save the evaluation results.
    save_json(os.path.join(model_dir, 'eval_result.json'), eval_result)
Example #9
def main():
    experiment_name = "Stacked_16_16_16_16"
    snapshot_name = "snapshot_iter_27215"
    config_path = "/efs/fMRI_AE/{}/log/config.yml".format(experiment_name)
    config = load_config(config_path)
    config["additional information"]["mask"]["loader"]["params"][
        "mask_path"] = path.join(
            config["additional information"]["mask"]["directory"],
            config["additional information"]["mask"]["file"])
    config["additional information"]["mask"]["loader"]["params"][
        "crop"] = config["additional information"]["crop"]
    snapshot_path = "/efs/fMRI_AE/{}/model/{}".format(experiment_name,
                                                      snapshot_name)
    # print("configured as follows:")
    # print(yaml_dump(config))
    while True:
        s = input("ok? (y/n):")
        if s == 'y' or s == 'Y':
            log_config(config, "training start")
            break
        elif s == 'n' or s == 'N':
            destroy_config(config)
            exit(1)
    try:
        try:
            print("mask loading...")
            load_mask_module = import_module(
                config["additional information"]["mask"]["loader"]["module"],
                config["additional information"]["mask"]["loader"]["package"])
            load_mask = getattr(
                load_mask_module,
                config["additional information"]["mask"]["loader"]["function"])
            mask = load_mask(
                **config["additional information"]["mask"]["loader"]["params"])
            print("done.")
            print("mask.shape: {}".format(mask.shape))
        except FileNotFoundError as e:
            raise e

        model_module = import_module(config["model"]["module"],
                                     config["model"]["package"])
        Model = getattr(model_module, config["model"]["class"])
        model = Model(mask=mask, **config["model"]["params"])
        finetune_config = config["additional information"]["finetune"]
        if finetune_config is not None:
            load_npz(path.join(finetune_config["directory"],
                               finetune_config["file"]),
                     model,
                     strict=False)

        try:
            chainer.cuda.get_device_from_id(0).use()
            gpu = 0
            print("transferring model to GPU...")
            model.to_gpu(gpu)
            print("GPU enabled")
        except RuntimeError:
            gpu = -1
            print("GPU disabled")

        dataset_module = import_module(config["dataset"]["module"],
                                       config["dataset"]["package"])
        Dataset = getattr(dataset_module, config["dataset"]["class"])
        train_dataset = Dataset(**config["dataset"]["train"]["params"])
        valid_dataset = Dataset(**config["dataset"]["valid"]["params"])

        train_iterator = Iterator(train_dataset, config["batch"]["train"],
                                  True, True)
        valid_iterator = Iterator(valid_dataset, config["batch"]["valid"],
                                  False, False)

        Optimizer = getattr(chainer.optimizers, config["optimizer"]["class"])
        optimizer = Optimizer(**config["optimizer"]["params"])

        optimizer.setup(model)

        for hook_config in config["optimizer"]["hook"]:
            hook_module = import_module(hook_config["module"],
                                        hook_config["package"])
            Hook = getattr(hook_module, hook_config["class"])
            hook = Hook(**hook_config["params"])
            optimizer.add_hook(hook)

        updater = Updater(train_iterator, optimizer, device=gpu)

        trainer = Trainer(updater, **config["trainer"]["params"])
        trainer.extend(snapshot(),
                       trigger=config["trainer"]["snapshot_interval"])
        trainer.extend(snapshot_object(model,
                                       "model_iter_{.updater.iteration}"),
                       trigger=config["trainer"]["model_interval"])
        trainer.extend(observe_lr(), trigger=config["trainer"]["log_interval"])
        trainer.extend(
            LogReport([
                "epoch", "iteration", "main/loss", "main/pca_loss",
                "main/reconstruction_loss", "validation/main/loss"
            ],
                      trigger=config["trainer"]["log_interval"]))
        trainer.extend(Evaluator(valid_iterator, model, device=gpu),
                       trigger=config["trainer"]["eval_interval"])
        trainer.extend(PrintReport([
            "epoch", "iteration", "main/loss", "main/pca_loss",
            "main/reconstruction_loss", "validation/main/loss"
        ]),
                       trigger=config["trainer"]["log_interval"])
        trainer.extend(ProgressBar(update_interval=1))

        if "schedule" in config["additional information"].keys():
            for i, interval_funcs in enumerate(
                    config["additional information"]["schedule"].items()):
                interval, funcs = interval_funcs
                f = lambda trainer, funcs=funcs: [
                    getattr(trainer.updater.get_optimizer('main').target,
                            func["function"])(*func["params"])
                    for func in funcs
                ]
                trainer.extend(f,
                               name="schedule_{}".format(i),
                               trigger=ManualScheduleTrigger(*interval))

        load_npz(snapshot_path, trainer)
        target = trainer.updater.get_optimizer("main").target
        target.reset_pca()
        target.attach_pca()
        ipca_param = np.load(
            "/efs/fMRI_AE/Stacked_8_8_8_8_feature/ipca_mean_7920_components_990_7920.npz"
        )
        target.pca.W = chainer.Parameter(ipca_param["components"])
        target.pca.bias = chainer.Parameter(ipca_param["mean"])
        target.pca.disable_update()
        target.pca.to_gpu(gpu)
        target.detach_pca_loss()
        target.attach_reconstruction_loss()
        target.release_decoder()
        target.freeze_encoder()

        trainer.run()
        log_config(config, "succeeded")

    except Exception as e:
        log_config(config, "unintentional termination")
        raise e
Example #10
def main():
    # Parse the arguments.
    args = parse_arguments()

    # Set up some useful variables that will be used later on.
    method = args.method
    if args.label != 'all':
        label = args.label
        cache_dir = os.path.join('input', '{}_{}'.format(method, label))
        labels = [label]
    else:
        labels = D.get_qm9_label_names()
        cache_dir = os.path.join('input', '{}_all'.format(method))

    # Get the filename corresponding to the cached dataset, based on the
    # number of data samples that need to be parsed from the original dataset.
    num_data = args.num_data
    if num_data >= 0:
        dataset_filename = 'data_{}.npz'.format(num_data)
    else:
        dataset_filename = 'data.npz'

    # Load the cached dataset.
    dataset_cache_path = os.path.join(cache_dir, dataset_filename)

    dataset = None
    if os.path.exists(dataset_cache_path):
        print('Loading cached data from {}.'.format(dataset_cache_path))
        dataset = NumpyTupleDataset.load(dataset_cache_path)
    if dataset is None:
        print('Preprocessing dataset...')
        preprocessor = preprocess_method_dict[method]()
        dataset = D.get_qm9(preprocessor, labels=labels)

        # Cache the newly preprocessed dataset.
        if not os.path.exists(cache_dir):
            os.mkdir(cache_dir)
        NumpyTupleDataset.save(dataset_cache_path, dataset)

    # Use a predictor with scaled output labels.
    model_path = os.path.join(args.in_dir, args.model_filename)
    regressor = Regressor.load_pickle(model_path, device=args.gpu)
    scaler = regressor.predictor.scaler

    if scaler is not None:
        scaled_t = scaler.transform(dataset.get_datasets()[-1])
        dataset = NumpyTupleDataset(*(dataset.get_datasets()[:-1] +
                                      (scaled_t, )))

    # Split the dataset into training and testing.
    train_data_size = int(len(dataset) * args.train_data_ratio)
    _, test = split_dataset_random(dataset, train_data_size, args.seed)

    # This callback function extracts only the inputs and discards the labels.
    def extract_inputs(batch, device=None):
        return concat_mols(batch, device=device)[:-1]

    def postprocess_fn(x):
        if scaler is not None:
            scaled_x = scaler.inverse_transform(x)
            return scaled_x
        else:
            return x

    # Predict the output labels.
    print('Predicting...')
    y_pred = regressor.predict(test,
                               converter=extract_inputs,
                               postprocess_fn=postprocess_fn)

    # Extract the ground-truth labels.
    t = concat_mols(test, device=-1)[-1]
    # Guard the no-scaler case; inverse_transform is undefined on None.
    original_t = scaler.inverse_transform(t) if scaler is not None else t

    # Construct dataframe.
    df_dict = {}
    for i, l in enumerate(labels):
        df_dict.update({
            'y_pred_{}'.format(l): y_pred[:, i],
            't_{}'.format(l): original_t[:, i],
        })
    df = pandas.DataFrame(df_dict)

    # Show a prediction/ground truth table with 5 random examples.
    print(df.sample(5))

    n_eval = 10
    for target_label in range(y_pred.shape[1]):
        label_name = labels[target_label]
        diff = y_pred[:n_eval, target_label] - original_t[:n_eval,
                                                          target_label]
        print('label_name = {}, y_pred = {}, t = {}, diff = {}'.format(
            label_name, y_pred[:n_eval, target_label],
            original_t[:n_eval, target_label], diff))

    # Run an evaluator on the test dataset.
    print('Evaluating...')
    test_iterator = SerialIterator(test, 16, repeat=False, shuffle=False)
    eval_result = Evaluator(test_iterator,
                            regressor,
                            converter=concat_mols,
                            device=args.gpu)()
    print('Evaluation result: ', eval_result)
    # Save the evaluation results.
    save_json(os.path.join(args.in_dir, 'eval_result.json'), eval_result)

    # Calculate mean abs error for each label
    mae = numpy.mean(numpy.abs(y_pred - original_t), axis=0)
    eval_result = {}
    for i, l in enumerate(labels):
        eval_result.update({l: mae[i]})
    save_json(os.path.join(args.in_dir, 'eval_result_mae.json'), eval_result)
Example #11
def main():
    args = parse_arguments()

    # Set up some useful variables that will be used later on.
    dataset_name = args.dataset
    method = args.method
    num_data = args.num_data

    if args.label:
        labels = args.label
        cache_dir = os.path.join(
            'input', '{}_{}_{}'.format(dataset_name, method, labels))
    else:
        labels = None
        cache_dir = os.path.join('input',
                                 '{}_{}_all'.format(dataset_name, method))

    # Load the cached dataset.
    filename = dataset_part_filename('test', num_data)
    path = os.path.join(cache_dir, filename)
    if os.path.exists(path):
        print('Loading cached dataset from {}.'.format(path))
        test = NumpyTupleDataset.load(path)
    else:
        _, _, test = download_entire_dataset(dataset_name, num_data, labels,
                                             method, cache_dir)

    # Model-related data is stored in this directory.
    model_dir = os.path.join(args.in_dir, os.path.basename(cache_dir))

    model_filename = {
        'classification': 'classifier.pkl',
        'regression': 'regressor.pkl'
    }
    task_type = molnet_default_config[dataset_name]['task_type']
    model_path = os.path.join(model_dir, model_filename[task_type])
    print("model_path=" + model_path)
    print('Loading model weights from {}...'.format(model_path))

    if task_type == 'classification':
        model = Classifier.load_pickle(model_path, device=args.gpu)
    elif task_type == 'regression':
        model = Regressor.load_pickle(model_path, device=args.gpu)
    else:
        raise ValueError('Invalid task type ({}) encountered when processing '
                         'dataset ({}).'.format(task_type, dataset_name))

    # Re-load the best-validation score snapshot
    # serializers.load_npz(os.path.join(
    #     model_dir, "best_val_" + model_filename[task_type]), model)

    # Run an evaluator on the test dataset.
    print('Evaluating...')
    test_iterator = SerialIterator(test, 16, repeat=False, shuffle=False)
    eval_result = Evaluator(test_iterator,
                            model,
                            converter=concat_mols,
                            device=args.gpu)()
    print('Evaluation result: ', eval_result)

    # Add more stats
    if task_type == 'regression':
        # loss = cuda.to_cpu(numpy.array(eval_result['main/loss']))
        # eval_result['main/loss'] = loss

        # convert to native values..
        for k, v in eval_result.items():
            eval_result[k] = float(v)

    elif task_type == "classification":
        # The Classifier is not equipped with a ROC-AUC evaluation function,
        # so use a separate ROC-AUC Evaluator here.
        rocauc_result = ROCAUCEvaluator(test_iterator,
                                        model,
                                        converter=concat_mols,
                                        device=args.gpu,
                                        eval_func=model.predictor,
                                        name='test',
                                        ignore_labels=-1)()
        print('ROCAUC Evaluation result: ', rocauc_result)
        save_json(os.path.join(model_dir, 'rocauc_result.json'), rocauc_result)
    else:
        print('[WARNING] unknown task_type {}.'.format(task_type))

    # Save the evaluation results.
    save_json(os.path.join(model_dir, 'eval_result.json'), eval_result)
Example #12
def main():
    config = get_config()
    # print("configured as follows:")
    # print(yaml_dump(config))
    while True:
        s = input("ok? (y/n):")
        if s == 'y' or s == 'Y':
            log_config(config, "training start")
            break
        elif s == 'n' or s == 'N':
            destroy_config(config)
            exit(1)
    try:
        try:
            print("mask loading...")
            load_mask_module = import_module(
                config["additional information"]["mask"]["loader"]["module"],
                config["additional information"]["mask"]["loader"]["package"])
            load_mask = getattr(
                load_mask_module,
                config["additional information"]["mask"]["loader"]["function"])
            mask = load_mask(
                **config["additional information"]["mask"]["loader"]["params"])
            print("done.")
            print("mask.shape: {}".format(mask.shape))
        except FileNotFoundError as e:
            raise e

        model_module = import_module(config["model"]["module"],
                                     config["model"]["package"])
        Model = getattr(model_module, config["model"]["class"])
        model = Model(mask=mask, **config["model"]["params"])
        finetune_config = config["additional information"][
            "finetune"] if "finetune" in config[
                "additional information"] else None
        if finetune_config is not None:
            load_npz(path.join(finetune_config["directory"],
                               finetune_config["file"]),
                     model,
                     strict=False)

        try:
            chainer.cuda.get_device_from_id(0).use()
            gpu = 0
            print("transferring model to GPU...")
            model.to_gpu(gpu)
            print("GPU enabled")
        except RuntimeError:
            gpu = -1
            print("GPU disabled")

        dataset_module = import_module(config["dataset"]["module"],
                                       config["dataset"]["package"])
        Dataset = getattr(dataset_module, config["dataset"]["class"])
        train_dataset = Dataset(**config["dataset"]["train"]["params"])
        valid_dataset = Dataset(**config["dataset"]["valid"]["params"])

        train_iterator = Iterator(train_dataset, config["batch"]["train"],
                                  True, True)
        valid_iterator = Iterator(valid_dataset, config["batch"]["valid"],
                                  False, False)

        Optimizer = getattr(chainer.optimizers, config["optimizer"]["class"])
        optimizer = Optimizer(**config["optimizer"]["params"])

        optimizer.setup(model)

        for hook_config in config["optimizer"]["hook"]:
            hook_module = import_module(hook_config["module"],
                                        hook_config["package"])
            Hook = getattr(hook_module, hook_config["class"])
            hook = Hook(**hook_config["params"])
            optimizer.add_hook(hook)

        updater = Updater(train_iterator, optimizer, device=gpu)

        trainer = Trainer(updater, **config["trainer"]["params"])
        trainer.extend(snapshot(),
                       trigger=config["trainer"]["snapshot_interval"])
        trainer.extend(snapshot_object(model,
                                       "model_iter_{.updater.iteration}"),
                       trigger=config["trainer"]["model_interval"])
        trainer.extend(observe_lr(), trigger=config["trainer"]["log_interval"])
        trainer.extend(
            LogReport(
                ["epoch", "iteration", "main/loss", "validation/main/loss"],
                trigger=config["trainer"]["log_interval"]))
        trainer.extend(Evaluator(valid_iterator, model, device=gpu),
                       trigger=config["trainer"]["eval_interval"])
        trainer.extend(PrintReport(
            ["epoch", "iteration", "main/loss", "validation/main/loss"]),
                       trigger=config["trainer"]["log_interval"])
        trainer.extend(ProgressBar(update_interval=1))

        if "schedule" in config["additional information"].keys():
            for i, interval_funcs in enumerate(
                    config["additional information"]["schedule"].items()):
                interval, funcs = interval_funcs
                f = lambda trainer, funcs=funcs: [
                    getattr(trainer.updater.get_optimizer('main').target,
                            func["function"])(*func["params"])
                    for func in funcs
                ]
                trainer.extend(f,
                               name="schedule_{}".format(i),
                               trigger=ManualScheduleTrigger(*interval))
        trainer.run()
        log_config(config, "succeeded")

    except Exception as e:
        log_config(config, "unintentional termination")
        raise e
Example #13
    def main():
        # Parse the arguments.
        args = parse_arguments()
        theme_name = t_theme_name.get()

        args.model_folder_name = os.path.join(theme_name, 'chainer')
        #args.epoch = int(float(t_epochs.get()))
        args.out = parent_path / 'models' / theme_name / method_name
        args.method = method_name

        if args.label:
            labels = args.label
        else:
            raise ValueError('No target label was specified.')

        # Dataset preparation.
        def postprocess_label(label_list):
            return numpy.asarray(label_list, dtype=numpy.float32)

        smiles_col_name = t_smiles.get()
        print('Preprocessing dataset...')
        preprocessor = preprocess_method_dict[args.method]()
        parser = CSVFileParser(preprocessor,
                               postprocess_label=postprocess_label,
                               labels=labels,
                               smiles_col=t_smiles.get())

        #args.datafile=parent_path / 'results' /  theme_name / method_name / high_low /'brics_virtual'  / 'virtual.csv'
        args.datafile = csv_path
        dataset = parser.parse(args.datafile)['dataset']

        @chainer.dataset.converter()
        def extract_inputs(batch, device=None):
            return concat_mols(batch, device=device)[:-1]

        print('Predicting the virtual library')
        # Set up the regressor.
        device = chainer.get_device(args.device)
        model_path = os.path.join(args.out, args.model_foldername,
                                  args.model_filename)

        with open(
                parent_path / 'models' / theme_name / method_name / high_low /
            ('regressor.pickle'), 'rb') as f:
            regressor = cloudpickle.loads(f.read())

        # Perform the prediction.
        print('Evaluating...')
        converter = converter_method_dict[args.method]
        data_iterator = SerialIterator(dataset,
                                       16,
                                       repeat=False,
                                       shuffle=False)
        eval_result = Evaluator(data_iterator,
                                regressor,
                                converter=converter,
                                device=device)()
        print('Evaluation result: ', eval_result)

        predict_ = regressor.predict(dataset, converter=extract_inputs)
        predict_ = [i[0] for i in predict_]
        df_data = pd.read_csv(csv_path)

        df_predict = df_data
        df_predict[t_task.get()] = predict_
        df_predict = df_predict.dropna()

        PandasTools.AddMoleculeColumnToFrame(frame=df_predict,
                                             smilesCol=t_smiles.get())
        df_predict['sascore'] = df_predict.ROMol.map(sascorer.calculateScore)

        df_predict.to_csv(csv_path)

        png_generator = (parent_path / 'results' / theme_name / method_name /
                         high_low / data_name /
                         'molecular-structure').glob('*.png')
        #png_generator.sort()

        for i, png_path in enumerate(png_generator):
            #print((png_path.name)[4:10])
            i = int((png_path.name)[4:10])
            if i < len(df_predict[t_task.get()]):
                img = Image.open(png_path)
                draw = ImageDraw.Draw(img)
                font = ImageFont.truetype('arial.ttf', 26)
                draw.text((0, 0),
                          t_task.get() + ' : ' +
                          str(round(df_predict[t_task.get()][i], 2)),
                          (0, 0, 0),
                          font=font)
                draw.text(
                    (0, 30),
                    'sascore : ' + str(round(df_predict['sascore'][i], 2)),
                    (0, 0, 0),
                    font=font)

                img.save(png_path)

        save_json(os.path.join(args.out, 'eval_result.json'), eval_result)
Example #14
    def main():
        # Parse the arguments.
        args = parse_arguments()

        args.model_folder_name = os.path.join(theme_name, 'chainer')

        base_epoch = complexity_degree[high_low]
        args.epoch = int(base_epoch * 60 / method_complexity[method_name])
        args.epoch = max(args.epoch, 5)

        #args.epoch = int(float(t_epochs.get()))
        args.out = parent_path / 'models' / theme_name / method_name / high_low
        args.method = method_name

        if t_model_path != "":
            args.source_transferlearning = Path(t_model_path.get())

        print(theme_name)

        if args.label:
            labels = args.label
            class_num = len(labels) if isinstance(labels, list) else 1
        else:
            raise ValueError('No target label was specified.')

        # Dataset preparation. Postprocessing is required for the regression task.
        def postprocess_label(label_list):
            return numpy.asarray(label_list, dtype=numpy.float32)

        # Apply a preprocessor to the dataset.
        print('Preprocessing dataset...')
        preprocessor = preprocess_method_dict[args.method]()
        smiles_col_name = t_smiles.get()

        parser = CSVFileParser(preprocessor,
                               postprocess_label=postprocess_label,
                               labels=labels,
                               smiles_col=smiles_col_name)

        args.datafile = t_csv_filepath.get()
        dataset = parser.parse(args.datafile)['dataset']

        # Scale the label values, if necessary.
        if args.scale == 'standardize':
            scaler = StandardScaler()
            scaler.fit(dataset.get_datasets()[-1])
        else:
            scaler = None

        # Split the dataset into training and validation.
        train_data_size = int(len(dataset) * args.train_data_ratio)
        trainset, testset = split_dataset_random(dataset, train_data_size,
                                                 args.seed)

        print((args.source_transferlearning / method_name / high_low /
               'regressor.pickle'))
        print((args.source_transferlearning / method_name / high_low /
               'regressor.pickle').exists())

        # Set up the predictor.

        if (Booleanvar_transfer_learning.get()
                and (args.source_transferlearning / method_name / high_low /
                     'regressor.pickle').exists()):

            # refer https://github.com/pfnet-research/chainer-chemistry/issues/407
            with open(
                    args.source_transferlearning / method_name / high_low /
                    'regressor.pickle', 'rb') as f:
                regressor = cloudpickle.loads(f.read())
                pre_predictor = regressor.predictor
                predictor = GraphConvPredictor(pre_predictor.graph_conv,
                                               MLP(out_dim=1, hidden_dim=16))

        else:
            predictor = set_up_predictor(args.method,
                                         args.unit_num,
                                         args.conv_layers,
                                         class_num,
                                         label_scaler=scaler)

        # Set up the regressor.
        device = chainer.get_device(args.device)
        metrics_fun = {'mae': functions.mean_absolute_error, 'rmse': rmse}

        regressor = Regressor(predictor,
                              lossfun=functions.mean_squared_error,
                              metrics_fun=metrics_fun,
                              device=device)

        print('Training... : ', method_name)
        run_train(regressor,
                  trainset,
                  valid=None,
                  batch_size=args.batchsize,
                  epoch=args.epoch,
                  out=args.out,
                  extensions_list=None,
                  device=device,
                  converter=concat_mols,
                  resume_path=None)

        # Save the regressor's parameters.
        args.model_foldername = t_theme_name.get()

        model_path = os.path.join(args.out, args.model_foldername,
                                  args.model_filename)
        print('Saving the trained model to {}...'.format(model_path))

        # TODO(nakago): ChainerX array cannot be sent to numpy array when internal
        # state has gradients.
        if hasattr(regressor.predictor.graph_conv, 'reset_state'):
            regressor.predictor.graph_conv.reset_state()

        with open(
                parent_path / 'models' / theme_name / method_name / high_low /
            ('regressor.pickle'), 'wb') as f:
            cloudpickle.dump(regressor, f)

        #with open(parent_path / 'models' / theme_name / method_name / high_low /('predictor.pickle'),  'wb') as f:
        #    cloudpickle.dump(predictor, f)

        print('Evaluating... : ', method_name)
        test_iterator = SerialIterator(testset,
                                       16,
                                       repeat=False,
                                       shuffle=False)
        eval_result = Evaluator(test_iterator,
                                regressor,
                                converter=concat_mols,
                                device=device)()
        print('Evaluation result: : ', method_name)
        print(eval_result)

        @chainer.dataset.converter()
        def extract_inputs(batch, device=None):
            return concat_mols(batch, device=device)[:-1]

        pred_train = regressor.predict(trainset, converter=extract_inputs)
        pred_train = [i[0] for i in pred_train]
        pred_test = regressor.predict(testset, converter=extract_inputs)
        pred_test = [i[0] for i in pred_test]

        y_train = [i[2][0] for i in trainset]
        y_test = [i[2][0] for i in testset]
        title = args.label
        save_path = parent_path / 'results' / theme_name / method_name / high_low / 'scatter.png'
        save_scatter(y_train, pred_train, y_test, pred_test, title, save_path)

        global image_score
        image_score_open = Image.open(parent_path / 'results' / theme_name /
                                      method_name / high_low / 'scatter.png')
        image_score = ImageTk.PhotoImage(image_score_open, master=frame1)

        canvas.create_image(200, 200, image=image_score)

        from sklearn.metrics import mean_squared_error, mean_absolute_error
        from sklearn.metrics import r2_score

        train_mse = mean_squared_error(y_train, pred_train)
        test_mse = mean_squared_error(y_test, pred_test)

        train_rmse = np.sqrt(train_mse)
        test_rmse = np.sqrt(test_mse)

        train_mae = mean_absolute_error(y_train, pred_train)
        test_mae = mean_absolute_error(y_test, pred_test)

        train_r2score = r2_score(y_train, pred_train)
        test_r2score = r2_score(y_test, pred_test)

        print('train_mse : ', train_mse)
        print('test_mse : ', test_mse)
        print('train_rmse : ', train_rmse)
        print('test_rmse : ', test_rmse)
        print('train_mae : ', train_mae)
        print('test_mae : ', test_mae)
        print('train_r2score : ', train_r2score)
        print('test_r2score : ', test_r2score)
Example #15
def main():
    # Parse the arguments.
    args = parse_arguments()
    args.out = os.path.join(args.out, args.method)
    save_args(args, args.out)

    if args.label:
        labels = args.label
        class_num = len(labels) if isinstance(labels, list) else 1
    else:
        raise ValueError('No target label was specified.')

    # Dataset preparation. Postprocessing is required for the regression task.
    def postprocess_label_float(label_list):
        return numpy.asarray(label_list, dtype=numpy.float32)
    def postprocess_label_int(label_list):
        return numpy.asarray(label_list, dtype=numpy.int64)

    # Apply a preprocessor to the dataset.
    scaler = None  # default when no training data is given
    if args.train:
        # Training data.
        fn, ext = os.path.splitext(args.train)
        if ext == ".npz":
            print('Loading training dataset...')
            train = NumpyTupleDataset.load(args.train)
        else:
            print('Preprocessing training dataset...')
            preprocessor = preprocess_method_dict[args.method]()
            if args.classification:
                parser = CSVFileParser(preprocessor,
                                       postprocess_label=postprocess_label_int,
                                       labels=labels, smiles_col='SMILES')
            else:
                parser = CSVFileParser(preprocessor,
                                       postprocess_label=postprocess_label_float,
                                       labels=labels, smiles_col='SMILES')
            train = parser.parse(args.train)['dataset']
            NumpyTupleDataset.save(
                os.path.join(args.out, os.path.split(fn)[1]), train)
        # Scale the label values, if necessary.
        if args.scale == 'standardize':
            scaler = StandardScaler()
            scaler.fit(train.get_datasets()[-1])
        else:
            scaler = None

    # Test data.
    fn, ext = os.path.splitext(args.val)
    if ext == ".npz":
        print('Loading test dataset...')
        test = NumpyTupleDataset.load(args.val)
    else:
        print('Preprocessing test dataset...')
        preprocessor = preprocess_method_dict[args.method]()
        if args.classification:
            parser = CSVFileParser(preprocessor,
                                   postprocess_label=postprocess_label_int,
                                   labels=labels, smiles_col='SMILES')
        else:
            parser = CSVFileParser(preprocessor,
                                   postprocess_label=postprocess_label_float,
                                   labels=labels, smiles_col='SMILES')
        test = parser.parse(args.val)['dataset']
        NumpyTupleDataset.save(
            os.path.join(args.out, os.path.split(fn)[1]), test)


    # Set up the model.
    device = chainer.get_device(args.device)
    converter = converter_method_dict[args.method]
    metrics_fun = {'mae': F.mean_absolute_error, 'rmse': rmse}
    if args.classification:
        if args.load_model:
            model = Classifier.load_pickle(args.load_model, device=device)
            print("model file loaded: ",args.load_model)
        else:
            predictor = set_up_predictor(args.method, args.unit_num, args.conv_layers, class_num)
            model = Classifier(predictor,
                                    lossfun=F.sigmoid_cross_entropy,
                                    metrics_fun=F.binary_accuracy,
                                    device=device)
    else:
        if args.load_model:
            model = Regressor.load_pickle(args.load_model, device=device)
            print("model file loaded: ",args.load_model)
        else:
            predictor = set_up_predictor(
                args.method+args.method_suffix, args.unit_num,
                args.conv_layers, class_num, label_scaler=scaler)
            model = Regressor(predictor, lossfun=F.mean_squared_error,
                            metrics_fun=metrics_fun, device=device)

    if args.train:
        if args.balanced_iter:
            train = BalancedSerialIterator(train, args.batchsize, train.features[:, -1], ignore_labels=-1)
            train.show_label_stats()
            
        print('Training...')
        log_keys = ['main/mae','main/rmse','validation/main/mae','validation/main/rmse','validation/main/roc_auc']
        extensions_list = [extensions.PlotReport(log_keys, 'iteration', trigger=(100, 'iteration'), file_name='loss.png')]
        if args.eval_roc and args.classification:
            extensions_list.append(ROCAUCEvaluator(
                        test, model, eval_func=predictor,
                        device=device, converter=converter, name='validation',
                        pos_labels=1, ignore_labels=-1, raise_value_error=False))

        save_json(os.path.join(args.out, 'args.json'), vars(args))
        run_train(model, train, valid=test,
                batch_size=args.batchsize, epoch=args.epoch,
                out=args.out, extensions_list=extensions_list,
                device=device, converter=converter) #, resume_path=args.resume)

        # Save the model's parameters.
        model_path = os.path.join(args.out, args.model_filename)
        print('Saving the trained model to {}...'.format(model_path))
        if hasattr(model.predictor.graph_conv, 'reset_state'):
            model.predictor.graph_conv.reset_state()
        model.save_pickle(model_path, protocol=args.protocol)

    ## prediction
    it = SerialIterator(test, args.batchsize, repeat=False, shuffle=False)
    result = []
    for batch in it:
        in_arrays = convert._call_converter(converter, batch, device)
        with chainer.using_config('train', False), chainer.function.no_backprop_mode():
            if isinstance(in_arrays, tuple):
                res = model(*in_arrays)
            elif isinstance(in_arrays, dict):
                res = model(**in_arrays)
            else:
                res = model(in_arrays)
        # to_cpu handles both CPU and GPU arrays (ndarray.get() is GPU-only).
        result.extend(chainer.backends.cuda.to_cpu(model.y.array))

    numpy.savetxt(os.path.join(args.out,"result.csv"), numpy.array(result))

    eval_result = Evaluator(it, model, converter=converter, device=device)()
    print('Evaluation result: ', eval_result)
Example #16
def main():
    parser = argparse.ArgumentParser(
        description='Predict with a trained model.')
    parser.add_argument('--in-dir',
                        '-i',
                        type=str,
                        default='result',
                        help='Path to the result directory of the training '
                        'script.')
    parser.add_argument('--batchsize',
                        '-b',
                        type=int,
                        default=128,
                        help='batch size')
    parser.add_argument('--gpu',
                        '-g',
                        type=int,
                        default=-1,
                        help='GPU ID to use. A negative value indicates '
                        'not to use the GPU and to run the code on the CPU.')
    parser.add_argument('--model-filename',
                        type=str,
                        default='classifier.pkl',
                        help='file name for pickled model')
    parser.add_argument('--num-data',
                        type=int,
                        default=-1,
                        help='Number of data samples to be parsed. '
                        '-1 indicates parsing all data.')
    args = parser.parse_args()

    with open(os.path.join(args.in_dir, 'config.json'), 'r') as i:
        config = json.loads(i.read())

    method = config['method']
    labels = config['labels']

    _, test, _ = data.load_dataset(method, labels, num_data=args.num_data)
    y_test = test.get_datasets()[-1]

    # Load pretrained model
    clf = Classifier.load_pickle(os.path.join(args.in_dir,
                                              args.model_filename),
                                 device=args.gpu)  # type: Classifier

    # ---- predict ---
    print('Predicting...')

    # We need to feed only input features `x` to `predict`/`predict_proba`.
    # This converter extracts only inputs (x1, x2, ...) from the features which
    # consist of input `x` and label `t` (x1, x2, ..., t).
    def extract_inputs(batch, device=None):
        return concat_mols(batch, device=device)[:-1]

    def postprocess_pred(x):
        x_array = cuda.to_cpu(x.data)
        return numpy.where(x_array > 0, 1, 0)
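    # (Thresholding raw logits at 0 is equivalent to thresholding sigmoid
    # probabilities at 0.5, since sigmoid(0) == 0.5.)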

    y_pred = clf.predict(test,
                         converter=extract_inputs,
                         postprocess_fn=postprocess_pred)
    y_proba = clf.predict_proba(test,
                                converter=extract_inputs,
                                postprocess_fn=F.sigmoid)

    # `predict` method returns the prediction label (0: non-toxic, 1: toxic)
    print('y_pred.shape = {}, y_pred[:5, 0] = {}'.format(
        y_pred.shape, y_pred[:5, 0]))
    # `predict_proba` method returns the probability to be toxic
    print('y_proba.shape = {}, y_proba[:5, 0] = {}'.format(
        y_proba.shape, y_proba[:5, 0]))
    # --- predict end ---

    if y_pred.ndim == 1:
        y_pred = y_pred[:, None]

    if y_pred.shape != y_test.shape:
        raise RuntimeError('The shape of the prediction result array and '
                           'that of the ground truth array do not match. '
                           'Contents of the input directory may be corrupted '
                           'or modified.')

    statistics = []
    for t, p in six.moves.zip(y_test.T, y_pred.T):
        idx = t != -1
        n_correct = (t[idx] == p[idx]).sum()
        n_total = len(t[idx])
        accuracy = float(n_correct) / n_total
        statistics.append([n_correct, n_total, accuracy])

    print('{:>6} {:>8} {:>8} {:>8}'.format('TaskID', 'Correct', 'Total',
                                           'Accuracy'))
    for idx, (n_correct, n_total, accuracy) in enumerate(statistics):
        print('task{:>2} {:>8} {:>8} {:>8.4f}'.format(idx, n_correct, n_total,
                                                      accuracy))

    prediction_result_file = 'prediction.npz'
    print('Save prediction result to {}'.format(prediction_result_file))
    numpy.savez_compressed(prediction_result_file, y_pred)

    # --- evaluate ---
    # To calc loss/accuracy, we can use `Evaluator`, `ROCAUCEvaluator`
    print('Evaluating...')
    test_iterator = SerialIterator(test, args.batchsize, repeat=False,
                                   shuffle=False)
    eval_result = Evaluator(test_iterator,
                            clf,
                            converter=concat_mols,
                            device=args.gpu)()
    print('Evaluation result: ', eval_result)
    rocauc_result = ROCAUCEvaluator(test_iterator,
                                    clf,
                                    converter=concat_mols,
                                    device=args.gpu,
                                    eval_func=clf.predictor,
                                    name='test',
                                    ignore_labels=-1)()
    print('ROCAUC Evaluation result: ', rocauc_result)
    with open(os.path.join(args.in_dir, 'eval_result.json'), 'w') as f:
        json.dump(rocauc_result, f)
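    # Example invocation (illustrative; the file name predict_tox21.py and the
    # ./result directory are assumptions):
    #
    #   python predict_tox21.py --in-dir result --batchsize 128 --gpu 0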
Example #17
def main():

    # Load hyperparameters. yaml.safe_load avoids the unsafe, deprecated
    # bare yaml.load call.
    with open('params.yml') as stream:
        args = yaml.safe_load(stream)

    # Prepare training data.
    train, val = chainer.datasets.get_mnist(ndim=3)
    if args['memory'] == 'gpu' and 0 <= args['gpu']:
        train = [(cp.array(x), cp.array(y)) for x, y in train]
        val = [(cp.array(x), cp.array(y)) for x, y in val]

    # Prepare model.
    class Classifier(chainer.Chain):
        def __init__(self, predictor):
            super(Classifier, self).__init__()

            with self.init_scope():
                self.predictor = predictor

        def forward(self, batch, labels):
            embeddings = self.predictor(batch)
            loss = functions.batch_all_triplet_loss(
                embeddings,
                labels,
                margin=args['margin'],
                dist_type=args['dist_type'])
            chainer.reporter.report({
                'loss': loss,
                'VAL': functions.validation_rate(embeddings,
                                                 labels,
                                                 threshold=args['threshold'],
                                                 dist_type=args['dist_type']),
                'FAR': functions.false_accept_rate(embeddings,
                                                   labels,
                                                   threshold=args['threshold'],
                                                   dist_type=args['dist_type'])
            }, self)
            return loss

    predictor = FaceNet()
    model = Classifier(predictor)
    if 0 <= args['gpu']:
        chainer.backends.cuda.get_device_from_id(args['gpu']).use()
        model.to_gpu()

    # Prepare optimizer.
    optimizer = chainer.optimizers.AdaDelta()
    optimizer.setup(model)

    # Make output directory.
    timestamp = f'{datetime.datetime.now():%Y%m%d%H%M%S}'
    directory = f'./temp/{timestamp}/'
    os.makedirs(directory, exist_ok=True)
    shutil.copy('params.yml', f'{directory}params.yml')

    # Prepare extensions.
    if args['memory'] == 'cpu' and 0 <= args['gpu']:

        def converter(batch, device=None, padding=None):
            return concat_examples([(cp.array(x), cp.array(y))
                                    for x, y in batch],
                                   device=device,
                                   padding=padding)
    else:
        converter = concat_examples
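    # When the dataset stays in host memory but training runs on the GPU, the
    # converter defined above moves each minibatch to CuPy on the fly, trading
    # transfer time for a much smaller device-memory footprint.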

    class DumpEmbeddings(chainer.training.extension.Extension):
        def __init__(self, iterator, model, converter, filename):
            self.iterator = iterator
            self.model = model
            self.converter = converter
            self.filename = filename
            self.xp = cp if 0 <= args['gpu'] else np

        def __call__(self, trainer):
            if hasattr(self.iterator, 'reset'):
                self.iterator.reset()
                it = self.iterator
            else:
                it = copy.copy(self.iterator)

            def forward(batch):
                x, _ = self.converter(batch)
                y = self.model.predictor(x)
                embeddings = y.data
                if 0 <= args['gpu']:
                    embeddings = chainer.backends.cuda.to_cpu(embeddings)
                return embeddings

            embeddings = np.vstack([forward(batch) for batch in it])
            np.save(os.path.join(trainer.out, self.filename.format(trainer)),
                    embeddings)

    train_iter = SerialIterator(train, args['batch_size'])
    test_iter = SerialIterator(val,
                               args['batch_size'],
                               repeat=False,
                               shuffle=False)
    updater = StandardUpdater(train_iter, optimizer, converter=converter)
    trainer = Trainer(updater,
                      stop_trigger=(args['epochs'], 'epoch'),
                      out=directory)
    trainer.extend(dump_graph('main/loss', out_name='model.dot'))
    trainer.extend(Evaluator(test_iter, model, converter=converter))
    trainer.extend(snapshot_object(target=model,
                                   filename='model-{.updater.epoch:04d}.npz'),
                   trigger=(args['checkpoint_interval'], 'epoch'))
    trainer.extend(LogReport(log_name='log'))
    trainer.extend(
        PlotReport(y_keys=['main/loss', 'validation/main/loss'],
                   x_key='epoch',
                   file_name='loss.png'))
    trainer.extend(
        PlotReport(y_keys=['main/VAL', 'validation/main/VAL'],
                   x_key='epoch',
                   file_name='VAL.png'))
    trainer.extend(
        PlotReport(y_keys=['main/FAR', 'validation/main/FAR'],
                   x_key='epoch',
                   file_name='FAR.png'))
    trainer.extend(
        PrintReport([
            'epoch', 'main/loss', 'validation/main/loss', 'main/VAL',
            'validation/main/VAL', 'main/FAR', 'validation/main/FAR',
            'elapsed_time'
        ]))
    trainer.extend(DumpEmbeddings(test_iter,
                                  model,
                                  converter=converter,
                                  filename='embeddings-{.updater.epoch}.npy'),
                   trigger=(args['checkpoint_interval'], 'epoch'))
    trainer.extend(ProgressBar(update_interval=1))

    # Execute training.
    trainer.run()
Example #18
def main():
    label_names = D.get_tox21_label_names()

    parser = argparse.ArgumentParser(
        description='Predict with a trained model.')
    parser.add_argument('--in-dir',
                        '-i',
                        type=str,
                        default='result',
                        help='Path to the result directory of the training '
                        'script.')
    parser.add_argument('--trainer-snapshot',
                        '-s',
                        type=str,
                        default='',
                        help='Path to the snapshot file of the Chainer '
                        'trainer from which serialized model parameters '
                        'are extracted. If it is not specified, this '
                        'script searches the training result directory '
                        'for the latest snapshot, assuming that '
                        'the naming convention of snapshot files is '
                        '`snapshot_iter_N` where N is the number of '
                        'iterations, which is the default configuration '
                        'of Chainer.')
    parser.add_argument('--batchsize',
                        '-b',
                        type=int,
                        default=128,
                        help='batch size')
    parser.add_argument('--gpu',
                        '-g',
                        type=int,
                        default=-1,
                        help='GPU ID to use. Negative value indicates '
                        'not to use GPU and to run the code in CPU.')
    args = parser.parse_args()

    with open(os.path.join(args.in_dir, 'config.json'), 'r') as i:
        config = json.loads(i.read())

    method = config['method']
    labels = config['labels']
    if labels:
        class_num = len(labels) if isinstance(labels, list) else 1
    else:
        class_num = len(label_names)

    _, test, _ = data.load_dataset(method, labels)
    y_test = test.get_datasets()[-1]

    # Load pretrained model
    predictor_ = predictor.build_predictor(method, config['unit_num'],
                                           config['conv_layers'], class_num)
    snapshot_file = args.trainer_snapshot
    if not snapshot_file:
        snapshot_file = _find_latest_snapshot(args.in_dir)
    print('Loading pretrained model parameters from {}'.format(snapshot_file))
    chainer.serializers.load_npz(snapshot_file, predictor_,
                                 'updater/model:main/predictor/')

    clf = Classifier(predictor=predictor_,
                     device=args.gpu,
                     lossfun=F.sigmoid_cross_entropy,
                     metrics_fun=F.binary_accuracy)

    # ---- predict ---
    print('Predicting...')

    # We need to feed only input features `x` to `predict`/`predict_proba`.
    # This converter extracts only inputs (x1, x2, ...) from the features which
    # consist of input `x` and label `t` (x1, x2, ..., t).
    def extract_inputs(batch, device=None):
        return concat_mols(batch, device=device)[:-1]

    def postprocess_pred(x):
        x_array = cuda.to_cpu(x.data)
        return numpy.where(x_array > 0, 1, 0)

    y_pred = clf.predict(test,
                         converter=extract_inputs,
                         postprocess_fn=postprocess_pred)
    y_proba = clf.predict_proba(test,
                                converter=extract_inputs,
                                postprocess_fn=F.sigmoid)

    # `predict` method returns the prediction label (0: non-toxic, 1: toxic)
    print('y_pred.shape = {}, y_pred[:5, 0] = {}'.format(
        y_pred.shape, y_pred[:5, 0]))
    # `predict_proba` method returns the probability to be toxic
    print('y_proba.shape = {}, y_proba[:5, 0] = {}'.format(
        y_proba.shape, y_proba[:5, 0]))
    # --- predict end ---

    if y_pred.ndim == 1:
        y_pred = y_pred[:, None]

    if y_pred.shape != y_test.shape:
        raise RuntimeError('The shape of the prediction result array and '
                           'that of the ground truth array do not match. '
                           'Contents of the input directory may be corrupted '
                           'or modified.')

    statistics = []
    for t, p in six.moves.zip(y_test.T, y_pred.T):
        idx = t != -1
        n_correct = (t[idx] == p[idx]).sum()
        n_total = len(t[idx])
        accuracy = float(n_correct) / n_total
        statistics.append([n_correct, n_total, accuracy])

    print('{:>6} {:>8} {:>8} {:>8}'.format('TaskID', 'Correct', 'Total',
                                           'Accuracy'))
    for idx, (n_correct, n_total, accuracy) in enumerate(statistics):
        print('task{:>2} {:>8} {:>8} {:>8.4f}'.format(idx, n_correct, n_total,
                                                      accuracy))

    prediction_result_file = 'prediction.npz'
    print('Save prediction result to {}'.format(prediction_result_file))
    numpy.savez_compressed(prediction_result_file, y_pred)

    # --- evaluate ---
    # To calc loss/accuracy, we can use `Evaluator`, `ROCAUCEvaluator`
    print('Evaluating...')
    test_iterator = SerialIterator(test, args.batchsize, repeat=False,
                                   shuffle=False)
    eval_result = Evaluator(test_iterator,
                            clf,
                            converter=concat_mols,
                            device=args.gpu)()
    print('Evaluation result: ', eval_result)
    rocauc_result = ROCAUCEvaluator(test_iterator,
                                    clf,
                                    converter=concat_mols,
                                    device=args.gpu,
                                    eval_func=predictor_,
                                    name='test',
                                    ignore_labels=-1)()
    print('ROCAUC Evaluation result: ', rocauc_result)
Example #19
def main():
    # Supported preprocessing/network list
    method_list = ['nfp', 'ggnn', 'schnet', 'weavenet', 'rsgcn']
    label_names = [
        'A', 'B', 'C', 'mu', 'alpha', 'homo', 'lumo', 'gap', 'r2', 'zpve',
        'U0', 'U', 'H', 'G', 'Cv'
    ]
    scale_list = ['standardize', 'none']

    parser = argparse.ArgumentParser(description='Regression with QM9.')
    parser.add_argument('--method',
                        '-m',
                        type=str,
                        choices=method_list,
                        default='nfp')
    parser.add_argument('--label',
                        '-l',
                        type=str,
                        choices=label_names,
                        default='',
                        help='target label for regression; '
                        'an empty string means to predict all '
                        'properties at once')
    parser.add_argument('--scale',
                        type=str,
                        choices=scale_list,
                        default='standardize',
                        help='Label scaling method')
    parser.add_argument('--batchsize', '-b', type=int, default=32)
    parser.add_argument('--gpu', '-g', type=int, default=-1)
    parser.add_argument('--in-dir', '-i', type=str, default='result')
    parser.add_argument('--seed', '-s', type=int, default=777)
    parser.add_argument('--train-data-ratio', '-t', type=float, default=0.7)
    parser.add_argument('--model-filename', type=str, default='regressor.pkl')
    parser.add_argument('--num-data',
                        type=int,
                        default=-1,
                        help='Number of data points to parse from the dataset. '
                        '-1 indicates to parse all data.')
    args = parser.parse_args()

    seed = args.seed
    train_data_ratio = args.train_data_ratio
    method = args.method
    if args.label:
        labels = args.label
        cache_dir = os.path.join('input', '{}_{}'.format(method, labels))
        # class_num = len(labels) if isinstance(labels, list) else 1
    else:
        labels = D.get_qm9_label_names()
        cache_dir = os.path.join('input', '{}_all'.format(method))
        # class_num = len(labels)

    # Dataset preparation
    dataset = None

    num_data = args.num_data
    if num_data >= 0:
        dataset_filename = 'data_{}.npz'.format(num_data)
    else:
        dataset_filename = 'data.npz'

    dataset_cache_path = os.path.join(cache_dir, dataset_filename)
    if os.path.exists(dataset_cache_path):
        print('load from cache {}'.format(dataset_cache_path))
        dataset = NumpyTupleDataset.load(dataset_cache_path)
    if dataset is None:
        print('preprocessing dataset...')
        preprocessor = preprocess_method_dict[method]()
        dataset = D.get_qm9(preprocessor, labels=labels)
        if not os.path.exists(cache_dir):
            os.mkdir(cache_dir)
        NumpyTupleDataset.save(dataset_cache_path, dataset)

    if args.scale == 'standardize':
        # Standard Scaler for labels
        with open(os.path.join(args.in_dir, 'ss.pkl'), mode='rb') as f:
            ss = pickle.load(f)
    else:
        ss = None

    train_data_size = int(len(dataset) * train_data_ratio)
    train, val = split_dataset_random(dataset, train_data_size, seed)

    regressor = Regressor.load_pickle(os.path.join(args.in_dir,
                                                   args.model_filename),
                                      device=args.gpu)  # type: Regressor

    # We need to feed only input features `x` to `predict`/`predict_proba`.
    # This converter extracts only inputs (x1, x2, ...) from the features which
    # consist of input `x` and label `t` (x1, x2, ..., t).
    def extract_inputs(batch, device=None):
        return concat_mols(batch, device=device)[:-1]

    def postprocess_fn(x):
        if ss is not None:
            # The model was trained on labels scaled by StandardScaler,
            # so rescale its output back to the original units. (The original
            # code returned None for non-Variable inputs; handle both.)
            if isinstance(x, Variable):
                x = x.data
            return ss.inverse_transform(cuda.to_cpu(x))
        else:
            return x
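    # Illustrative property (assuming ss is a fitted sklearn StandardScaler):
    # ss.inverse_transform(ss.transform(y)) recovers y up to float rounding.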

    print('Predicting...')
    y_pred = regressor.predict(val,
                               converter=extract_inputs,
                               postprocess_fn=postprocess_fn)

    print('y_pred.shape = {}, y_pred[:5, 0] = {}'.format(
        y_pred.shape, y_pred[:5, 0]))

    t = concat_mols(val, device=-1)[-1]
    n_eval = 10

    # Construct dataframe
    df_dict = {}
    for i, l in enumerate(labels):
        df_dict.update({
            'y_pred_{}'.format(l): y_pred[:, i],
            't_{}'.format(l): t[:, i],
        })
    df = pandas.DataFrame(df_dict)

    # Show a prediction/ground-truth table for 5 random examples
    print(df.sample(5))

    for target_label in range(y_pred.shape[1]):
        diff = y_pred[:n_eval, target_label] - t[:n_eval, target_label]
        print('target_label = {}, y_pred = {}, t = {}, diff = {}'.format(
            target_label, y_pred[:n_eval, target_label],
            t[:n_eval, target_label], diff))

    # --- evaluate ---
    # To calc loss/accuracy, we can use `Evaluator`, `ROCAUCEvaluator`
    print('Evaluating...')
    val_iterator = SerialIterator(val, args.batchsize, repeat=False,
                                  shuffle=False)
    eval_result = Evaluator(val_iterator,
                            regressor,
                            converter=concat_mols,
                            device=args.gpu)()
    print('Evaluation result: ', eval_result)
Example #20
def main():
    args = parse_arguments()

    # Set up some useful variables that will be used later on.
    dataset_name = args.dataset
    method = args.method
    num_data = args.num_data

    if args.label:
        labels = args.label
        cache_dir = os.path.join(
            'input', '{}_{}_{}'.format(dataset_name, method, labels))
    else:
        labels = None
        cache_dir = os.path.join('input',
                                 '{}_{}_all'.format(dataset_name, method))

    # Load the cached dataset.
    filename = dataset_part_filename('test', num_data)
    path = os.path.join(cache_dir, filename)
    if os.path.exists(path):
        print('Loading cached dataset from {}.'.format(path))
        test = NumpyTupleDataset.load(path)
    else:
        _, _, test = download_entire_dataset(dataset_name, num_data, labels,
                                             method, cache_dir)

#    # Load the standard scaler parameters, if necessary.
#    if args.scale == 'standardize':
#        scaler_path = os.path.join(args.in_dir, 'scaler.pkl')
#        print('Loading scaler parameters from {}.'.format(scaler_path))
#        with open(scaler_path, mode='rb') as f:
#            scaler = pickle.load(f)
#    else:
#        print('No standard scaling was selected.')
#        scaler = None

    # Model-related data is stored in this directory.
    model_dir = os.path.join(args.in_dir, os.path.basename(cache_dir))

    model_filename = {
        'classification': 'classifier.pkl',
        'regression': 'regressor.pkl'
    }
    task_type = molnet_default_config[dataset_name]['task_type']
    model_path = os.path.join(model_dir, model_filename[task_type])
    print('Loading model weights from {}...'.format(model_path))

    if task_type == 'classification':
        model = Classifier.load_pickle(model_path, device=args.gpu)
    elif task_type == 'regression':
        model = Regressor.load_pickle(model_path, device=args.gpu)
    else:
        raise ValueError('Invalid task type ({}) encountered when processing '
                         'dataset ({}).'.format(task_type, dataset_name))


#    # Replace the default predictor with one that scales the output labels.
#    scaled_predictor = ScaledGraphConvPredictor(model.predictor)
#    scaled_predictor.scaler = scaler
#    model.predictor = scaled_predictor

    # Run an evaluator on the test dataset.
    print('Evaluating...')
    test_iterator = SerialIterator(test, 16, repeat=False, shuffle=False)
    eval_result = Evaluator(test_iterator,
                            model,
                            converter=concat_mols,
                            device=args.gpu)()
    print('Evaluation result: ', eval_result)

    # Save the evaluation results.
    with open(os.path.join(model_dir, 'eval_result.json'), 'w') as f:
        json.dump(eval_result, f)
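    # Note: if eval_result contains numpy scalars (common after GPU runs),
    # json.dump may raise TypeError; a defensive workaround is to cast first,
    # e.g. {k: float(v) for k, v in eval_result.items()}.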
Example #21
def main():
    # Parse the arguments.
    args = parse_arguments()

    # Set up some useful variables that will be used later on.
    method = args.method
    if args.label:
        labels = args.label
        cache_dir = os.path.join('input', '{}_{}'.format(method, labels))
    else:
        labels = D.get_qm9_label_names()
        cache_dir = os.path.join('input', '{}_all'.format(method))

    # Get the filename corresponding to the cached dataset, based on the amount
    # of data samples that need to be parsed from the original dataset.
    num_data = args.num_data
    if num_data >= 0:
        dataset_filename = 'data_{}.npz'.format(num_data)
    else:
        dataset_filename = 'data.npz'

    # Load the cached dataset.
    dataset_cache_path = os.path.join(cache_dir, dataset_filename)

    dataset = None
    if os.path.exists(dataset_cache_path):
        print('Loading cached data from {}.'.format(dataset_cache_path))
        dataset = NumpyTupleDataset.load(dataset_cache_path)
    if dataset is None:
        print('Preprocessing dataset...')
        preprocessor = preprocess_method_dict[method]()
        dataset = D.get_qm9(preprocessor, labels=labels)

        # Cache the newly preprocessed dataset.
        if not os.path.exists(cache_dir):
            os.mkdir(cache_dir)
        NumpyTupleDataset.save(dataset_cache_path, dataset)

    # Load the standard scaler parameters, if necessary.
    if args.scale == 'standardize':
        scaler_path = os.path.join(args.in_dir, 'scaler.pkl')
        print('Loading scaler parameters from {}.'.format(scaler_path))
        with open(scaler_path, mode='rb') as f:
            scaler = pickle.load(f)
    else:
        print('No standard scaling was selected.')
        scaler = None

    # Split the dataset into training and testing.
    train_data_size = int(len(dataset) * args.train_data_ratio)
    _, test = split_dataset_random(dataset, train_data_size, args.seed)

    # Use a predictor with scaled output labels.
    model_path = os.path.join(args.in_dir, args.model_filename)
    regressor = Regressor.load_pickle(model_path, device=args.gpu)

    # Replace the default predictor with one that scales the output labels.
    scaled_predictor = ScaledGraphConvPredictor(regressor.predictor)
    scaled_predictor.scaler = scaler
    regressor.predictor = scaled_predictor
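    # The wrapped predictor applies the scaler's inverse transform to the raw
    # network output, so predictions are returned in the original label units.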

    # This callback function extracts only the inputs and discards the labels.
    def extract_inputs(batch, device=None):
        return concat_mols(batch, device=device)[:-1]

    # Predict the output labels.
    print('Predicting...')
    y_pred = regressor.predict(test, converter=extract_inputs)

    # Extract the ground-truth labels.
    t = concat_mols(test, device=-1)[-1]
    n_eval = 10

    # Construct dataframe.
    df_dict = {}
    for i, l in enumerate(labels):
        df_dict.update({
            'y_pred_{}'.format(l): y_pred[:, i],
            't_{}'.format(l): t[:, i],
        })
    df = pandas.DataFrame(df_dict)

    # Show a prediction/ground truth table with 5 random examples.
    print(df.sample(5))
    for target_label in range(y_pred.shape[1]):
        diff = y_pred[:n_eval, target_label] - t[:n_eval, target_label]
        print('target_label = {}, y_pred = {}, t = {}, diff = {}'.format(
            target_label, y_pred[:n_eval, target_label],
            t[:n_eval, target_label], diff))

    # Run an evaluator on the test dataset.
    print('Evaluating...')
    test_iterator = SerialIterator(test, 16, repeat=False, shuffle=False)
    eval_result = Evaluator(test_iterator,
                            regressor,
                            converter=concat_mols,
                            device=args.gpu)()

    # Prevents the loss value from remaining a cupy ndarray when using the
    # GPU. This hack will be removed as soon as the cause of the issue is
    # found and properly fixed. (ndarray.item() replaces numpy.asscalar,
    # which is deprecated in recent NumPy releases.)
    loss = cuda.to_cpu(eval_result['main/loss']).item()
    eval_result['main/loss'] = loss
    print('Evaluation result: ', eval_result)

    # Save the evaluation results.
    with open(os.path.join(args.in_dir, 'eval_result.json'), 'w') as f:
        json.dump(eval_result, f)
Example #22
def main():
    comm = mn.create_communicator("pure_nccl")
    device = comm.intra_rank

    config = get_config()

    print("pid {}: mask loading...".format(comm.rank))
    load_mask_module = import_module(
        config["additional information"]["mask"]["loader"]["module"],
        config["additional information"]["mask"]["loader"]["package"])
    load_mask = getattr(
        load_mask_module,
        config["additional information"]["mask"]["loader"]["function"])
    mask = load_mask(
        **config["additional information"]["mask"]["loader"]["params"])
    print("pid {}: done.".format(comm.rank))
    if comm.rank == 0:
        print("mask.shape: {}".format(mask.shape))

    model_module = import_module(config["model"]["module"],
                                 config["model"]["package"])
    Model = getattr(model_module, config["model"]["class"])
    model = Model(comm=comm, mask=mask, **config["model"]["params"])

    optimizer_module = import_module(config["optimizer"]["module"],
                                     config["optimizer"]["package"])
    Optimizer = getattr(optimizer_module, config["optimizer"]["class"])
    optimizer = mn.create_multi_node_optimizer(
        Optimizer(**config["optimizer"]["params"]), comm)
    optimizer.setup(model)

    if device >= 0:
        chainer.backends.cuda.get_device_from_id(device).use()
        model.to_gpu()
        print("pid {}: GPU {} enabled".format(comm.rank, device))

    if comm.rank == 0:
        dataset_module = import_module(config["dataset"]["module"],
                                       config["dataset"]["package"])
        Dataset = getattr(dataset_module, config["dataset"]["class"])
        train_dataset = Dataset(**config["dataset"]["train"]["params"])
        valid_dataset = Dataset(**config["dataset"]["valid"]["params"])
    else:
        train_dataset = None
        valid_dataset = None

    train_dataset = mn.datasets.scatter_dataset(train_dataset,
                                                comm,
                                                shuffle=True)
    valid_dataset = mn.datasets.scatter_dataset(valid_dataset,
                                                comm,
                                                shuffle=True)

    train_iterator = Iterator(train_dataset, config["batch"]["train"])
    valid_iterator = Iterator(valid_dataset, config["batch"]["valid"], False,
                              False)

    updater = Updater(train_iterator, optimizer, device=device)
    trainer = Trainer(updater, **config["trainer"]["params"])

    checkpointer = mn.create_multi_node_checkpointer(config["general"]["name"],
                                                     comm)
    checkpointer.maybe_load(trainer, optimizer)
    trainer.extend(checkpointer,
                   trigger=tuple(config["trainer"]["snapshot_interval"]))

    evaluator = Evaluator(valid_iterator, model, device=device)
    evaluator = mn.create_multi_node_evaluator(evaluator, comm)
    trainer.extend(evaluator)
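    # create_multi_node_evaluator makes each worker evaluate its own shard of
    # the scattered validation set and averages the reported metrics across
    # the communicator, so the logged values cover the whole dataset.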

    trainer.extend(observe_lr(), trigger=config["trainer"]["log_interval"])
    if comm.rank == 0:
        trainer.extend(LogReport(trigger=config["trainer"]["log_interval"]))
        trainer.extend(PrintReport(
            ["epoch", "iteration", "main/loss", "validation/main/loss"]),
                       trigger=config["trainer"]["log_interval"])
        trainer.extend(ProgressBar(update_interval=1))

    trainer.run()
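# Example launch (illustrative; assumes the script is saved as train_mn.py and
# one MPI process is started per GPU):
#
#   mpiexec -n 4 python train_mn.py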
Example #23
def main(input_args=None):
    # Parse the arguments.
    args = parse_arguments(input_args)
    device = args.gpu
    method = args.method

    if args.data_name == 'suzuki':
        datafile = 'data/suzuki_type_test_v2.csv'
        class_num = 119
        class_dict = {'M': 28, 'L': 23, 'B': 35, 'S': 10, 'A': 17}
        dataset_filename = 'test_data.npz'
        labels = ['Yield', 'M', 'L', 'B', 'S', 'A', 'id']
    elif args.data_name == 'CN':
        datafile = 'data/CN_coupling_test.csv'
        class_num = 206
        class_dict = {'M': 44, 'L': 47, 'B': 13, 'S': 22, 'A': 74}
        dataset_filename = 'test_CN_data.npz'
        labels = ['Yield', 'M', 'L', 'B', 'S', 'A', 'id']
    elif args.data_name == 'Negishi':
        datafile = 'data/Negishi_test.csv'
        class_num = 106
        class_dict = {'M': 32, 'L': 20, 'T': 8, 'S': 10, 'A': 30}
        dataset_filename = 'test_Negishi_data.npz'
        labels = ['Yield', 'M', 'L', 'T', 'S', 'A', 'id']
    elif args.data_name == 'PKR':
        datafile = 'data/PKR_test.csv'
        class_num = 83
        class_dict = {
            'M': 18,
            'L': 6,
            'T': 7,
            'S': 15,
            'A': 11,
            'G': 1,
            'O': 13,
            'P': 4,
            'other': 1
        }
        dataset_filename = 'test_PKR_data.npz'
        labels = [
            'Yield', 'M', 'L', 'T', 'S', 'A', 'G', 'O', 'P', 'other', 'id'
        ]
    else:
        raise ValueError('Unexpected dataset name')

    cache_dir = os.path.join('input', '{}_all'.format(method))

    # Dataset preparation.
    def postprocess_label(label_list):
        return numpy.asarray(label_list, dtype=numpy.float32)

    print('Preprocessing dataset...')

    # Load the cached dataset.
    dataset_cache_path = os.path.join(cache_dir, dataset_filename)

    dataset = None
    if os.path.exists(dataset_cache_path):
        print('Loading cached dataset from {}.'.format(dataset_cache_path))
        dataset = NumpyTupleDataset.load(dataset_cache_path)
    if dataset is None:
        if args.method == 'mpnn':
            preprocessor = preprocess_method_dict['ggnn']()
        else:
            preprocessor = preprocess_method_dict[args.method]()
        parser = CSVFileParser(
            preprocessor,
            postprocess_label=postprocess_label,
            labels=labels,
            smiles_col=['Reactant1', 'Reactant2', 'Product'],
            label_dicts=class_dict)
        dataset = parser.parse(datafile)['dataset']

        # Cache the loaded dataset.
        if not os.path.exists(cache_dir):
            os.makedirs(cache_dir)
        NumpyTupleDataset.save(dataset_cache_path, dataset)

    labels = dataset.get_datasets()[-2]
    ids = dataset.get_datasets()[-1][:, 1].reshape(-1, 1)
    yields = dataset.get_datasets()[-1][:, 0].reshape(-1, 1).astype('float32')
    dataset = NumpyTupleDataset(*(dataset.get_datasets()[:-2] + (
        yields,
        labels,
    )))

    # No standard scaling is applied for this task.
    scaler = None
    test = dataset

    print('Predicting...')

    # Set up the classifier.
    model_path = os.path.join(args.in_dir, args.model_filename)

    if os.path.exists(model_path):
        classifier = Classifier.load_pickle(model_path, device=args.gpu)
    else:
        predictor = set_up_predictor(args.method, args.unit_num,
                                     args.conv_layers, class_num)
        classifier = Classifier(predictor,
                                lossfun=F.sigmoid_cross_entropy,
                                metrics_fun=F.binary_accuracy,
                                device=args.gpu)

    if args.load_modelname:
        serializers.load_npz(args.load_modelname, classifier)
    scaled_predictor = ScaledGraphConvPredictor(
        graph_conv=classifier.predictor.graph_conv,
        mlp=classifier.predictor.mlp)
    classifier.predictor = scaled_predictor

    # This callback function extracts only the inputs and discards the labels.
    def extract_inputs(batch, device=None):
        return concat_mols(batch, device=device)[:-1]

    # Predict the output labels; the per-class scores are reduced to a label
    # index below via argmax.
    y_pred = classifier.predict(test, converter=extract_inputs)
    y_pred_max = numpy.argmax(y_pred, axis=1)
    y_pred_max = y_pred_max.reshape(-1, 1)
    # y_pred_idx = y_pred.argsort(axis=1) # ascending

    # Extract the ground-truth labels.
    t = concat_mols(test, device=-1)[-1]  # keep on CPU to limit GPU memory use
    original_t = cuda.to_cpu(t)
    t_idx = original_t.squeeze(1)
    t_idx = t_idx.argsort(axis=1)
    # gt_indx = numpy.where(original_t == 1)

    # Construct dataframe.
    df_dict = {}
    for i, l in enumerate(labels[:1]):
        df_dict.update({
            'y_pred_{}'.format(l): y_pred_max[:, -1].tolist(),  # [:,-1]
            't_{}'.format(l): t_idx[:, -1].tolist(),
        })
    df = pandas.DataFrame(df_dict)

    # Show a prediction/ground truth table with 5 random examples.
    print(df.sample(5))

    n_eval = 10

    for target_label in range(y_pred_max.shape[1]):
        label_name = labels[:1][0][target_label]
        print('label_name = {}, y_pred = {}, t = {}'.format(
            label_name, y_pred_max[:n_eval, target_label], t_idx[:n_eval, -1]))

    # Run an evaluator on the test dataset.
    print('Evaluating...')
    test_iterator = SerialIterator(test, 16, repeat=False, shuffle=False)
    eval_result = Evaluator(test_iterator,
                            classifier,
                            converter=concat_mols,
                            device=args.gpu)()
    print('Evaluation result: ', eval_result)

    with open(os.path.join(args.in_dir, 'eval_result.json'), 'w') as f:
        json.dump(eval_result, f)

    res_dic = {}
    for i in range(len(y_pred)):
        res_dic[i] = str(ids[i])
    with open(os.path.join(args.in_dir, "test_ids.json"), "w") as f:
        json.dump(res_dic, f)

    with open(os.path.join(args.in_dir, "pred.pkl"), "wb") as f:
        pickle.dump(y_pred, f)
    with open(os.path.join(args.in_dir, "gt.pkl"), "wb") as f:
        pickle.dump(original_t, f)
Example #24
def main():
    parser = argparse.ArgumentParser(description="Train a sheep localizer")
    parser.add_argument("train_file", help="path to train csv")
    parser.add_argument("val_file", help="path to validation file (if you do not want to do validation just enter gibberish here")
    parser.add_argument("reference_file", help="path to reference images with different zoom levels")
    parser.add_argument("--no-validation", dest='validation', action='store_false', default=True, help="don't do validation")
    parser.add_argument("--image-size", type=int, nargs=2, default=(224, 224), help="input size for localizer")
    parser.add_argument("--target-size", type=int, nargs=2, default=(75, 75), help="crop size for each image")
    parser.add_argument("-b", "--batch-size", type=int, default=16, help="batch size for training")
    parser.add_argument("-g", "--gpu", type=int, default=-1, help="gpu if to use (-1 means cpu)")
    parser.add_argument("--lr", "--learning-rate", dest="learning_rate", type=float, default=0.001, help="learning rate")
    parser.add_argument("-l", "--log-dir", default='sheep_logs', help="path to log dir")
    parser.add_argument("--ln", "--log-name", default="test", help="name of log")
    parser.add_argument("--num-epoch", type=int, default=100, help="number of epochs to train")
    parser.add_argument("--snapshot-interval", type=int, default=1000, help="number of iterations after which a snapshot will be taken")
    parser.add_argument("--no-snapshot-every-epoch", dest="snapshot_every_epoch", action='store_false', default=True, help="Do not take a snapshot on every epoch")
    parser.add_argument("--log-interval", type=int, default=100, help="log interval")
    parser.add_argument("--port", type=int, default=1337, help="port that is used by bbox plotter to send predictions on test image")
    parser.add_argument("--test-image", help="path to test image that is to be used with bbox plotter")
    parser.add_argument("--anchor-image", help="path to anchor image used for metric learning")
    parser.add_argument("--rl", dest="resume_localizer", help="path to snapshot that is to be used to resume training of localizer")
    parser.add_argument("--rd", dest="resume_discriminator", help="path to snapshot that is to be used to pre-initialize discriminator")
    parser.add_argument("--use-resnet-18", action='store_true', default=False, help="Use Resnet-18 for localization")
    parser.add_argument("--localizer-target", type=float, default=1.0, help="target iou for localizer to reach in the interval [0,1]")
    parser.add_argument("--no-imgaug", action='store_false', dest='use_imgaug', default=True, help="disable image augmentation with `imgaug`, but use naive image augmentation instead")

    args = parser.parse_args()

    report_keys = ["epoch", "iteration", "loss_localizer", "loss_dis", "map", "mean_iou"]

    if args.train_file.endswith('.json'):
        train_image_paths = load_train_paths(args.train_file)
    else:
        train_image_paths = args.train_file

    train_dataset = ImageDataset(
        train_image_paths,
        os.path.dirname(args.train_file),
        image_size=args.image_size,
        dtype=np.float32,
        use_imgaug=args.use_imgaug,
        transform_probability=0.5,
    )

    if args.reference_file == 'mnist':
        reference_dataset = get_mnist(withlabel=False, ndim=3, rgb_format=True)[0]
        args.target_size = (28, 28)
    else:
        reference_dataset = LabeledImageDataset(
            args.reference_file,
            os.path.dirname(args.reference_file),
            image_size=args.target_size,
            dtype=np.float32,
            label_dtype=np.float32,
        )

    if args.validation:
        if args.val_file.endswith('.json'):
            validation_data = load_train_paths(args.val_file, with_label=True)
        else:
            validation_data = args.val_file

        validation_dataset = LabeledImageDataset(validation_data, os.path.dirname(args.val_file), image_size=args.image_size)
        validation_iter = chainer.iterators.MultithreadIterator(validation_dataset, args.batch_size, repeat=False)

    data_iter = chainer.iterators.MultithreadIterator(train_dataset, args.batch_size)
    reference_iter = chainer.iterators.MultithreadIterator(reference_dataset, args.batch_size)

    localizer_class = SheepLocalizer if args.use_resnet_18 else Resnet50SheepLocalizer
    localizer = localizer_class(args.target_size)

    if args.resume_localizer is not None:
        load_pretrained_model(args.resume_localizer, localizer)

    discriminator_output_dim = 1
    discriminator = ResnetAssessor(output_dim=discriminator_output_dim)
    if args.resume_discriminator is not None:
        load_pretrained_model(args.resume_discriminator, discriminator)
    models = [localizer, discriminator]

    localizer_optimizer = chainer.optimizers.Adam(alpha=args.learning_rate, amsgrad=True)
    localizer_optimizer.setup(localizer)

    discriminator_optimizer = chainer.optimizers.Adam(alpha=args.learning_rate, amsgrad=True)
    discriminator_optimizer.setup(discriminator)

    optimizers = [localizer_optimizer, discriminator_optimizer]

    updater_args = {
        "iterator": {
            'main': data_iter,
            'real': reference_iter,
        },
        "device": args.gpu,
        "optimizer": {
            "opt_gen": localizer_optimizer,
            "opt_dis": discriminator_optimizer,
        },
        "create_pca": False,
        "resume_discriminator": args.resume_discriminator,
        "localizer_target": args.localizer_target,
    }

    updater = SheepAssessor(
        models=[localizer, discriminator],
        **updater_args
    )

    log_dir = os.path.join(args.log_dir, "{}_{}".format(datetime.datetime.now().isoformat(), args.ln))
    args.log_dir = log_dir
    # create log dir
    if not os.path.exists(log_dir):
        os.makedirs(log_dir, exist_ok=True)

    trainer = chainer.training.Trainer(updater, (args.num_epoch, 'epoch'), out=args.log_dir)

    data_to_log = {
        'log_dir': args.log_dir,
        'image_size': args.image_size,
        'updater': [updater.__class__.__name__, 'updater.py'],
        'discriminator': [discriminator.__class__.__name__, 'discriminator.py'],
        'discriminator_output_dim': discriminator_output_dim,
        'localizer': [localizer.__class__.__name__, 'localizer.py']
    }

    for argument in filter(lambda x: not x.startswith('_'), dir(args)):
        data_to_log[argument] = getattr(args, argument)

    def backup_train_config(stats_cpu):
        if stats_cpu['iteration'] == args.log_interval:
            stats_cpu.update(data_to_log)
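    # Assumption: the project-local Logger calls `postprocess` on every log
    # entry, so the condition above fires exactly once and folds the full run
    # configuration into the first logged record.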

    for model in models:
        trainer.extend(
            extensions.snapshot_object(model, model.__class__.__name__ + '_{.updater.iteration}.npz'),
            trigger=lambda trainer: trainer.updater.is_new_epoch if args.snapshot_every_epoch else trainer.updater.iteration % args.snapshot_interval == 0,
        )

    # log train information every time we encounter a new epoch or args.log_interval iterations have passed
    log_interval_trigger = (lambda trainer:
                            trainer.updater.is_new_epoch or trainer.updater.iteration % args.log_interval == 0)

    sheep_evaluator = SheepMAPEvaluator(localizer, args.gpu)
    if args.validation:
        trainer.extend(
            Evaluator(validation_iter, localizer, device=args.gpu, eval_func=sheep_evaluator),
            trigger=log_interval_trigger,
        )

    models.append(updater)
    logger = Logger(
        [get_definition_filepath(model) for model in models],
        args.log_dir,
        postprocess=backup_train_config,
        trigger=log_interval_trigger,
        dest_file_names=['localizer.py', 'discriminator.py', 'updater.py'],
    )

    if args.test_image is not None:
        plot_image = load_image(args.test_image, args.image_size)
        gt_bbox = None
    else:
        if args.validation:
            plot_image, gt_bbox, _ = validation_dataset.get_example(0)
        else:
            plot_image = train_dataset.get_example(0)
            gt_bbox = None

    bbox_plotter = BBOXPlotter(
        plot_image,
        os.path.join(args.log_dir, 'bboxes'),
        args.target_size,
        send_bboxes=True,
        upstream_port=args.port,
        visualization_anchors=[
            ["visual_backprop_anchors"],
        ],
        device=args.gpu,
        render_extracted_rois=True,
        num_rois_to_render=4,
        show_visual_backprop_overlay=False,
        show_backprop_and_feature_vis=True,
        gt_bbox=gt_bbox,
        render_pca=True,
        log_name=args.ln,
    )
    trainer.extend(bbox_plotter, trigger=(1, 'iteration'))

    trainer.extend(
        logger,
        trigger=log_interval_trigger
    )
    trainer.extend(
        extensions.PrintReport(report_keys, log_report='Logger'),
        trigger=log_interval_trigger
    )

    trainer.extend(extensions.ProgressBar(update_interval=10))
    trainer.extend(extensions.dump_graph('loss_localizer', out_name='model.dot'))

    open_interactive_prompt(
        bbox_plotter=bbox_plotter,
        optimizer=optimizers,
    )

    trainer.run()
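# Example invocation (illustrative; the file name train_sheep.py and the csv
# paths are assumptions):
#
#   python train_sheep.py train.csv val.csv references.csv -g 0 -b 16 --ln sheep_run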