示例#1
0
def download_entire_dataset(dataset_name, num_data, labels, method, cache_dir):
    """Downloads the train/valid/test parts of a dataset and stores them in the
    cache directory.
    Args:
        dataset_name: Dataset to be downloaded.
        num_data: Amount of data samples to be parsed from the dataset.
        labels: Target labels for regression.
        method: Method name. See `parse_arguments`.
        cache_dir: Directory to store the dataset to.
    """

    print('Downloading {}...'.format(dataset_name))
    preprocessor = preprocess_method_dict[method]()

    # Select the first `num_data` samples from the dataset.
    target_index = numpy.arange(num_data) if num_data >= 0 else None
    dataset_parts = D.molnet.get_molnet_dataset(dataset_name, preprocessor,
                                                labels=labels,
                                                target_index=target_index)
    dataset_parts = dataset_parts['dataset']

    # Cache the downloaded dataset.
    if not os.path.exists(cache_dir):
        os.makedirs(cache_dir)

    for i, part in enumerate(['train', 'valid', 'test']):
        filename = dataset_part_filename(part, num_data)
        path = os.path.join(cache_dir, filename)
        NumpyTupleDataset.save(path, dataset_parts[i])
    return dataset_parts
示例#2
0
def download_entire_dataset(dataset_name, num_data, labels, method, cache_dir):
    """Downloads the train/valid/test parts of a dataset and stores them in the
    cache directory.
    Args:
        dataset_name: Dataset to be downloaded.
        num_data: Amount of data samples to be parsed from the dataset.
        labels: Target labels for regression.
        method: Method name. See `parse_arguments`.
        cache_dir: Directory to store the dataset to.
    """

    print('Downloading {}...'.format(dataset_name))
    preprocessor = preprocess_method_dict[method]()

    # Select the first `num_data` samples from the dataset.
    target_index = numpy.arange(num_data) if num_data >= 0 else None
    dataset_parts = D.molnet.get_molnet_dataset(dataset_name,
                                                preprocessor,
                                                labels=labels,
                                                target_index=target_index)
    dataset_parts = dataset_parts['dataset']

    # Cache the downloaded dataset.
    if not os.path.exists(cache_dir):
        os.makedirs(cache_dir)

    for i, part in enumerate(['train', 'valid', 'test']):
        filename = dataset_part_filename(part, num_data)
        path = os.path.join(cache_dir, filename)
        NumpyTupleDataset.save(path, dataset_parts[i])
    return dataset_parts
示例#3
0
    def test_save_load(self, data):
        tmp_cache_path = os.path.join(tempfile.mkdtemp(), 'tmp.npz')
        dataset = NumpyTupleDataset(*data)
        NumpyTupleDataset.save(tmp_cache_path, dataset)
        assert os.path.exists(tmp_cache_path)
        load_dataset = NumpyTupleDataset.load(tmp_cache_path)
        os.remove(tmp_cache_path)

        assert len(dataset._datasets) == len(load_dataset._datasets)
        for a, d in six.moves.zip(dataset._datasets, load_dataset._datasets):
            numpy.testing.assert_array_equal(a, d)
示例#4
0
def fit_scaler(datasets):
    """Standardizes (scales) the dataset labels.
    Args:
        datasets: Tuple containing the datasets.
    Returns:
        Datasets with standardized labels and the scaler object.
    """
    scaler = StandardScaler()

    # Collect all labels in order to apply scaling over the entire dataset.
    labels = None
    offsets = []
    for dataset in datasets:
        if labels is None:
            labels = dataset.get_datasets()[-1]
        else:
            labels = numpy.vstack([labels, dataset.get_datasets()[-1]])
        offsets.append(len(labels))

    scaler.fit(labels)
    scaled_datasets = []
    for dataset in datasets:
        x, y = dataset.get_datasets()
        yy = scaler.transform(y)
        scaled_datasets.append(NumpyTupleDataset(x, yy))
    return scaler, scaled_datasets
示例#5
0
    def test_get_item_integer_index(self, data, index):
        dataset = NumpyTupleDataset(*data)
        actual = dataset[index]

        assert len(actual) == len(data)
        for a, d in six.moves.zip(actual, data):
            numpy.testing.assert_array_equal(a, d[index])
示例#6
0
def augment_dataset(dataset):
    dataset_tuple = dataset.get_datasets()
    feat1, feat2, labels = dataset_tuple
    new_feat1 = np.concatenate((feat1, feat2), axis=0)
    new_feat2 = np.concatenate((feat2, feat1), axis=0)
    new_labels = np.concatenate((labels, labels), axis=0)
    new_dataset = NumpyTupleDataset(new_feat1, new_feat2, new_labels)
    return new_dataset
示例#7
0
def augment_dataset(dataset):
    dataset_tuple = dataset.get_datasets()
    atoms1, adjs1, atoms2, adjs2, labels = dataset_tuple
    new_atoms1 = np.concatenate((atoms1, atoms2), axis=0)
    new_atoms2 = np.concatenate((atoms2, atoms1), axis=0)
    new_adjs1 = np.concatenate((adjs1, adjs2), axis=0)
    new_adjs2 = np.concatenate((adjs2, adjs1), axis=0)
    new_labels = np.concatenate((labels, labels), axis=0)
    new_dataset = NumpyTupleDataset(new_atoms1, new_adjs1, new_atoms2, new_adjs2, new_labels)
    return new_dataset
def main():
    """Launcher."""
    preprocessor = preprocess_method_dict["nfp"]()
    dataset = datasets.get_qm9(preprocessor, labels="h**o")
    cache_dir = "data/"
    if not (os.path.exists(cache_dir)):
        os.makedirs(cache_dir)
    NumpyTupleDataset.save(cache_dir + "data.npz", dataset)
    dataset = NumpyTupleDataset.load(cache_dir + 'data.npz')
    train_data_ratio = 0.7
    train_data_size = int(len(dataset) * train_data_ratio)
    train, validation = split_dataset_random(dataset, train_data_size, 777)
    print('train dataset size:', len(train))
    print('validation dataset size:', len(validation))

    n_unit = 16
    conv_layers = 4
    model = GraphConvPredictor(NFP(n_unit, n_unit, conv_layers),
                               MLP(n_unit, 1))
示例#9
0
    def test_get_item_list_index(self, long_data, index):
        dataset = NumpyTupleDataset(*long_data)
        actual = dataset[index]

        batches = [d[index] for d in long_data]
        length = len(batches[0])
        expect = [tuple([batch[i] for batch in batches])
                  for i in six.moves.range(length)]

        assert len(actual) == len(expect)
        for tuple_a, tuple_e in six.moves.zip(actual, expect):
            assert len(tuple_a) == len(tuple_e)
            for a, e in six.moves.zip(tuple_a, tuple_e):
                numpy.testing.assert_array_equal(a, e)
def add_super_nodes(dataset, smiles_pairs):
    dataset_tuple = dataset.get_datasets()
    atoms1, adjs1, atoms2, adjs2, labels = dataset_tuple
    smiles1 = smiles_pairs[:, 0]
    smiles2 = smiles_pairs[:, 1]

    pickle_filename = 'smiles2mol_vec.pkl'
    pickle_filepath = os.path.join(SUPER_NODE_PATH, pickle_filename)
    with open(pickle_filepath, 'r') as reader:
        smiles2atom_feat = pickle.load(reader)
    atom_feat1 = [smiles2atom_feat[smiles] for smiles in smiles1]
    atom_feat1 = np.array(atom_feat1, dtype=np.float32)
    atom_feat2 = [smiles2atom_feat[smiles] for smiles in smiles2]
    atom_feat2 = np.array(atom_feat2, dtype=np.float32)

    new_dataset = NumpyTupleDataset(atoms1, adjs1, atom_feat1, atoms2, adjs2,
                                    atom_feat2, labels)
    return new_dataset
def to_multi_hot_labels(dataset, enc=None):
    dataset_tuple = dataset.get_datasets()
    atoms1, adjs1, atoms2, adjs2, labels = dataset_tuple
    label_list = list()
    for label in labels:
        try:
            label = [int(label)]
        except ValueError:
            label_in_str = str(label[0])
            assert isinstance(label_in_str, str)
            label = map(int, label_in_str.split('||'))
        label_list.append(label)

    if enc is None:
        enc = MultiLabelBinarizer()
        transformed_labels = enc.fit_transform(label_list)
    else:
        transformed_labels = enc.transform(label_list)
    # enc_filename = 'multi_hot_enc.pkl'
    # enc_filepath = os.path.join(KAIST_PATH, enc_filename)
    #
    # if not os.path.exists(enc_filepath):
    #     if not is_training:
    #         raise ValueError('The multi-hot encoder file does not exist.')
    #     else:
    #         enc.fit(label_list)
    #         joblib.dump(enc, enc_filepath)
    #         transformed_labels = enc.transform(label_list)
    #
    # else:
    #     enc = joblib.load(enc_filepath)
    #     transformed_labels = enc.transform(label_list)

    new_dataset = NumpyTupleDataset(
        atoms1, adjs1, atoms2, adjs2, transformed_labels
    )
    return new_dataset, enc
def to_one_hot_labels(dataset, is_training=True):
    dataset_tuple = dataset.get_datasets()
    atoms1, adjs1, atoms2, adjs2, labels = dataset_tuple
    onehot_enc = OneHotEncoder(handle_unknown='ignore')

    onehot_filename = 'onehot_enc.pkl'
    onehot_filepath = os.path.join(KAIST_PATH, onehot_filename)

    if not os.path.exists(onehot_filepath):
        if not is_training:
            raise ValueError('The one-hot encoder file does not exist.')
        else:
            onehot_enc.fit(labels)
            joblib.dump(onehot_enc, onehot_filepath)
            transformed_labels = onehot_enc.transform(labels).toarray().astype(np.int32)

    else:
        onehot_enc = joblib.load(onehot_filepath)
        transformed_labels = onehot_enc.transform(labels).toarray().astype(np.int32)

    new_dataset = NumpyTupleDataset(
        atoms1, adjs1, atoms2, adjs2, transformed_labels
    )
    return new_dataset
示例#13
0
def main():
    # Parse the arguments.
    args = parse_arguments()
    device = args.gpu

    # Set up some useful variables that will be used later on.
    method = args.method
    if args.label != 'all':
        label = args.label
        cache_dir = os.path.join('input', '{}_{}'.format(method, label))
        labels = [label]
    else:
        labels = D.get_qm9_label_names()
        cache_dir = os.path.join('input', '{}_all'.format(method))

    # Get the filename corresponding to the cached dataset, based on the amount
    # of data samples that need to be parsed from the original dataset.
    num_data = args.num_data
    if num_data >= 0:
        dataset_filename = 'data_{}.npz'.format(num_data)
    else:
        dataset_filename = 'data.npz'

    # Load the cached dataset.
    dataset_cache_path = os.path.join(cache_dir, dataset_filename)

    dataset = None
    if os.path.exists(dataset_cache_path):
        print('Loading cached data from {}.'.format(dataset_cache_path))
        dataset = NumpyTupleDataset.load(dataset_cache_path)
    if dataset is None:
        print('Preprocessing dataset...')
        preprocessor = preprocess_method_dict[method]()
        dataset = D.get_qm9(preprocessor, labels=labels)

        # Cache the newly preprocessed dataset.
        if not os.path.exists(cache_dir):
            os.mkdir(cache_dir)
        NumpyTupleDataset.save(dataset_cache_path, dataset)

    # Use a predictor with scaled output labels.
    model_path = os.path.join(args.in_dir, args.model_filename)
    regressor = Regressor.load_pickle(model_path, device=device)
    scaler = regressor.predictor.scaler

    if scaler is not None:
        original_t = dataset.get_datasets()[-1]
        if args.gpu >= 0:
            scaled_t = cuda.to_cpu(scaler.transform(
                cuda.to_gpu(original_t)))
        else:
            scaled_t = scaler.transform(original_t)

        dataset = NumpyTupleDataset(*(dataset.get_datasets()[:-1] +
                                      (scaled_t,)))

    # Split the dataset into training and testing.
    train_data_size = int(len(dataset) * args.train_data_ratio)
    _, test = split_dataset_random(dataset, train_data_size, args.seed)

    # This callback function extracts only the inputs and discards the labels.
    def extract_inputs(batch, device=None):
        return concat_mols(batch, device=device)[:-1]

    def postprocess_fn(x):
        if scaler is not None:
            scaled_x = scaler.inverse_transform(x)
            return scaled_x
        else:
            return x

    # Predict the output labels.
    print('Predicting...')
    y_pred = regressor.predict(
        test, converter=extract_inputs,
        postprocess_fn=postprocess_fn)

    # Extract the ground-truth labels.
    t = concat_mols(test, device=device)[-1]
    original_t = cuda.to_cpu(scaler.inverse_transform(t))

    # Construct dataframe.
    df_dict = {}
    for i, l in enumerate(labels):
        df_dict.update({'y_pred_{}'.format(l): y_pred[:, i],
                        't_{}'.format(l): original_t[:, i], })
    df = pandas.DataFrame(df_dict)

    # Show a prediction/ground truth table with 5 random examples.
    print(df.sample(5))

    n_eval = 10
    for target_label in range(y_pred.shape[1]):
        label_name = labels[target_label]
        diff = y_pred[:n_eval, target_label] - original_t[:n_eval,
                                                          target_label]
        print('label_name = {}, y_pred = {}, t = {}, diff = {}'
              .format(label_name, y_pred[:n_eval, target_label],
                      original_t[:n_eval, target_label], diff))

    # Run an evaluator on the test dataset.
    print('Evaluating...')
    test_iterator = SerialIterator(test, 16, repeat=False, shuffle=False)
    eval_result = Evaluator(test_iterator, regressor, converter=concat_mols,
                            device=device)()
    print('Evaluation result: ', eval_result)
    # Save the evaluation results.
    save_json(os.path.join(args.in_dir, 'eval_result.json'), eval_result)

    # Calculate mean abs error for each label
    mae = numpy.mean(numpy.abs(y_pred - original_t), axis=0)
    eval_result = {}
    for i, l in enumerate(labels):
        eval_result.update({l: mae[i]})
    save_json(os.path.join(args.in_dir, 'eval_result_mae.json'), eval_result)
示例#14
0
def main():
    # Parse the arguments.
    args = parse_arguments()

    # Set up some useful variables that will be used later on.
    method = args.method
    if args.label != 'all':
        labels = args.label
        cache_dir = os.path.join('input', '{}_{}'.format(method, labels))
        class_num = len(labels) if isinstance(labels, list) else 1
    else:
        labels = None
        cache_dir = os.path.join('input', '{}_all'.format(method))
        class_num = len(D.get_qm9_label_names())

    # Get the filename corresponding to the cached dataset, based on the amount
    # of data samples that need to be parsed from the original dataset.
    num_data = args.num_data
    if num_data >= 0:
        dataset_filename = 'data_{}.npz'.format(num_data)
    else:
        dataset_filename = 'data.npz'

    # Load the cached dataset.
    dataset_cache_path = os.path.join(cache_dir, dataset_filename)

    dataset = None
    if os.path.exists(dataset_cache_path):
        print('Loading cached dataset from {}.'.format(dataset_cache_path))
        dataset = NumpyTupleDataset.load(dataset_cache_path)
    if dataset is None:
        print('Preprocessing dataset...')
        preprocessor = preprocess_method_dict[method]()

        if num_data >= 0:
            # Select the first `num_data` samples from the dataset.
            target_index = numpy.arange(num_data)
            dataset = D.get_qm9(preprocessor,
                                labels=labels,
                                target_index=target_index)
        else:
            # Load the entire dataset.
            dataset = D.get_qm9(preprocessor, labels=labels)

        # Cache the laded dataset.
        if not os.path.exists(cache_dir):
            os.makedirs(cache_dir)
        NumpyTupleDataset.save(dataset_cache_path, dataset)

    # Scale the label values, if necessary.
    if args.scale == 'standardize':
        print('Fit StandardScaler to the labels.')
        scaler = StandardScaler()
        scaler.fit(dataset.get_datasets()[-1])
    else:
        print('No standard scaling was selected.')
        scaler = None

    # Split the dataset into training and validation.
    train_data_size = int(len(dataset) * args.train_data_ratio)
    train, valid = split_dataset_random(dataset, train_data_size, args.seed)

    # Set up the predictor.
    predictor = set_up_predictor(method, args.unit_num, args.conv_layers,
                                 class_num, scaler)

    # Set up the regressor.
    device = chainer.get_device(args.device)
    metrics_fun = {'mae': F.mean_absolute_error, 'rmse': rmse}
    regressor = Regressor(predictor,
                          lossfun=F.mean_squared_error,
                          metrics_fun=metrics_fun,
                          device=device)

    print('Training...')
    run_train(regressor,
              train,
              valid=valid,
              batch_size=args.batchsize,
              epoch=args.epoch,
              out=args.out,
              extensions_list=None,
              device=device,
              converter=concat_mols,
              resume_path=None)

    # Save the regressor's parameters.
    model_path = os.path.join(args.out, args.model_filename)
    print('Saving the trained model to {}...'.format(model_path))
    regressor.save_pickle(model_path, protocol=args.protocol)
示例#15
0
def main():
    # Parse the arguments.
    args = parse_arguments()

    if args.label:
        labels = args.label
        class_num = len(labels) if isinstance(labels, list) else 1
    else:
        raise ValueError('No target label was specified.')

    # Dataset preparation. Postprocessing is required for the regression task.
    def postprocess_label(label_list):
        return numpy.asarray(label_list, dtype=numpy.float32)

    # Apply a preprocessor to the dataset.
    print('Preprocessing dataset...')
    preprocessor = preprocess_method_dict[args.method]()
    parser = CSVFileParser(preprocessor, postprocess_label=postprocess_label,
                           labels=labels, smiles_col='SMILES')
    dataset = parser.parse(args.datafile)['dataset']

    # Scale the label values, if necessary.
    if args.scale == 'standardize':
        scaler = StandardScaler()
        labels = scaler.fit_transform(dataset.get_datasets()[-1])
        dataset = NumpyTupleDataset(*(dataset.get_datasets()[:-1] + (labels,)))
    else:
        scaler = None

    # Split the dataset into training and validation.
    train_data_size = int(len(dataset) * args.train_data_ratio)
    train, _ = split_dataset_random(dataset, train_data_size, args.seed)

    # Set up the predictor.
    predictor = set_up_predictor(args.method, args.unit_num,
                                 args.conv_layers, class_num)

    # Set up the iterator.
    train_iter = SerialIterator(train, args.batchsize)

    # Set up the regressor.
    metrics_fun = {'mean_abs_error': MeanAbsError(scaler=scaler),
                   'root_mean_sqr_error': RootMeanSqrError(scaler=scaler)}
    regressor = Regressor(predictor, lossfun=F.mean_squared_error,
                          metrics_fun=metrics_fun, device=args.gpu)

    # Set up the optimizer.
    optimizer = optimizers.Adam()
    optimizer.setup(regressor)

    # Set up the updater.
    updater = training.StandardUpdater(train_iter, optimizer, device=args.gpu,
                                       converter=concat_mols)

    # Set up the trainer.
    print('Training...')
    trainer = training.Trainer(updater, (args.epoch, 'epoch'), out=args.out)
    trainer.extend(E.snapshot(), trigger=(args.epoch, 'epoch'))
    trainer.extend(E.LogReport())
    trainer.extend(E.PrintReport(['epoch', 'main/loss', 'main/mean_abs_error',
                                  'main/root_mean_sqr_error', 'elapsed_time']))
    trainer.extend(E.ProgressBar())
    trainer.run()

    # Save the regressor's parameters.
    model_path = os.path.join(args.out, args.model_filename)
    print('Saving the trained model to {}...'.format(model_path))
    regressor.save_pickle(model_path, protocol=args.protocol)

    # Save the standard scaler's parameters.
    if scaler is not None:
        with open(os.path.join(args.out, 'scaler.pkl'), mode='wb') as f:
            pickle.dump(scaler, f, protocol=args.protocol)
示例#16
0
 def test_len(self, data):
     dataset = NumpyTupleDataset(*data)
     assert len(dataset) == 2
示例#17
0
def main():
    # Supported preprocessing/network list
    method_list = ['nfp', 'ggnn', 'schnet', 'weavenet', 'rsgcn']
    label_names = [
        'A', 'B', 'C', 'mu', 'alpha', 'h**o', 'lumo', 'gap', 'r2', 'zpve',
        'U0', 'U', 'H', 'G', 'Cv'
    ]
    scale_list = ['standardize', 'none']

    parser = argparse.ArgumentParser(description='Regression with QM9.')
    parser.add_argument('--method',
                        '-m',
                        type=str,
                        choices=method_list,
                        default='nfp')
    parser.add_argument('--label',
                        '-l',
                        type=str,
                        choices=label_names,
                        default='',
                        help='target label for regression, '
                        'empty string means to predict all '
                        'property at once')
    parser.add_argument('--scale',
                        type=str,
                        choices=scale_list,
                        default='standardize',
                        help='Label scaling method')
    parser.add_argument('--conv-layers', '-c', type=int, default=4)
    parser.add_argument('--batchsize', '-b', type=int, default=32)
    parser.add_argument('--gpu', '-g', type=int, default=-1)
    parser.add_argument('--out', '-o', type=str, default='result')
    parser.add_argument('--epoch', '-e', type=int, default=20)
    parser.add_argument('--unit-num', '-u', type=int, default=16)
    parser.add_argument('--seed', '-s', type=int, default=777)
    parser.add_argument('--train-data-ratio', '-t', type=float, default=0.7)
    args = parser.parse_args()

    seed = args.seed
    train_data_ratio = args.train_data_ratio
    method = args.method
    if args.label:
        labels = args.label
        cache_dir = os.path.join('input', '{}_{}'.format(method, labels))
        class_num = len(labels) if isinstance(labels, list) else 1
    else:
        labels = None
        cache_dir = os.path.join('input', '{}_all'.format(method))
        class_num = len(D.get_qm9_label_names())

    # Dataset preparation
    dataset = None

    if os.path.exists(cache_dir):
        print('load from cache {}'.format(cache_dir))
        dataset = NumpyTupleDataset.load(os.path.join(cache_dir, 'data.npz'))
    if dataset is None:
        print('preprocessing dataset...')
        preprocessor = preprocess_method_dict[method]()
        dataset = D.get_qm9(preprocessor, labels=labels)
        os.makedirs(cache_dir)
        NumpyTupleDataset.save(os.path.join(cache_dir, 'data.npz'), dataset)

    if args.scale == 'standardize':
        # Standard Scaler for labels
        ss = StandardScaler()
        labels = ss.fit_transform(dataset.get_datasets()[-1])
        dataset = NumpyTupleDataset(*dataset.get_datasets()[:-1], labels)

    train_data_size = int(len(dataset) * train_data_ratio)
    train, val = split_dataset_random(dataset, train_data_size, seed)

    # Network
    n_unit = args.unit_num
    conv_layers = args.conv_layers
    if method == 'nfp':
        print('Train NFP model...')
        model = GraphConvPredictor(
            NFP(out_dim=n_unit, hidden_dim=n_unit, n_layers=conv_layers),
            MLP(out_dim=class_num, hidden_dim=n_unit))
    elif method == 'ggnn':
        print('Train GGNN model...')
        model = GraphConvPredictor(
            GGNN(out_dim=n_unit, hidden_dim=n_unit, n_layers=conv_layers),
            MLP(out_dim=class_num, hidden_dim=n_unit))
    elif method == 'schnet':
        print('Train SchNet model...')
        model = GraphConvPredictor(
            SchNet(out_dim=class_num, hidden_dim=n_unit, n_layers=conv_layers),
            None)
    elif method == 'weavenet':
        print('Train WeaveNet model...')
        n_atom = 20
        n_sub_layer = 1
        weave_channels = [50] * conv_layers
        model = GraphConvPredictor(
            WeaveNet(weave_channels=weave_channels,
                     hidden_dim=n_unit,
                     n_sub_layer=n_sub_layer,
                     n_atom=n_atom), MLP(out_dim=class_num, hidden_dim=n_unit))
    elif method == 'rsgcn':
        print('Train RSGCN model...')
        model = GraphConvPredictor(
            RSGCN(out_dim=n_unit, hidden_dim=n_unit, n_layers=conv_layers),
            MLP(out_dim=class_num, hidden_dim=n_unit))
    else:
        raise ValueError('[ERROR] Invalid method {}'.format(method))

    train_iter = I.SerialIterator(train, args.batchsize)
    val_iter = I.SerialIterator(val,
                                args.batchsize,
                                repeat=False,
                                shuffle=False)

    def scaled_abs_error(x0, x1):
        if isinstance(x0, Variable):
            x0 = cuda.to_cpu(x0.data)
        if isinstance(x1, Variable):
            x1 = cuda.to_cpu(x1.data)
        if args.scale == 'standardize':
            scaled_x0 = ss.inverse_transform(cuda.to_cpu(x0))
            scaled_x1 = ss.inverse_transform(cuda.to_cpu(x1))
            diff = scaled_x0 - scaled_x1
        elif args.scale == 'none':
            diff = cuda.to_cpu(x0) - cuda.to_cpu(x1)
        return numpy.mean(numpy.absolute(diff), axis=0)[0]

    classifier = L.Classifier(model,
                              lossfun=F.mean_squared_error,
                              accfun=scaled_abs_error)

    if args.gpu >= 0:
        chainer.cuda.get_device_from_id(args.gpu).use()
        classifier.to_gpu()

    optimizer = O.Adam()
    optimizer.setup(classifier)

    updater = training.StandardUpdater(train_iter,
                                       optimizer,
                                       device=args.gpu,
                                       converter=concat_mols)
    trainer = training.Trainer(updater, (args.epoch, 'epoch'), out=args.out)
    trainer.extend(
        E.Evaluator(val_iter,
                    classifier,
                    device=args.gpu,
                    converter=concat_mols))
    trainer.extend(E.snapshot(), trigger=(args.epoch, 'epoch'))
    trainer.extend(E.LogReport())
    trainer.extend(
        E.PrintReport([
            'epoch', 'main/loss', 'main/accuracy', 'validation/main/loss',
            'validation/main/accuracy', 'elapsed_time'
        ]))
    trainer.extend(E.ProgressBar())
    trainer.run()
示例#18
0
def main(input_args=None):
    # Parse the arguments.
    args = parse_arguments(input_args)
    device = args.gpu
    method = args.method

    if args.data_name == 'suzuki':
        datafile = 'data/suzuki_type_test_v2.csv'
        class_num = 119
        class_dict = {'M': 28, 'L': 23, 'B': 35, 'S': 10, 'A': 17}
        dataset_filename = 'test_data.npz'
        labels = ['Yield', 'M', 'L', 'B', 'S', 'A', 'id']
    elif args.data_name == 'CN':
        datafile = 'data/CN_coupling_test.csv'
        class_num = 206
        class_dict = {'M': 44, 'L': 47, 'B': 13, 'S': 22, 'A': 74}
        dataset_filename = 'test_CN_data.npz'
        labels = ['Yield', 'M', 'L', 'B', 'S', 'A', 'id']
    elif args.data_name == 'Negishi':
        datafile = 'data/Negishi_test.csv'
        class_num = 106
        class_dict = {'M': 32, 'L': 20, 'T': 8, 'S': 10, 'A': 30}
        dataset_filename = 'test_Negishi_data.npz'
        labels = ['Yield', 'M', 'L', 'T', 'S', 'A', 'id']
    elif args.data_name == 'PKR':
        datafile = 'data/PKR_test.csv'
        class_num = 83
        class_dict = {
            'M': 18,
            'L': 6,
            'T': 7,
            'S': 15,
            'A': 11,
            'G': 1,
            'O': 13,
            'P': 4,
            'other': 1
        }
        dataset_filename = 'test_PKR_data.npz'
        labels = [
            'Yield', 'M', 'L', 'T', 'S', 'A', 'G', 'O', 'P', 'other', 'id'
        ]
    else:
        raise ValueError('Unexpected dataset name')

    cache_dir = os.path.join('input', '{}_all'.format(method))

    # Dataset preparation.
    def postprocess_label(label_list):
        return numpy.asarray(label_list, dtype=numpy.float32)

    print('Preprocessing dataset...')

    # Load the cached dataset.
    dataset_cache_path = os.path.join(cache_dir, dataset_filename)

    dataset = None
    if os.path.exists(dataset_cache_path):
        print('Loading cached dataset from {}.'.format(dataset_cache_path))
        dataset = NumpyTupleDataset.load(dataset_cache_path)
    if dataset is None:
        if args.method == 'mpnn':
            preprocessor = preprocess_method_dict['ggnn']()
        else:
            preprocessor = preprocess_method_dict[args.method]()
        parser = CSVFileParser(
            preprocessor,
            postprocess_label=postprocess_label,
            labels=labels,
            smiles_col=['Reactant1', 'Reactant2', 'Product'],
            label_dicts=class_dict)
        dataset = parser.parse(datafile)['dataset']

        # Cache the laded dataset.
        if not os.path.exists(cache_dir):
            os.makedirs(cache_dir)
        NumpyTupleDataset.save(dataset_cache_path, dataset)

    labels = dataset.get_datasets()[-2]
    ids = dataset.get_datasets()[-1][:, 1].reshape(-1, 1)
    yields = dataset.get_datasets()[-1][:, 0].reshape(-1, 1).astype(
        'float32')  # [:,0] added
    dataset = NumpyTupleDataset(*(dataset.get_datasets()[:-2] + (
        yields,
        labels,
    )))

    # Load the standard scaler parameters, if necessary.
    scaler = None
    test = dataset

    print('Predicting...')
    # Set up the regressor.
    model_path = os.path.join(args.in_dir, args.model_filename)

    if os.path.exists(model_path):
        classifier = Classifier.load_pickle(model_path, device=args.gpu)
    else:
        predictor = set_up_predictor(args.method, args.unit_num,
                                     args.conv_layers, class_num)
        classifier = Classifier(predictor,
                                lossfun=F.sigmoid_cross_entropy,
                                metrics_fun=F.binary_accuracy,
                                device=args.gpu)

    if args.load_modelname:
        serializers.load_npz(args.load_modelname, classifier)
    scaled_predictor = ScaledGraphConvPredictor(
        graph_conv=classifier.predictor.graph_conv,
        mlp=classifier.predictor.mlp)
    classifier.predictor = scaled_predictor

    # This callback function extracts only the inputs and discards the labels.
    def extract_inputs(batch, device=None):
        return concat_mols(batch, device=device)[:-1]

    # Predict the output labels.
    # Prediction function rewrite!!!
    y_pred = classifier.predict(test, converter=extract_inputs)
    y_pred_max = numpy.argmax(y_pred, axis=1)
    y_pred_max = y_pred_max.reshape(-1, 1)
    # y_pred_idx = y_pred.argsort(axis=1) # ascending

    # Extract the ground-truth labels.
    t = concat_mols(test, device=-1)[-1]  # device 11/14 memory issue
    original_t = cuda.to_cpu(t)
    t_idx = original_t.squeeze(1)
    t_idx = t_idx.argsort(axis=1)
    # gt_indx = numpy.where(original_t == 1)

    # Construct dataframe.
    df_dict = {}
    for i, l in enumerate(labels[:1]):
        df_dict.update({
            'y_pred_{}'.format(l): y_pred_max[:, -1].tolist(),  # [:,-1]
            't_{}'.format(l): t_idx[:, -1].tolist(),
        })
    df = pandas.DataFrame(df_dict)

    # Show a prediction/ground truth table with 5 random examples.
    print(df.sample(5))

    n_eval = 10

    for target_label in range(y_pred_max.shape[1]):
        label_name = labels[:1][0][target_label]
        print('label_name = {}, y_pred = {}, t = {}'.format(
            label_name, y_pred_max[:n_eval, target_label], t_idx[:n_eval, -1]))

    # Perform the prediction.
    print('Evaluating...')
    test_iterator = SerialIterator(test, 16, repeat=False, shuffle=False)
    eval_result = Evaluator(test_iterator,
                            classifier,
                            converter=concat_mols,
                            device=args.gpu)()
    print('Evaluation result: ', eval_result)

    with open(os.path.join(args.in_dir, 'eval_result.json'), 'w') as f:
        json.dump(eval_result, f)

    res_dic = {}
    for i in range(len(y_pred)):
        res_dic[i] = str(ids[i])
    json.dump(res_dic, open(os.path.join(args.in_dir, "test_ids.json"), "w"))

    pickle.dump(y_pred, open(os.path.join(args.in_dir, "pred.pkl"), "wb"))
    pickle.dump(original_t, open(os.path.join(args.in_dir, "gt.pkl"), "wb"))
示例#19
0
def main():
    # Parse the arguments.
    args = parse_arguments()

    # Set up some useful variables that will be used later on.
    method = args.method
    if args.label != 'all':
        label = args.label
        cache_dir = os.path.join('input', '{}_{}'.format(method, label))
        labels = [label]
    else:
        labels = D.get_qm9_label_names()
        cache_dir = os.path.join('input', '{}_all'.format(method))

    # Get the filename corresponding to the cached dataset, based on the amount
    # of data samples that need to be parsed from the original dataset.
    num_data = args.num_data
    if num_data >= 0:
        dataset_filename = 'data_{}.npz'.format(num_data)
    else:
        dataset_filename = 'data.npz'

    # Load the cached dataset.
    dataset_cache_path = os.path.join(cache_dir, dataset_filename)

    dataset = None
    if os.path.exists(dataset_cache_path):
        print('Loading cached data from {}.'.format(dataset_cache_path))
        dataset = NumpyTupleDataset.load(dataset_cache_path)
    if dataset is None:
        print('Preprocessing dataset...')
        preprocessor = preprocess_method_dict[method]()
        dataset = D.get_qm9(preprocessor, labels=labels)

        # Cache the newly preprocessed dataset.
        if not os.path.exists(cache_dir):
            os.mkdir(cache_dir)
        NumpyTupleDataset.save(dataset_cache_path, dataset)

    # Use a predictor with scaled output labels.
    model_path = os.path.join(args.in_dir, args.model_filename)
    regressor = Regressor.load_pickle(model_path, device=args.gpu)
    scaler = regressor.predictor.scaler

    if scaler is not None:
        scaled_t = scaler.transform(dataset.get_datasets()[-1])
        dataset = NumpyTupleDataset(*(dataset.get_datasets()[:-1] +
                                      (scaled_t, )))

    # Split the dataset into training and testing.
    train_data_size = int(len(dataset) * args.train_data_ratio)
    _, test = split_dataset_random(dataset, train_data_size, args.seed)

    # This callback function extracts only the inputs and discards the labels.
    def extract_inputs(batch, device=None):
        return concat_mols(batch, device=device)[:-1]

    def postprocess_fn(x):
        if scaler is not None:
            scaled_x = scaler.inverse_transform(x)
            return scaled_x
        else:
            return x

    # Predict the output labels.
    print('Predicting...')
    y_pred = regressor.predict(test,
                               converter=extract_inputs,
                               postprocess_fn=postprocess_fn)

    # Extract the ground-truth labels.
    t = concat_mols(test, device=-1)[-1]
    original_t = scaler.inverse_transform(t)

    # Construct dataframe.
    df_dict = {}
    for i, l in enumerate(labels):
        df_dict.update({
            'y_pred_{}'.format(l): y_pred[:, i],
            't_{}'.format(l): original_t[:, i],
        })
    df = pandas.DataFrame(df_dict)

    # Show a prediction/ground truth table with 5 random examples.
    print(df.sample(5))

    n_eval = 10
    for target_label in range(y_pred.shape[1]):
        label_name = labels[target_label]
        diff = y_pred[:n_eval, target_label] - original_t[:n_eval,
                                                          target_label]
        print('label_name = {}, y_pred = {}, t = {}, diff = {}'.format(
            label_name, y_pred[:n_eval, target_label],
            original_t[:n_eval, target_label], diff))

    # Run an evaluator on the test dataset.
    print('Evaluating...')
    test_iterator = SerialIterator(test, 16, repeat=False, shuffle=False)
    eval_result = Evaluator(test_iterator,
                            regressor,
                            converter=concat_mols,
                            device=args.gpu)()
    print('Evaluation result: ', eval_result)
    # Save the evaluation results.
    save_json(os.path.join(args.in_dir, 'eval_result.json'), eval_result)

    # Calculate mean abs error for each label
    mae = numpy.mean(numpy.abs(y_pred - original_t), axis=0)
    eval_result = {}
    for i, l in enumerate(labels):
        eval_result.update({l: mae[i]})
    save_json(os.path.join(args.in_dir, 'eval_result_mae.json'), eval_result)
示例#20
0
def train():
    parser = argparser.get_parser()
    args = parser.parse_args()

    device = -1
    if args.gpu >= 0:
        device = args.gpu
    debug = args.debug
    print('input args:\n',
          json.dumps(vars(args), indent=4,
                     separators=(',', ':')))  # pretty print args

    if args.data_name == 'qm9':
        from data import transform_qm9
        transform_fn = transform_qm9.transform_fn
        atomic_num_list = [6, 7, 8, 9, 0]
        mlp_channels = [256, 256]
        gnn_channels = {'gcn': [8, 64], 'hidden': [128, 64]}
        valid_idx = transform_qm9.get_val_ids()
    elif args.data_name == 'zinc250k':
        transform_fn = transform_fn_zinc250k
        atomic_num_list = zinc250_atomic_num_list
        mlp_channels = [1024, 512]
        gnn_channels = {'gcn': [16, 128], 'hidden': [256, 64]}
        valid_idx = transform_zinc250k.get_val_ids()

    dataset = NumpyTupleDataset.load(
        os.path.join(args.data_dir, args.data_file))
    dataset = TransformDataset(dataset, transform_fn)

    if len(valid_idx) > 0:
        train_idx = [t for t in range(len(dataset)) if t not in valid_idx]
        n_train = len(train_idx)
        train_idx.extend(valid_idx)
        train, test = chainer.datasets.split_dataset(dataset, n_train,
                                                     train_idx)
    else:
        train, test = chainer.datasets.split_dataset_random(
            dataset, int(len(dataset) * 0.8), seed=args.seed)

    train_iter = chainer.iterators.SerialIterator(train, args.batch_size)
    num_masks = {
        'node': args.num_node_masks,
        'channel': args.num_channel_masks
    }
    mask_size = {
        'node': args.node_mask_size,
        'channel': args.channel_mask_size
    }
    num_coupling = {
        'node': args.num_node_coupling,
        'channel': args.num_channel_coupling
    }
    model_params = Hyperparameters(
        args.num_atoms,
        args.num_rels,
        len(atomic_num_list),
        num_masks=num_masks,
        mask_size=mask_size,
        num_coupling=num_coupling,
        batch_norm=args.apply_batch_norm,
        additive_transformations=args.additive_transformations,
        learn_dist=args.learn_dist,
        mlp_channels=mlp_channels,
        gnn_channels=gnn_channels)

    model = GraphNvpModel(model_params)

    if device >= 0:
        chainer.cuda.get_device(device).use()
        model.to_gpu(device)

    print('==========================================')
    if device >= 0:
        print('Using GPUs')
    print('Num Minibatch-size: {}'.format(args.batch_size))
    print('Num epoch: {}'.format(args.max_epochs))
    print('==========================================')
    os.makedirs(args.save_dir, exist_ok=True)
    model.save_hyperparams(os.path.join(args.save_dir, 'graphnvp-params.json'))

    opt = chainer.optimizers.Adam()
    opt.setup(model)
    updater = MolNvpUpdater(train_iter, opt, device=device, loss_func=None)
    trainer = training.Trainer(updater, (args.max_epochs, 'epoch'),
                               out=args.save_dir)

    # trainer.extend(extensions.dump_graph('log_likelihood'))

    def print_validity(t):
        adj, x = generate_mols(model, batch_size=100, gpu=device)
        valid_mols = check_validity(adj, x, atomic_num_list,
                                    device)['valid_mols']
        mol_dir = os.path.join(args.save_dir,
                               'generated_{}'.format(t.updater.epoch))
        # mol_dir = os.path.join(args.save_dir, 'generated_{}'.format(t.updater.iteration))
        os.makedirs(mol_dir, exist_ok=True)
        for ind, mol in enumerate(valid_mols):
            save_mol_png(mol, os.path.join(mol_dir, '{}.png'.format(ind)))

    if debug:
        # trainer.extend(print_validity, trigger=(1, 'epoch'))
        trainer.extend(print_validity, trigger=(100, 'iteration'))
    save_epochs = args.save_epochs
    if save_epochs == -1:
        save_epochs = args.max_epochs

    trainer.extend(extensions.snapshot(), trigger=(save_epochs, 'epoch'))
    # trainer.extend(extensions.PlotReport(['log_likelihood'], 'epoch', file_name='qm9.png'),
    #                trigger=(100, 'iteration'))
    trainer.extend(
        extensions.PrintReport(
            ['epoch', 'log_likelihood', 'nll_x', 'nll_adj', 'elapsed_time']))
    trainer.extend(extensions.LogReport())
    trainer.extend(extensions.ProgressBar())
    if args.load_params == 1:
        chainer.serializers.load_npz(args.load_snapshot, trainer)
    trainer.run()
    chainer.serializers.save_npz(
        os.path.join(args.save_dir, 'graph-nvp-final.npz'), model)
示例#21
0
 def test_invalid_datasets(self):
     a = numpy.array([1, 2])
     b = numpy.array([1, 2, 3])
     with pytest.raises(ValueError):
         NumpyTupleDataset(a, b)
示例#22
0
def modify_dataset_for_hinge(dataset):
    atoms1, adjs1, atoms2, adjs2, labels = dataset.get_datasets()
    labels_squeezed = np.squeeze(labels, axis=1)
    new_dataset = NumpyTupleDataset(atoms1, adjs1, atoms2, adjs2, labels_squeezed)
    return new_dataset
示例#23
0
def main():
    args = parse_arguments()

    # Set up some useful variables that will be used later on.
    dataset_name = args.dataset
    method = args.method
    num_data = args.num_data
    n_unit = args.unit_num
    conv_layers = args.conv_layers

    task_type = molnet_default_config[dataset_name]['task_type']
    model_filename = {
        'classification': 'classifier.pkl',
        'regression': 'regressor.pkl'
    }

    print('Using dataset: {}...'.format(dataset_name))

    # Set up some useful variables that will be used later on.
    if args.label:
        labels = args.label
        cache_dir = os.path.join(
            'input', '{}_{}_{}'.format(dataset_name, method, labels))
        class_num = len(labels) if isinstance(labels, list) else 1
    else:
        labels = None
        cache_dir = os.path.join('input',
                                 '{}_{}_all'.format(dataset_name, method))
        class_num = len(molnet_default_config[args.dataset]['tasks'])

    # Load the train and validation parts of the dataset.
    filenames = [
        dataset_part_filename(p, num_data) for p in ['train', 'valid']
    ]

    paths = [os.path.join(cache_dir, f) for f in filenames]
    if all([os.path.exists(path) for path in paths]):
        dataset_parts = []
        for path in paths:
            print('Loading cached dataset from {}.'.format(path))
            dataset_parts.append(NumpyTupleDataset.load(path))
    else:
        dataset_parts = download_entire_dataset(dataset_name, num_data, labels,
                                                method, cache_dir)
    train, valid = dataset_parts[0], dataset_parts[1]

    # Scale the label values, if necessary.
    scaler = None
    if args.scale == 'standardize':
        if task_type == 'regression':
            print('Applying standard scaling to the labels.')
            scaler = fit_scaler(dataset_parts)
        else:
            print('Label scaling is not available for classification tasks.')
    else:
        print('No label scaling was selected.')

    # Set up the predictor.
    predictor = set_up_predictor(method,
                                 n_unit,
                                 conv_layers,
                                 class_num,
                                 label_scaler=scaler)

    # Set up the iterators.
    train_iter = iterators.SerialIterator(train, args.batchsize)
    valid_iter = iterators.SerialIterator(valid,
                                          args.batchsize,
                                          repeat=False,
                                          shuffle=False)

    # Load metrics for the current dataset.
    metrics = molnet_default_config[dataset_name]['metrics']
    metrics_fun = {
        k: v
        for k, v in metrics.items() if isinstance(v, types.FunctionType)
    }
    loss_fun = molnet_default_config[dataset_name]['loss']

    device = chainer.get_device(args.device)
    if task_type == 'regression':
        model = Regressor(predictor,
                          lossfun=loss_fun,
                          metrics_fun=metrics_fun,
                          device=device)
    elif task_type == 'classification':
        model = Classifier(predictor,
                           lossfun=loss_fun,
                           metrics_fun=metrics_fun,
                           device=device)
    else:
        raise ValueError('Invalid task type ({}) encountered when processing '
                         'dataset ({}).'.format(task_type, dataset_name))

    # Set up the optimizer.
    optimizer = optimizers.Adam()
    optimizer.setup(model)

    # Save model-related output to this directory.
    model_dir = os.path.join(args.out, os.path.basename(cache_dir))
    if not os.path.exists(model_dir):
        os.makedirs(model_dir)

    # Set up the updater.
    updater = training.StandardUpdater(train_iter,
                                       optimizer,
                                       device=device,
                                       converter=concat_mols)

    # Set up the trainer.
    trainer = training.Trainer(updater, (args.epoch, 'epoch'), out=model_dir)
    trainer.extend(
        E.Evaluator(valid_iter, model, device=device, converter=concat_mols))
    trainer.extend(E.snapshot(), trigger=(args.epoch, 'epoch'))
    trainer.extend(E.LogReport())

    # TODO: consider go/no-go of the following block
    # # (i) more reporting for val/evalutaion
    # # (ii) best validation score snapshot
    # if task_type == 'regression':
    #     metric_name_list = list(metrics.keys())
    #     if 'RMSE' in metric_name_list:
    #         trainer.extend(E.snapshot_object(model, "best_val_" + model_filename[task_type]),
    #                        trigger=training.triggers.MinValueTrigger('validation/main/RMSE'))
    #     elif 'MAE' in metric_name_list:
    #         trainer.extend(E.snapshot_object(model, "best_val_" + model_filename[task_type]),
    #                        trigger=training.triggers.MinValueTrigger('validation/main/MAE'))
    #     else:
    #         print("[WARNING] No validation metric defined?")
    #
    # elif task_type == 'classification':
    #     train_eval_iter = iterators.SerialIterator(
    #         train, args.batchsize, repeat=False, shuffle=False)
    #     trainer.extend(ROCAUCEvaluator(
    #         train_eval_iter, predictor, eval_func=predictor,
    #         device=args.gpu, converter=concat_mols, name='train',
    #         pos_labels=1, ignore_labels=-1, raise_value_error=False))
    #     # extension name='validation' is already used by `Evaluator`,
    #     # instead extension name `val` is used.
    #     trainer.extend(ROCAUCEvaluator(
    #         valid_iter, predictor, eval_func=predictor,
    #         device=args.gpu, converter=concat_mols, name='val',
    #         pos_labels=1, ignore_labels=-1, raise_value_error=False))
    #
    #     trainer.extend(E.snapshot_object(
    #         model, "best_val_" + model_filename[task_type]),
    #         trigger=training.triggers.MaxValueTrigger('val/main/roc_auc'))
    # else:
    #     raise NotImplementedError(
    #         'Not implemented task_type = {}'.format(task_type))

    trainer.extend(AutoPrintReport())
    trainer.extend(E.ProgressBar())
    trainer.run()

    # Save the model's parameters.
    model_path = os.path.join(model_dir, model_filename[task_type])
    print('Saving the trained model to {}...'.format(model_path))
    model.save_pickle(model_path, protocol=args.protocol)
示例#24
0
def main():
    # Parse the arguments.
    args = parse_arguments()

    # Set up some useful variables that will be used later on.
    method = args.method
    if args.label != 'all':
        labels = args.label
        cache_dir = os.path.join('input', '{}_{}'.format(method, labels))
        class_num = len(labels) if isinstance(labels, list) else 1
    else:
        labels = None
        cache_dir = os.path.join('input', '{}_all'.format(method))
        class_num = len(D.get_qm9_label_names())

    # Get the filename corresponding to the cached dataset, based on the amount
    # of data samples that need to be parsed from the original dataset.
    num_data = args.num_data
    if num_data >= 0:
        dataset_filename = 'data_{}.npz'.format(num_data)
    else:
        dataset_filename = 'data.npz'

    # Load the cached dataset.
    dataset_cache_path = os.path.join(cache_dir, dataset_filename)

    dataset = None
    if os.path.exists(dataset_cache_path):
        print('Loading cached dataset from {}.'.format(dataset_cache_path))
        dataset = NumpyTupleDataset.load(dataset_cache_path)
    if dataset is None:
        print('Preprocessing dataset...')
        preprocessor = preprocess_method_dict[method]()

        if num_data >= 0:
            # Select the first `num_data` samples from the dataset.
            target_index = numpy.arange(num_data)
            dataset = D.get_qm9(preprocessor, labels=labels,
                                target_index=target_index)
        else:
            # Load the entire dataset.
            dataset = D.get_qm9(preprocessor, labels=labels)

        # Cache the laded dataset.
        if not os.path.exists(cache_dir):
            os.makedirs(cache_dir)
        NumpyTupleDataset.save(dataset_cache_path, dataset)

    # Scale the label values, if necessary.
    if args.scale == 'standardize':
        print('Applying standard scaling to the labels.')
        scaler = StandardScaler()
        scaled_t = scaler.fit_transform(dataset.get_datasets()[-1])
        dataset = NumpyTupleDataset(*(dataset.get_datasets()[:-1]
                                      + (scaled_t,)))
    else:
        print('No standard scaling was selected.')
        scaler = None

    # Split the dataset into training and validation.
    train_data_size = int(len(dataset) * args.train_data_ratio)
    train, valid = split_dataset_random(dataset, train_data_size, args.seed)

    # Set up the predictor.
    predictor = set_up_predictor(method, args.unit_num, args.conv_layers,
                                 class_num, scaler)

    # Set up the iterators.
    train_iter = iterators.SerialIterator(train, args.batchsize)
    valid_iter = iterators.SerialIterator(valid, args.batchsize, repeat=False,
                                          shuffle=False)

    # Set up the regressor.
    device = args.gpu
    metrics_fun = {'mae': MeanAbsError(scaler=scaler),
                   'rmse': RootMeanSqrError(scaler=scaler)}
    regressor = Regressor(predictor, lossfun=F.mean_squared_error,
                          metrics_fun=metrics_fun, device=device)

    # Set up the optimizer.
    optimizer = optimizers.Adam()
    optimizer.setup(regressor)

    # Set up the updater.
    updater = training.StandardUpdater(train_iter, optimizer, device=device,
                                       converter=concat_mols)

    # Set up the trainer.
    trainer = training.Trainer(updater, (args.epoch, 'epoch'), out=args.out)
    trainer.extend(E.Evaluator(valid_iter, regressor, device=device,
                               converter=concat_mols))
    trainer.extend(E.snapshot(), trigger=(args.epoch, 'epoch'))
    trainer.extend(E.LogReport())
    trainer.extend(E.PrintReport([
        'epoch', 'main/loss', 'main/mae', 'main/rmse', 'validation/main/loss',
        'validation/main/mae', 'validation/main/rmse', 'elapsed_time']))
    trainer.extend(E.ProgressBar())
    trainer.run()

    # Save the regressor's parameters.
    model_path = os.path.join(args.out, args.model_filename)
    print('Saving the trained model to {}...'.format(model_path))
    regressor.save_pickle(model_path, protocol=args.protocol)
示例#25
0
def main():
    # Supported preprocessing/network list
    method_list = ['nfp', 'ggnn', 'schnet', 'weavenet', 'rsgcn']
    label_names = [
        'A', 'B', 'C', 'mu', 'alpha', 'h**o', 'lumo', 'gap', 'r2', 'zpve',
        'U0', 'U', 'H', 'G', 'Cv'
    ]
    scale_list = ['standardize', 'none']

    parser = argparse.ArgumentParser(description='Regression with QM9.')
    parser.add_argument('--method',
                        '-m',
                        type=str,
                        choices=method_list,
                        default='nfp')
    parser.add_argument('--label',
                        '-l',
                        type=str,
                        choices=label_names,
                        default='',
                        help='target label for regression, '
                        'empty string means to predict all '
                        'property at once')
    parser.add_argument('--scale',
                        type=str,
                        choices=scale_list,
                        default='standardize',
                        help='Label scaling method')
    parser.add_argument('--batchsize', '-b', type=int, default=32)
    parser.add_argument('--gpu', '-g', type=int, default=-1)
    parser.add_argument('--in-dir', '-i', type=str, default='result')
    parser.add_argument('--seed', '-s', type=int, default=777)
    parser.add_argument('--train-data-ratio', '-t', type=float, default=0.7)
    parser.add_argument('--model-filename', type=str, default='regressor.pkl')
    parser.add_argument('--num-data',
                        type=int,
                        default=-1,
                        help='Number of data to be parsed from parser.'
                        '-1 indicates to parse all data.')
    args = parser.parse_args()

    seed = args.seed
    train_data_ratio = args.train_data_ratio
    method = args.method
    if args.label:
        labels = args.label
        cache_dir = os.path.join('input', '{}_{}'.format(method, labels))
        # class_num = len(labels) if isinstance(labels, list) else 1
    else:
        labels = D.get_qm9_label_names()
        cache_dir = os.path.join('input', '{}_all'.format(method))
        # class_num = len(labels)

    # Dataset preparation
    dataset = None

    num_data = args.num_data
    if num_data >= 0:
        dataset_filename = 'data_{}.npz'.format(num_data)
    else:
        dataset_filename = 'data.npz'

    dataset_cache_path = os.path.join(cache_dir, dataset_filename)
    if os.path.exists(dataset_cache_path):
        print('load from cache {}'.format(dataset_cache_path))
        dataset = NumpyTupleDataset.load(dataset_cache_path)
    if dataset is None:
        print('preprocessing dataset...')
        preprocessor = preprocess_method_dict[method]()
        dataset = D.get_qm9(preprocessor, labels=labels)
        if not os.path.exists(cache_dir):
            os.mkdir(cache_dir)
        NumpyTupleDataset.save(dataset_cache_path, dataset)

    if args.scale == 'standardize':
        # Standard Scaler for labels
        with open(os.path.join(args.in_dir, 'ss.pkl'), mode='rb') as f:
            ss = pickle.load(f)
    else:
        ss = None

    train_data_size = int(len(dataset) * train_data_ratio)
    train, val = split_dataset_random(dataset, train_data_size, seed)

    regressor = Regressor.load_pickle(os.path.join(args.in_dir,
                                                   args.model_filename),
                                      device=args.gpu)  # type: Regressor

    # We need to feed only input features `x` to `predict`/`predict_proba`.
    # This converter extracts only inputs (x1, x2, ...) from the features which
    # consist of input `x` and label `t` (x1, x2, ..., t).
    def extract_inputs(batch, device=None):
        return concat_mols(batch, device=device)[:-1]

    def postprocess_fn(x):
        if ss is not None:
            # Model's output is scaled by StandardScaler,
            # so we need to rescale back.
            if isinstance(x, Variable):
                x = x.data
                scaled_x = ss.inverse_transform(cuda.to_cpu(x))
                return scaled_x
        else:
            return x

    print('Predicting...')
    y_pred = regressor.predict(val,
                               converter=extract_inputs,
                               postprocess_fn=postprocess_fn)

    print('y_pred.shape = {}, y_pred[:5, 0] = {}'.format(
        y_pred.shape, y_pred[:5, 0]))

    t = concat_mols(val, device=-1)[-1]
    n_eval = 10

    # Construct dataframe
    df_dict = {}
    for i, l in enumerate(labels):
        df_dict.update({
            'y_pred_{}'.format(l): y_pred[:, i],
            't_{}'.format(l): t[:, i],
        })
    df = pandas.DataFrame(df_dict)

    # Show random 5 example's prediction/ground truth table
    print(df.sample(5))

    for target_label in range(y_pred.shape[1]):
        diff = y_pred[:n_eval, target_label] - t[:n_eval, target_label]
        print('target_label = {}, y_pred = {}, t = {}, diff = {}'.format(
            target_label, y_pred[:n_eval, target_label],
            t[:n_eval, target_label], diff))

    # --- evaluate ---
    # To calc loss/accuracy, we can use `Evaluator`, `ROCAUCEvaluator`
    print('Evaluating...')
    val_iterator = SerialIterator(val, 16, repeat=False, shuffle=False)
    eval_result = Evaluator(val_iterator,
                            regressor,
                            converter=concat_mols,
                            device=args.gpu)()
    print('Evaluation result: ', eval_result)
示例#26
0
if data_name == 'qm9':
    max_atoms = 9
elif data_name == 'zinc250k':
    max_atoms = 38
else:
    raise ValueError("[ERROR] Unexpected value data_name={}".format(data_name))

if data_type == 'gcn':
    preprocessor = RSGCNPreprocessor(out_size=max_atoms)
elif data_type == 'relgcn':
    # preprocessor = GGNNPreprocessor(out_size=max_atoms, kekulize=True, return_is_real_node=False)
    preprocessor = GGNNPreprocessor(out_size=max_atoms, kekulize=True)
else:
    raise ValueError("[ERROR] Unexpected value data_type={}".format(data_type))

data_dir = "."
os.makedirs(data_dir, exist_ok=True)

if data_name == 'qm9':
    dataset = datasets.get_qm9(preprocessor)
elif data_name == 'zinc250k':
    dataset = datasets.get_zinc250k(preprocessor)
else:
    raise ValueError("[ERROR] Unexpected value data_name={}".format(data_name))

NumpyTupleDataset.save(
    os.path.join(data_dir,
                 '{}_{}_kekulized_ggnp.npz'.format(data_name, data_type)),
    dataset)
示例#27
0
def dataset():
    a = numpy.random.random((10, 10))
    b = numpy.random.random((10, 8))
    c = numpy.random.random((10, 1))
    return NumpyTupleDataset(a, b, c)
def main():
    # Parse the arguments.
    args = parse_arguments()

    # Set up some useful variables that will be used later on.
    method = args.method
    if args.label:
        labels = args.label
        cache_dir = os.path.join('input', '{}_{}'.format(method, labels))
    else:
        labels = D.get_qm9_label_names()
        cache_dir = os.path.join('input', '{}_all'.format(method))

    # Get the filename corresponding to the cached dataset, based on the amount
    # of data samples that need to be parsed from the original dataset.
    num_data = args.num_data
    if num_data >= 0:
        dataset_filename = 'data_{}.npz'.format(num_data)
    else:
        dataset_filename = 'data.npz'

    # Load the cached dataset.
    dataset_cache_path = os.path.join(cache_dir, dataset_filename)

    dataset = None
    if os.path.exists(dataset_cache_path):
        print('Loading cached data from {}.'.format(dataset_cache_path))
        dataset = NumpyTupleDataset.load(dataset_cache_path)
    if dataset is None:
        print('Preprocessing dataset...')
        preprocessor = preprocess_method_dict[method]()
        dataset = D.get_qm9(preprocessor, labels=labels)

        # Cache the newly preprocessed dataset.
        if not os.path.exists(cache_dir):
            os.mkdir(cache_dir)
        NumpyTupleDataset.save(dataset_cache_path, dataset)

    # Load the standard scaler parameters, if necessary.
    if args.scale == 'standardize':
        scaler_path = os.path.join(args.in_dir, 'scaler.pkl')
        print('Loading scaler parameters from {}.'.format(scaler_path))
        with open(scaler_path, mode='rb') as f:
            scaler = pickle.load(f)
    else:
        print('No standard scaling was selected.')
        scaler = None

    # Split the dataset into training and testing.
    train_data_size = int(len(dataset) * args.train_data_ratio)
    _, test = split_dataset_random(dataset, train_data_size, args.seed)

    # Use a predictor with scaled output labels.
    model_path = os.path.join(args.in_dir, args.model_filename)
    regressor = Regressor.load_pickle(model_path, device=args.gpu)

    # Replace the default predictor with one that scales the output labels.
    scaled_predictor = ScaledGraphConvPredictor(regressor.predictor)
    scaled_predictor.scaler = scaler
    regressor.predictor = scaled_predictor

    # This callback function extracts only the inputs and discards the labels.
    def extract_inputs(batch, device=None):
        return concat_mols(batch, device=device)[:-1]

    # Predict the output labels.
    print('Predicting...')
    y_pred = regressor.predict(test, converter=extract_inputs)

    # Extract the ground-truth labels.
    t = concat_mols(test, device=-1)[-1]
    n_eval = 10

    # Construct dataframe.
    df_dict = {}
    for i, l in enumerate(labels):
        df_dict.update({
            'y_pred_{}'.format(l): y_pred[:, i],
            't_{}'.format(l): t[:, i],
        })
    df = pandas.DataFrame(df_dict)

    # Show a prediction/ground truth table with 5 random examples.
    print(df.sample(5))
    for target_label in range(y_pred.shape[1]):
        diff = y_pred[:n_eval, target_label] - t[:n_eval, target_label]
        print('target_label = {}, y_pred = {}, t = {}, diff = {}'.format(
            target_label, y_pred[:n_eval, target_label],
            t[:n_eval, target_label], diff))

    # Run an evaluator on the test dataset.
    print('Evaluating...')
    test_iterator = SerialIterator(test, 16, repeat=False, shuffle=False)
    eval_result = Evaluator(test_iterator,
                            regressor,
                            converter=concat_mols,
                            device=args.gpu)()

    # Prevents the loss function from becoming a cupy.core.core.ndarray object
    # when using the GPU. This hack will be removed as soon as the cause of
    # the issue is found and properly fixed.
    loss = numpy.asscalar(cuda.to_cpu(eval_result['main/loss']))
    eval_result['main/loss'] = loss
    print('Evaluation result: ', eval_result)

    # Save the evaluation results.
    with open(os.path.join(args.in_dir, 'eval_result.json'), 'w') as f:
        json.dump(eval_result, f)
示例#29
0
def main():
    args = parse_arguments()

    # Set up some useful variables that will be used later on.
    dataset_name = args.dataset
    method = args.method
    num_data = args.num_data

    if args.label:
        labels = args.label
        cache_dir = os.path.join(
            'input', '{}_{}_{}'.format(dataset_name, method, labels))
    else:
        labels = None
        cache_dir = os.path.join('input',
                                 '{}_{}_all'.format(dataset_name, method))

    # Load the cached dataset.
    filename = dataset_part_filename('test', num_data)
    path = os.path.join(cache_dir, filename)
    if os.path.exists(path):
        print('Loading cached dataset from {}.'.format(path))
        test = NumpyTupleDataset.load(path)
    else:
        _, _, test = download_entire_dataset(dataset_name, num_data, labels,
                                             method, cache_dir)

    # Model-related data is stored this directory.
    model_dir = os.path.join(args.in_dir, os.path.basename(cache_dir))

    model_filename = {
        'classification': 'classifier.pkl',
        'regression': 'regressor.pkl'
    }
    task_type = molnet_default_config[dataset_name]['task_type']
    model_path = os.path.join(model_dir, model_filename[task_type])
    print("model_path=" + model_path)
    print('Loading model weights from {}...'.format(model_path))

    if task_type == 'classification':
        model = Classifier.load_pickle(model_path, device=args.gpu)
    elif task_type == 'regression':
        model = Regressor.load_pickle(model_path, device=args.gpu)
    else:
        raise ValueError('Invalid task type ({}) encountered when processing '
                         'dataset ({}).'.format(task_type, dataset_name))

    # Re-load the best-validation score snapshot
    # serializers.load_npz(os.path.join(
    #     model_dir, "best_val_" + model_filename[task_type]), model)

    # Run an evaluator on the test dataset.
    print('Evaluating...')
    test_iterator = SerialIterator(test, 16, repeat=False, shuffle=False)
    eval_result = Evaluator(test_iterator,
                            model,
                            converter=concat_mols,
                            device=args.gpu)()
    print('Evaluation result: ', eval_result)

    # Add more stats
    if task_type == 'regression':
        # loss = cuda.to_cpu(numpy.array(eval_result['main/loss']))
        # eval_result['main/loss'] = loss

        # convert to native values..
        for k, v in eval_result.items():
            eval_result[k] = float(v)

    elif task_type == "classification":
        # For Classifier, we do not equip the model with ROC-AUC evalation function
        # use a seperate ROC-AUC Evaluator here
        rocauc_result = ROCAUCEvaluator(test_iterator,
                                        model,
                                        converter=concat_mols,
                                        device=args.gpu,
                                        eval_func=model.predictor,
                                        name='test',
                                        ignore_labels=-1)()
        print('ROCAUC Evaluation result: ', rocauc_result)
        save_json(os.path.join(model_dir, 'rocauc_result.json'), rocauc_result)
    else:
        print('[WARNING] unknown task_type {}.'.format(task_type))

    # Save the evaluation results.
    save_json(os.path.join(model_dir, 'eval_result.json'), eval_result)
示例#30
0
def main():
    # Parse the arguments.
    args = parse_arguments()

    if args.label:
        labels = args.label
        class_num = len(labels) if isinstance(labels, list) else 1
    else:
        raise ValueError('No target label was specified.')

    # Dataset preparation. Postprocessing is required for the regression task.
    def postprocess_label(label_list):
        return numpy.asarray(label_list, dtype=numpy.float32)

    # Apply a preprocessor to the dataset.
    print('Preprocessing dataset...')
    preprocessor = preprocess_method_dict[args.method]()
    parser = CSVFileParser(preprocessor,
                           postprocess_label=postprocess_label,
                           labels=labels,
                           smiles_col='SMILES')
    dataset = parser.parse(args.datafile)['dataset']

    # Scale the label values, if necessary.
    if args.scale == 'standardize':
        scaler = StandardScaler()
        labels = scaler.fit_transform(dataset.get_datasets()[-1])
        dataset = NumpyTupleDataset(*(dataset.get_datasets()[:-1] +
                                      (labels, )))
    else:
        scaler = None

    # Split the dataset into training and validation.
    train_data_size = int(len(dataset) * args.train_data_ratio)
    train, _ = split_dataset_random(dataset, train_data_size, args.seed)

    # Set up the predictor.
    predictor = set_up_predictor(args.method, args.unit_num, args.conv_layers,
                                 class_num)

    # Set up the iterator.
    train_iter = SerialIterator(train, args.batchsize)

    # Set up the regressor.
    metrics_fun = {
        'mean_abs_error': MeanAbsError(scaler=scaler),
        'root_mean_sqr_error': RootMeanSqrError(scaler=scaler)
    }
    regressor = Regressor(predictor,
                          lossfun=F.mean_squared_error,
                          metrics_fun=metrics_fun,
                          device=args.gpu)

    # Set up the optimizer.
    optimizer = optimizers.Adam()
    optimizer.setup(regressor)

    # Set up the updater.
    updater = training.StandardUpdater(train_iter,
                                       optimizer,
                                       device=args.gpu,
                                       converter=concat_mols)

    # Set up the trainer.
    print('Training...')
    trainer = training.Trainer(updater, (args.epoch, 'epoch'), out=args.out)
    trainer.extend(E.snapshot(), trigger=(args.epoch, 'epoch'))
    trainer.extend(E.LogReport())
    trainer.extend(
        E.PrintReport([
            'epoch', 'main/loss', 'main/mean_abs_error',
            'main/root_mean_sqr_error', 'elapsed_time'
        ]))
    trainer.extend(E.ProgressBar())
    trainer.run()

    # Save the regressor's parameters.
    model_path = os.path.join(args.out, args.model_filename)
    print('Saving the trained model to {}...'.format(model_path))
    regressor.save_pickle(model_path, protocol=args.protocol)

    # Save the standard scaler's parameters.
    if scaler is not None:
        with open(os.path.join(args.out, 'scaler.pkl'), mode='wb') as f:
            pickle.dump(scaler, f, protocol=args.protocol)
示例#31
0
def main():
    # Supported preprocessing/network list
    method_list = ['nfp', 'ggnn', 'schnet', 'weavenet', 'rsgcn']
    scale_list = ['standardize', 'none']

    parser = argparse.ArgumentParser(
        description='Regression with own dataset.')
    parser.add_argument('--datafile', type=str, default='dataset.csv')
    parser.add_argument('--method',
                        '-m',
                        type=str,
                        choices=method_list,
                        default='nfp')
    parser.add_argument('--label',
                        '-l',
                        nargs='+',
                        default=['value1', 'value2'],
                        help='target label for regression')
    parser.add_argument('--scale',
                        type=str,
                        choices=scale_list,
                        default='standardize',
                        help='Label scaling method')
    parser.add_argument('--conv-layers', '-c', type=int, default=4)
    parser.add_argument('--batchsize', '-b', type=int, default=32)
    parser.add_argument('--gpu', '-g', type=int, default=-1)
    parser.add_argument('--out', '-o', type=str, default='result')
    parser.add_argument('--epoch', '-e', type=int, default=20)
    parser.add_argument('--unit-num', '-u', type=int, default=16)
    parser.add_argument('--seed', '-s', type=int, default=777)
    parser.add_argument('--train-data-ratio', '-t', type=float, default=0.7)
    parser.add_argument('--protocol', type=int, default=2)
    args = parser.parse_args()

    seed = args.seed
    train_data_ratio = args.train_data_ratio
    method = args.method
    if args.label:
        labels = args.label
        class_num = len(labels) if isinstance(labels, list) else 1
    else:
        sys.exit("Error: No target label is specified.")

    # Dataset preparation
    # Postprocess is required for regression task
    def postprocess_label(label_list):
        return numpy.asarray(label_list, dtype=numpy.float32)

    print('Preprocessing dataset...')
    preprocessor = preprocess_method_dict[method]()
    parser = CSVFileParser(preprocessor,
                           postprocess_label=postprocess_label,
                           labels=labels,
                           smiles_col='SMILES')
    dataset = parser.parse(args.datafile)["dataset"]

    if args.scale == 'standardize':
        # Standard Scaler for labels
        scaler = StandardScaler()
        labels = scaler.fit_transform(dataset.get_datasets()[-1])
        dataset = NumpyTupleDataset(*(dataset.get_datasets()[:-1] +
                                      (labels, )))
    else:
        # Not use scaler
        scaler = None

    train_data_size = int(len(dataset) * train_data_ratio)
    train, val = split_dataset_random(dataset, train_data_size, seed)

    # Network
    n_unit = args.unit_num
    conv_layers = args.conv_layers
    if method == 'nfp':
        print('Train NFP model...')
        model = GraphConvPredictor(
            NFP(out_dim=n_unit, hidden_dim=n_unit, n_layers=conv_layers),
            MLP(out_dim=class_num, hidden_dim=n_unit))
    elif method == 'ggnn':
        print('Train GGNN model...')
        model = GraphConvPredictor(
            GGNN(out_dim=n_unit, hidden_dim=n_unit, n_layers=conv_layers),
            MLP(out_dim=class_num, hidden_dim=n_unit))
    elif method == 'schnet':
        print('Train SchNet model...')
        model = GraphConvPredictor(
            SchNet(out_dim=class_num, hidden_dim=n_unit, n_layers=conv_layers),
            None)
    elif method == 'weavenet':
        print('Train WeaveNet model...')
        n_atom = 20
        n_sub_layer = 1
        weave_channels = [50] * conv_layers
        model = GraphConvPredictor(
            WeaveNet(weave_channels=weave_channels,
                     hidden_dim=n_unit,
                     n_sub_layer=n_sub_layer,
                     n_atom=n_atom), MLP(out_dim=class_num, hidden_dim=n_unit))
    elif method == 'rsgcn':
        print('Train RSGCN model...')
        model = GraphConvPredictor(
            RSGCN(out_dim=n_unit, hidden_dim=n_unit, n_layers=conv_layers),
            MLP(out_dim=class_num, hidden_dim=n_unit))
    else:
        raise ValueError('[ERROR] Invalid method {}'.format(method))

    train_iter = iterators.SerialIterator(train, args.batchsize)
    val_iter = iterators.SerialIterator(val,
                                        args.batchsize,
                                        repeat=False,
                                        shuffle=False)

    regressor = Regressor(
        model,
        lossfun=F.mean_squared_error,
        metrics_fun={'abs_error': ScaledAbsError(scaler=scaler)},
        device=args.gpu)

    optimizer = optimizers.Adam()
    optimizer.setup(regressor)

    updater = training.StandardUpdater(train_iter,
                                       optimizer,
                                       device=args.gpu,
                                       converter=concat_mols)
    trainer = training.Trainer(updater, (args.epoch, 'epoch'), out=args.out)
    trainer.extend(
        E.Evaluator(val_iter,
                    regressor,
                    device=args.gpu,
                    converter=concat_mols))
    trainer.extend(E.snapshot(), trigger=(args.epoch, 'epoch'))
    trainer.extend(E.LogReport())
    # Note that original scale absolute errors are reported in
    # (validation/)main/abs_error
    trainer.extend(
        E.PrintReport([
            'epoch', 'main/loss', 'main/abs_error', 'validation/main/loss',
            'validation/main/abs_error', 'elapsed_time'
        ]))
    trainer.extend(E.ProgressBar())
    trainer.run()

    # --- save regressor's parameters ---
    protocol = args.protocol
    model_path = os.path.join(args.out, 'model.npz')
    print('saving trained model to {}'.format(model_path))
    serializers.save_npz(model_path, regressor)
    if scaler is not None:
        with open(os.path.join(args.out, 'scaler.pkl'), mode='wb') as f:
            pickle.dump(scaler, f, protocol=protocol)

    # Example of prediction using trained model
    smiles = 'c1ccccc1'
    mol = Chem.MolFromSmiles(smiles)
    preprocessor = preprocess_method_dict[method]()
    standardized_smiles, mol = preprocessor.prepare_smiles_and_mol(mol)
    input_features = preprocessor.get_input_features(mol)
    atoms, adjs = concat_mols([input_features], device=args.gpu)
    prediction = model(atoms, adjs).data[0]
    if scaler is not None:
        prediction = scaler.inverse_transform(prediction)
    print('Prediction for {}:'.format(smiles))
    for i, label in enumerate(args.label):
        print('{}: {}'.format(label, prediction[i]))
示例#32
0
def main(method, labels, unit_num, conv_layers, class_num, n_layers,
         dropout_ratio, model_path, save_path):
    # Dataset preparation
    train, val, test, train_smiles, val_smiles, test_smiles = data.load_dataset(method, labels)

    # --- model preparation ---
    model = predictor.build_predictor(
        method, unit_num, conv_layers, class_num, dropout_ratio, n_layers)

    classifier = L.Classifier(model,
                              lossfun=F.sigmoid_cross_entropy,
                              accfun=F.binary_accuracy)

    print('Loading model parameter from ', model_path)
    serializers.load_npz(model_path, model)

    target_dataset = val
    target_smiles = val_smiles

    val_mols = [Chem.MolFromSmiles(smi) for smi in tqdm(val_smiles)]

    pyridine_mol = Chem.MolFromSmarts(PYRIDINE_SMILES)
    pyridine_index = np.where(np.array([mol.HasSubstructMatch(pyridine_mol) for mol in val_mols]) == True)
    val_pyridine_mols = np.array(val_mols)[pyridine_index]

    # It only extracts one substructure, not expected behavior
    # val_pyridine_pos = [set(mol.GetSubstructMatch(pi)) for mol in val_pyridine_mols]
    def flatten_tuple(x):
        return [element for tupl in x for element in tupl]

    val_pyridine_pos = [flatten_tuple(mol.GetSubstructMatches(pyridine_mol)) for mol in val_pyridine_mols]

    # print('pyridine_index', pyridine_index)
    # print('val_pyridine_mols', val_pyridine_mols.shape)
    # print('val_pyridine_pos', val_pyridine_pos)
    # print('val_pyridine_pos length', [len(k) for k in val_pyridine_pos])

    pyrigine_dataset = NumpyTupleDataset(*target_dataset.features[pyridine_index, :])
    pyrigine_smiles = target_smiles[pyridine_index]
    print('pyrigine_dataset', len(pyrigine_dataset), len(pyrigine_smiles))

    atoms = pyrigine_dataset.features[:, 0]
    num_atoms = [len(a) for a in atoms]

    def clip_original_size(saliency, num_atoms):
        """`saliency` array is 0 padded, this method align to have original
        molecule's length
        """
        assert len(saliency) == len(num_atoms)
        saliency_list = []
        for i in range(len(saliency)):
            saliency_list.append(saliency[i, :num_atoms[i]])
        return saliency_list

    def preprocess_fun(*inputs):
        atom, adj, t = inputs
        # HACKING for now...
        atom_embed = classifier.predictor.graph_conv.embed(atom)
        return atom_embed, adj, t

    def eval_fun(*inputs):
        atom_embed, adj, t = inputs
        prob = classifier.predictor(atom_embed, adj)
        out = F.sum(prob)
        return out

    calculator_method = args.calculator
    print('calculator method', calculator_method)
    if calculator_method == 'gradient':
        # option1: Gradient
        calculator = GradientCalculator(
            classifier, eval_fun=eval_fun,
            # target_key='embed', eval_key='out',
            target_key=0,
            # multiply_target=True  # this will calculate grad * input
        )
    elif calculator_method == 'integrated_gradients':
        # option2: IntegratedGradients
        calculator = IntegratedGradientsCalculator(
            classifier, eval_fun=eval_fun,
            # target_key='embed', eval_key='out',
            target_key=0, steps=10
        )
    elif calculator_method == 'occlusion':
        # option3: Occlusion
        def eval_fun_occlusion(*inputs):
            atom_embed, adj, t = inputs
            prob = classifier.predictor(atom_embed, adj)
            # Do not take sum, instead return batch-wise score
            out = F.sigmoid(prob)
            return out
        calculator = OcclusionCalculator(
            classifier, eval_fun=eval_fun_occlusion,
            # target_key='embed', eval_key='out',
            target_key=0, slide_axis=1
        )
    else:
        raise ValueError("[ERROR] Unexpected value calculator_method={}".format(calculator_method))

    M = 100
    num = 20
    rates = np.linspace(0.1, 1, num=num)
    print('M', M)

    # --- VanillaGrad ---
    saliency_arrays = calculator.compute_vanilla(
        pyrigine_dataset, converter=concat_mols, preprocess_fn=preprocess_fun)
    saliency = calculator.transform(
        saliency_arrays, ch_axis=3, method='square')
    # saliency_arrays -> M, batch_size, max_atom, ch_dim
    # print('saliency_arrays', saliency_arrays.shape)
    # saliency -> batch_size, max_atom
    # print('saliency', saliency.shape)
    saliency_vanilla = clip_original_size(saliency, num_atoms)

    # recall & precision
    vanilla_recall, vanilla_precision = calc_recall_precision(saliency_vanilla, rates, val_pyridine_pos)
    print('vanilla_recall', vanilla_recall)
    print('vanilla_precision', vanilla_precision)

    # --- SmoothGrad ---
    saliency_arrays = calculator.compute_smooth(
        pyrigine_dataset, converter=concat_mols, preprocess_fn=preprocess_fun,
        M=M,
        mode='absolute', scale=0.15  # previous implementation
        # mode='relative', scale=0.05
    )
    saliency = calculator.transform(
        saliency_arrays, ch_axis=3, method='square')

    saliency_smooth = clip_original_size(saliency, num_atoms)

    # recall & precision
    smooth_recall, smooth_precision = calc_recall_precision(saliency_smooth, rates, val_pyridine_pos)
    print('smooth_recall', smooth_recall)
    print('smooth_precision', smooth_precision)

    # --- BayesGrad ---
    # bayes grad is calculated by compute_vanilla with train=True
    saliency_arrays = calculator.compute_vanilla(
        pyrigine_dataset, converter=concat_mols, preprocess_fn=preprocess_fun,
        M=M, train=True)
    saliency = calculator.transform(
        saliency_arrays, ch_axis=3, method='square', lam=0)
    saliency_bayes = clip_original_size(saliency, num_atoms)

    bayes_recall, bayes_precision = calc_recall_precision(saliency_bayes, rates, val_pyridine_pos)
    print('bayes_recall', bayes_recall)
    print('bayes_precision', bayes_precision)

    plt.figure(figsize=(7, 5), dpi=200)
    plt.plot(vanilla_recall, vanilla_precision, 'k-', color='blue', label='VanillaGrad')
    plt.plot(smooth_recall, smooth_precision, 'k-', color='green', label='SmoothGrad')
    plt.plot(bayes_recall, bayes_precision, 'k-', color='red', label='BayesGrad(Ours)')
    plt.axhline(y=vanilla_precision[-1], color='gray', linestyle='--')
    plt.legend()
    plt.xlabel("recall")
    plt.ylabel("precision")
    if save_path:
        print('saved to ', save_path)
        plt.savefig(save_path)
        # plt.savefig('artificial_pr.eps')
    else:
        plt.show()
示例#33
0
 def test_get_datasets(self, data):
     dataset = NumpyTupleDataset(*data)
     datasets = dataset.get_datasets()
     assert len(datasets) == len(data)
     for i in range(len(datasets)):
         numpy.testing.assert_array_equal(datasets[i], data[i])
示例#34
0
def main():
    args = parse_arguments()

    # Set up some useful variables that will be used later on.
    dataset_name = args.dataset
    method = args.method
    num_data = args.num_data
    n_unit = args.unit_num
    conv_layers = args.conv_layers

    task_type = molnet_default_config[dataset_name]['task_type']
    model_filename = {'classification': 'classifier.pkl',
                      'regression': 'regressor.pkl'}

    print('Using dataset: {}...'.format(dataset_name))

    # Set up some useful variables that will be used later on.
    if args.label:
        labels = args.label
        cache_dir = os.path.join('input', '{}_{}_{}'.format(dataset_name,
                                                            method, labels))
        class_num = len(labels) if isinstance(labels, list) else 1
    else:
        labels = None
        cache_dir = os.path.join('input', '{}_{}_all'.format(dataset_name,
                                                             method))
        class_num = len(molnet_default_config[args.dataset]['tasks'])

    # Load the train and validation parts of the dataset.
    filenames = [dataset_part_filename(p, num_data)
                 for p in ['train', 'valid']]

    paths = [os.path.join(cache_dir, f) for f in filenames]
    if all([os.path.exists(path) for path in paths]):
        dataset_parts = []
        for path in paths:
            print('Loading cached dataset from {}.'.format(path))
            dataset_parts.append(NumpyTupleDataset.load(path))
    else:
        dataset_parts = download_entire_dataset(dataset_name, num_data, labels,
                                                method, cache_dir)
    train, valid = dataset_parts[0], dataset_parts[1]

#    # Scale the label values, if necessary.
#    if args.scale == 'standardize':
#        if task_type == 'regression':
#            print('Applying standard scaling to the labels.')
#            datasets, scaler = standardize_dataset_labels(datasets)
#        else:
#            print('Label scaling is not available for classification tasks.')
#    else:
#        print('No label scaling was selected.')
#        scaler = None

    # Set up the predictor.
    predictor = set_up_predictor(method, n_unit, conv_layers, class_num)

    # Set up the iterators.
    train_iter = iterators.SerialIterator(train, args.batchsize)
    valid_iter = iterators.SerialIterator(valid, args.batchsize, repeat=False,
                                          shuffle=False)

    # Load metrics for the current dataset.
    metrics = molnet_default_config[dataset_name]['metrics']
    metrics_fun = {k: v for k, v in metrics.items()
                   if isinstance(v, types.FunctionType)}
    loss_fun = molnet_default_config[dataset_name]['loss']

    if task_type == 'regression':
        model = Regressor(predictor, lossfun=loss_fun,
                          metrics_fun=metrics_fun, device=args.gpu)
        # TODO: Use standard scaler for regression task
    elif task_type == 'classification':
        model = Classifier(predictor, lossfun=loss_fun,
                           metrics_fun=metrics_fun, device=args.gpu)
    else:
        raise ValueError('Invalid task type ({}) encountered when processing '
                         'dataset ({}).'.format(task_type, dataset_name))

    # Set up the optimizer.
    optimizer = optimizers.Adam()
    optimizer.setup(model)

    # Save model-related output to this directory.
    model_dir = os.path.join(args.out, os.path.basename(cache_dir))
    if not os.path.exists(model_dir):
        os.makedirs(model_dir)

    # Set up the updater.
    updater = training.StandardUpdater(train_iter, optimizer, device=args.gpu,
                                       converter=concat_mols)

    # Set up the trainer.
    trainer = training.Trainer(updater, (args.epoch, 'epoch'), out=model_dir)
    trainer.extend(E.Evaluator(valid_iter, model, device=args.gpu,
                               converter=concat_mols))
    trainer.extend(E.snapshot(), trigger=(args.epoch, 'epoch'))
    trainer.extend(E.LogReport())

    # Report various metrics.
    print_report_targets = ['epoch', 'main/loss', 'validation/main/loss']
    for metric_name, metric_fun in metrics.items():
        if isinstance(metric_fun, types.FunctionType):
            print_report_targets.append('main/' + metric_name)
            print_report_targets.append('validation/main/' + metric_name)
        elif issubclass(metric_fun, BatchEvaluator):
            trainer.extend(metric_fun(valid_iter, model, device=args.gpu,
                                      eval_func=predictor,
                                      converter=concat_mols, name='val',
                                      raise_value_error=False))
            print_report_targets.append('val/main/' + metric_name)
        else:
            raise TypeError('{} is not a supported metrics function.'
                            .format(type(metrics_fun)))
    print_report_targets.append('elapsed_time')

    # Augmented by Ishiguro
    # ToDo: consider go/no-go of the following block
    # (i) more reporting for val/evalutaion
    # (ii) best validation score snapshot
    if task_type == 'regression':
        if 'RMSE' in metric_name:
            trainer.extend(E.snapshot_object(model, "best_val_" + model_filename[task_type]), trigger=training.triggers.MinValueTrigger('validation/main/RMSE'))
        elif 'MAE' in metric_name:
            trainer.extend(E.snapshot_object(model, "best_val_" + model_filename[task_type]), trigger=training.triggers.MinValueTrigger('validation/main/MAE'))
        else:
            print("No validation metric defined?")
            assert(False)

    elif task_type == 'classification':
        train_eval_iter = iterators.SerialIterator(train, args.batchsize,repeat=False, shuffle=False)
        trainer.extend(ROCAUCEvaluator(
            train_eval_iter, predictor, eval_func=predictor,
            device=args.gpu, converter=concat_mols, name='train',
            pos_labels=1, ignore_labels=-1, raise_value_error=False))
        # extension name='validation' is already used by `Evaluator`,
        # instead extension name `val` is used.
        trainer.extend(ROCAUCEvaluator(
            valid_iter, predictor, eval_func=predictor,
            device=args.gpu, converter=concat_mols, name='val',
            pos_labels=1, ignore_labels=-1))
        print_report_targets.append('train/main/roc_auc')
        print_report_targets.append('validation/main/loss')
        print_report_targets.append('val/main/roc_auc')

        trainer.extend(E.snapshot_object(model, "best_val_" + model_filename[task_type]), trigger=training.triggers.MaxValueTrigger('val/main/roc_auc'))
    else:
        raise NotImplementedError(
            'Not implemented task_type = {}'.format(task_type))


    trainer.extend(E.PrintReport(print_report_targets))
    trainer.extend(E.ProgressBar())
    trainer.run()

    # Save the model's parameters.
    model_path = os.path.join(model_dir,  model_filename[task_type])
    print('Saving the trained model to {}...'.format(model_path))
    model.save_pickle(model_path, protocol=args.protocol)
示例#35
0
def main():
    # Parse the arguments.
    args = parse_arguments()

    # Set up some useful variables that will be used later on.
    method = args.method
    if args.label != 'all':
        labels = args.label
        cache_dir = os.path.join('input', '{}_{}'.format(method, labels))
        class_num = len(labels) if isinstance(labels, list) else 1
    else:
        labels = None
        cache_dir = os.path.join('input', '{}_all'.format(method))
        class_num = len(D.get_qm9_label_names())

    # Get the filename corresponding to the cached dataset, based on the amount
    # of data samples that need to be parsed from the original dataset.
    num_data = args.num_data
    if num_data >= 0:
        dataset_filename = 'data_{}.npz'.format(num_data)
    else:
        dataset_filename = 'data.npz'

    # Load the cached dataset.
    dataset_cache_path = os.path.join(cache_dir, dataset_filename)

    dataset = None
    if os.path.exists(dataset_cache_path):
        print('Loading cached dataset from {}.'.format(dataset_cache_path))
        dataset = NumpyTupleDataset.load(dataset_cache_path)
    if dataset is None:
        print('Preprocessing dataset...')
        preprocessor = preprocess_method_dict[method]()

        if num_data >= 0:
            # Select the first `num_data` samples from the dataset.
            target_index = numpy.arange(num_data)
            dataset = D.get_qm9(preprocessor, labels=labels,
                                target_index=target_index)
        else:
            # Load the entire dataset.
            dataset = D.get_qm9(preprocessor, labels=labels)

        # Cache the laded dataset.
        if not os.path.exists(cache_dir):
            os.makedirs(cache_dir)
        NumpyTupleDataset.save(dataset_cache_path, dataset)

    # Scale the label values, if necessary.
    if args.scale == 'standardize':
        print('Applying standard scaling to the labels.')
        scaler = StandardScaler()
        scaled_t = scaler.fit_transform(dataset.get_datasets()[-1])
        dataset = NumpyTupleDataset(*(dataset.get_datasets()[:-1]
                                      + (scaled_t,)))
    else:
        print('No standard scaling was selected.')
        scaler = None

    # Split the dataset into training and validation.
    train_data_size = int(len(dataset) * args.train_data_ratio)
    train, valid = split_dataset_random(dataset, train_data_size, args.seed)

    # Set up the predictor.
    predictor = set_up_predictor(method, args.unit_num, args.conv_layers,
                                 class_num, scaler)

    # Set up the iterators.
    train_iter = iterators.SerialIterator(train, args.batchsize)
    valid_iter = iterators.SerialIterator(valid, args.batchsize, repeat=False,
                                          shuffle=False)

    # Set up the regressor.
    device = args.gpu
    metrics_fun = {'mae': MeanAbsError(scaler=scaler),
                   'rmse': RootMeanSqrError(scaler=scaler)}
    regressor = Regressor(predictor, lossfun=F.mean_squared_error,
                          metrics_fun=metrics_fun, device=device)

    # Set up the optimizer.
    optimizer = optimizers.Adam()
    optimizer.setup(regressor)

    # Set up the updater.
    updater = training.StandardUpdater(train_iter, optimizer, device=device,
                                       converter=concat_mols)

    # Set up the trainer.
    trainer = training.Trainer(updater, (args.epoch, 'epoch'), out=args.out)
    trainer.extend(E.Evaluator(valid_iter, regressor, device=device,
                               converter=concat_mols))
    trainer.extend(E.snapshot(), trigger=(args.epoch, 'epoch'))
    trainer.extend(E.LogReport())
    trainer.extend(E.PrintReport([
        'epoch', 'main/loss', 'main/mae', 'main/rmse', 'validation/main/loss',
        'validation/main/mae', 'validation/main/rmse', 'elapsed_time']))
    trainer.extend(E.ProgressBar())
    trainer.run()

    # Save the regressor's parameters.
    model_path = os.path.join(args.out, args.model_filename)
    print('Saving the trained model to {}...'.format(model_path))
    regressor.save_pickle(model_path, protocol=args.protocol)